ExtensionCrawler/ExtensionCrawler/file_identifiers.py

300 lines
9.8 KiB
Python
Raw Normal View History

2019-01-15 18:41:12 +00:00
#!/usr/bin/env python3.7
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
2017-11-19 00:36:15 +00:00
""" Module for obtaining (normalized) hashes for files."""
import hashlib
import os
import re
import zlib
import mimetypes
from io import StringIO
2017-11-19 00:36:15 +00:00
from simhash import Simhash
import cchardet as chardet
import magic
from ExtensionCrawler.js_mincer import mince_js
2018-04-21 18:00:07 +00:00
def is_binary_resource(mimetype_magic):
    """Tell whether a MIME type string denotes a binary resource.

    A type counts as binary when it starts with ``image/``, ``video/`` or
    ``audio/``, or when it is exactly ``application/pdf``.
    """
    media_prefixes = ("image/", "video/", "audio/")
    if mimetype_magic.startswith(media_prefixes):
        return True
    return mimetype_magic == "application/pdf"
2017-11-19 00:36:15 +00:00
2018-04-21 18:00:07 +00:00
def normalize_jsdata(str_data):
    """Compute the normalized code of a JavaScript file.

    The text is split into blocks by ``mince_js``; only blocks for which
    ``is_code()`` is true are kept. Every line of such a block is stripped
    of surrounding whitespace and the stripped lines are concatenated
    without any separator.

    Args:
        str_data: The JavaScript source as a ``str``.

    Returns:
        A tuple ``(normalized, loc)`` where ``normalized`` is the
        concatenated text encoded with ``str.encode()`` and ``loc`` is the
        number of lines found in code blocks (blank lines included).
    """
    stripped_lines = []
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if block.is_code():
                # Collect the pieces and join once at the end rather than
                # growing a string with ``+=`` for every single line.
                stripped_lines.extend(
                    line.strip() for line in block.content.splitlines())
    return "".join(stripped_lines).encode(), len(stripped_lines)
2017-11-20 22:42:21 +00:00
2017-11-19 00:36:15 +00:00
def get_features(s):
    """Produce the character trigrams of a text (given as string).

    The text is lower-cased and every run of non-word characters is
    removed; a three-character window is then slid over what remains.
    Texts shorter than the window yield exactly one (possibly empty)
    feature. The result is a lazy generator.
    """
    window = 3
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    n_windows = max(len(cleaned) - window + 1, 1)
    return (cleaned[start:start + window] for start in range(n_windows))
2017-11-19 00:36:15 +00:00
2017-11-20 22:42:21 +00:00
def get_simhash(encoding, data):
    """Compute the simhash of text given as raw bytes.

    Args:
        encoding: Name of the character encoding of ``data`` (e.g. as
            reported by an encoding detector), or ``None`` if unknown.
        data: The raw ``bytes`` to hash.

    Returns:
        The ``value`` of the ``Simhash`` built from the features that
        ``get_features`` derives from the decoded text.
    """
    if encoding is None:
        # No encoding available: hash the ``str()`` rendering of the data.
        # NOTE(review): for ``bytes`` this is the ``b'...'`` repr, not the
        # decoded content; kept as is so existing hash values stay stable.
        str_data = str(data)
    else:
        # VISCII is not supported by Python; UTF-8 parses at least the
        # parts that are important for us.
        if encoding == "VISCII":
            encoding = "UTF-8"
        try:
            str_data = data.decode(encoding=encoding, errors="replace")
        except LookupError:
            # Any other encoding name Python has no codec for gets the same
            # UTF-8 fallback instead of aborting the whole computation.
            str_data = data.decode(encoding="UTF-8", errors="replace")
    return Simhash(get_features(str_data)).value
2017-11-23 21:57:58 +00:00
2017-11-19 00:36:15 +00:00
def compute_difference(hx, hy):
    """Compute the difference (Hamming distance) between two simhashes.

    Args:
        hx: First simhash, as an integer.
        hy: Second simhash, as an integer.

    Returns:
        The number of bit positions, among the low 64 bits, in which
        ``hx`` and ``hy`` differ.
    """
    # The operands are deliberately not required to have the same
    # ``bit_length()``: that only reflects the highest set bit, so two
    # valid fixed-width hashes (e.g. 1 and 2) legitimately differ in it.
    differing = (hx ^ hy) & ((1 << 64) - 1)
    # ``bin()`` of a non-negative int contains one "1" per set bit.
    return bin(differing).count("1")
2017-11-20 22:42:21 +00:00
def get_data_identifiers(data):
    """Get basic data identifiers (size, hashes, normalized hashes, etc.).

    Args:
        data: The raw content as ``bytes``.

    Returns:
        A dict whose keys are always all present; entries that were not
        computed stay ``None``. The libmagic MIME type and description, the
        md5/sha1/sha256 digests (raw ``digest()`` bytes) and the size are
        always filled in. For content that is not a binary resource, the
        detected encoding, the simhash, the stripped size and the line
        count are added. If ``normalize_jsdata`` succeeds on the decoded
        content, the ``normalized_*`` entries describe its output in the
        same way.

    Raises:
        magic.MagicException: If libmagic fails with a message that the
            description fallback does not recognise.
    """
    data_identifier = dict.fromkeys((
        'encoding',
        'description',
        'size',
        'loc',
        'mimetype_magic',
        'md5',
        'sha1',
        'sha256',
        'simhash',
        'size_stripped',
        'normalized_encoding',
        'normalized_description',
        'normalized_size',
        'normalized_loc',
        'normalized_mimetype_magic',
        'normalized_md5',
        'normalized_sha1',
        'normalized_sha256',
        'normalized_simhash',
    ))

    mimetype_magic = magic.from_buffer(data, mime=True)
    magic_desc = _magic_description(data)
    data_identifier.update(
        mimetype_magic=mimetype_magic,
        md5=hashlib.md5(data).digest(),
        sha1=hashlib.sha1(data).digest(),
        sha256=hashlib.sha256(data).digest(),
        size=len(data),
        description=magic_desc,
    )

    # We don't continue here with binary files, as that consumes too many
    # resources.
    if is_binary_resource(mimetype_magic):
        return data_identifier

    encoding = chardet.detect(data)['encoding']
    data_identifier.update(
        simhash=get_simhash(encoding, data),
        size_stripped=len(data.strip()),
        loc=len(data.splitlines()),
        encoding=encoding,
    )

    # Normalization is best effort: any failure (including an unusable
    # encoding) simply leaves the normalized_* entries at None.
    try:
        normalized_data, normalized_loc = normalize_jsdata(
            data.decode(encoding=encoding, errors="replace"))
    except Exception:
        normalized_data = None
    if normalized_data is None:
        return data_identifier

    normalized_magic_desc = _magic_description(normalized_data)
    normalized_encoding = chardet.detect(normalized_data)['encoding']
    data_identifier.update(
        normalized_encoding=normalized_encoding,
        normalized_description=normalized_magic_desc,
        normalized_size=len(normalized_data),
        normalized_loc=normalized_loc,
        normalized_mimetype_magic=magic.from_buffer(
            normalized_data, mime=True),
        normalized_md5=hashlib.md5(normalized_data).digest(),
        normalized_sha1=hashlib.sha1(normalized_data).digest(),
        normalized_sha256=hashlib.sha256(normalized_data).digest(),
        normalized_simhash=get_simhash(normalized_encoding, normalized_data),
    )
    return data_identifier


def _magic_description(buf):
    """Return libmagic's textual description of ``buf``.

    When libmagic raises a ``MagicException`` whose message contains a
    `` name use count...`` trailer, the message with that trailer removed
    is returned as the description; any other ``MagicException`` is
    re-raised.
    """
    try:
        return magic.from_buffer(buf)
    except magic.MagicException as exp:
        trailer = re.compile(r' name use count.*$')
        message = str(exp.message)
        if not re.search(trailer, message):
            raise
        return re.sub(trailer, '', message)
def get_file_identifiers(path, data=None):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    Args:
        path: Path of the file; used for the name-based entries and, when
            ``data`` is not given, to read the content.
        data: Optional content as ``bytes``. If ``None``, the file at
            ``path`` is read in binary mode.

    Returns:
        A dict with ``filename``, ``path`` and ``mimetype`` (the result of
        ``mimetypes.guess_type(path)``), followed by every entry of
        ``get_data_identifiers(data)``, followed by the same entries
        prefixed with ``dec_`` for the decompressed content. The ``dec_``
        entries are ``None`` unless the description starts with ``gzip``;
        if handling the decompressed data fails, only ``dec_description``
        is set, to a message carrying the exception text.
    """
    data_fields = (
        'mimetype_magic',
        'md5',
        'sha1',
        'sha256',
        'simhash',
        'size',
        'size_stripped',
        'loc',
        'description',
        'encoding',
        'normalized_encoding',
        'normalized_description',
        'normalized_size',
        'normalized_loc',
        'normalized_mimetype_magic',
        'normalized_md5',
        'normalized_sha1',
        'normalized_sha256',
        'normalized_simhash',
    )

    if data is None:
        with open(path, 'rb') as fileobj:
            data = fileobj.read()
    data_identifier = get_data_identifiers(data)

    # Placeholder used whenever the content is not gzip or cannot be
    # decompressed and analysed.
    dec_data_identifier = dict.fromkeys(data_fields)
    if data_identifier['description'].startswith('gzip'):
        try:
            # MAX_WBITS | 16 selects the gzip container; the output is
            # capped at 100 times the compressed size.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data_identifier = get_data_identifiers(
                dec.decompress(data, 100 * data_identifier['size']))
        except Exception as e:
            dec_data_identifier['description'] = (
                "Exception during compression (likely zip-bomb:" + str(e))

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    # First the identifiers of the data itself, then those of the
    # decompressed data under a "dec_" prefix, both in the same field order.
    for field in data_fields:
        file_identifier[field] = data_identifier[field]
    for field in data_fields:
        file_identifier['dec_' + field] = dec_data_identifier[field]
    return file_identifier