ExtensionCrawler/ExtensionCrawler/file_identifiers.py

297 lines
9.7 KiB
Python

#!/usr/bin/env python3.6
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
""" Module for obtaining (normalized) hashes for files."""
import hashlib
import os
import re
import zlib
import mimetypes
from io import StringIO
from simhash import Simhash
import cchardet as chardet
import magic
from ExtensionCrawler.js_mincer import mince_js
def is_binary_resource(mimetype_magic):
    """Tell whether a MIME type string denotes a binary resource.

    A type counts as binary when it is exactly ``application/pdf`` or when
    it starts with one of the ``image/``, ``video/`` or ``audio/`` prefixes.
    """
    if mimetype_magic == "application/pdf":
        return True
    return mimetype_magic.startswith(("image/", "video/", "audio/"))
def normalize_jsdata(str_data):
    """Compute normalized code blocks of a JavaScript file.

    The text is handed to ``mince_js`` and only the blocks for which
    ``is_code()`` is true are kept. Every line of such a block is stripped
    of surrounding whitespace and the stripped lines are concatenated
    without any separator.

    Args:
        str_data: The JavaScript source as a ``str``.

    Returns:
        A tuple ``(normalized, loc)`` where ``normalized`` is the
        concatenated text encoded with ``str.encode()`` and ``loc`` is the
        number of lines that were taken from code blocks.
    """
    # Collect the pieces and join once; repeated ``str +=`` in a loop is
    # quadratic in the worst case.
    stripped_lines = []
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if block.is_code():
                stripped_lines.extend(
                    line.strip() for line in block.content.splitlines())
    return "".join(stripped_lines).encode(), len(stripped_lines)
def get_features(s):
    """Compute the feature list of a text (represented as string).

    The text is lower-cased and every run of non-word characters is
    removed. The result is the list of all overlapping 3-character windows
    of the remaining text; when fewer than three characters remain, the
    list holds that (possibly empty) remainder as its only element.
    """
    window = 3
    compact = re.sub(r'[^\w]+', '', s.lower())
    # Always produce at least one feature, even for very short input.
    window_count = max(len(compact) - window + 1, 1)
    features = []
    for start in range(window_count):
        features.append(compact[start:start + window])
    return features
def get_simhash(encoding, data):
    """Compute simhash of text.

    Args:
        encoding: Name of the encoding used to decode ``data``, or ``None``
            when no encoding is known.
        data: The raw content (``bytes``).

    Returns:
        The ``value`` of the ``Simhash`` built from the features of the
        decoded text.
    """
    str_data = None
    if encoding is not None:
        try:
            str_data = data.decode(encoding=encoding, errors="replace")
        except LookupError:
            # An encoding detector may report a name for which Python ships
            # no codec; treat that like "no encoding known" instead of
            # aborting the whole identification.
            str_data = None
    if str_data is None:
        # NOTE: for bytes this is the textual repr (``b'...'``). It is kept
        # as is so that hashes stay comparable with previously computed
        # values.
        str_data = str(data)
    return Simhash(get_features(str_data)).value
def compute_difference(hx, hy):
    """Compute difference between two simhashes.

    The difference is the Hamming distance of the two values, i.e. the
    number of bit positions (within the lower 64 bits) in which they differ.

    Args:
        hx: First simhash as an integer.
        hy: Second simhash as an integer.

    Returns:
        The number of differing bits, between 0 and 64.
    """
    # No bit_length() comparison here: two hashes of the same width have
    # different bit lengths whenever their leading bits differ (e.g. 1 and
    # 2), so such a check rejects perfectly valid pairs.
    differing_bits = (hx ^ hy) & ((1 << 64) - 1)
    return bin(differing_bits).count("1")
_MAGIC_USE_COUNT_RGX = re.compile(r' name use count.*$')


def _magic_description(data):
    """Return libmagic's textual description of ``data``.

    When ``magic.from_buffer`` raises a ``MagicException`` whose message
    contains the `` name use count`` table, the message with that trailing
    part removed is returned as the description. Any other
    ``MagicException`` is re-raised unchanged.
    """
    try:
        return magic.from_buffer(data)
    except magic.MagicException as exp:
        msg = str(exp.message)
        if not _MAGIC_USE_COUNT_RGX.search(msg):
            raise
        return _MAGIC_USE_COUNT_RGX.sub('', msg)


def get_data_identifiers(data):
    """Get basic data identifiers (size, hashes, normalized hashes, etc.).

    Args:
        data: The raw content (``bytes``) to identify.

    Returns:
        A dict with the fixed set of keys initialised below. The MIME
        type, description, size and the md5/sha1/sha256 digests (raw
        ``digest()`` bytes) are always filled in. For content that is not a
        binary resource, the encoding, simhash, stripped size and line
        count are added, and, when JavaScript normalization succeeds, the
        ``normalized_*`` counterparts as well. Entries that were not
        computed remain ``None``.

    Raises:
        magic.MagicException: If libmagic fails with a message other than
            the one handled by ``_magic_description``.
    """
    data_identifier = {
        'encoding': None,
        'description': None,
        'size': None,
        'loc': None,
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size_stripped': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_loc': None,
        'normalized_mimetype_magic': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }

    mimetype_magic = magic.from_buffer(data, mime=True)
    magic_desc = _magic_description(data)

    data_identifier['mimetype_magic'] = mimetype_magic
    data_identifier['md5'] = hashlib.md5(data).digest()
    data_identifier['sha1'] = hashlib.sha1(data).digest()
    data_identifier['sha256'] = hashlib.sha256(data).digest()
    data_identifier['size'] = len(data)
    data_identifier['description'] = magic_desc

    # We don't continue here with binary files, as that consumes too many
    # resources.
    if is_binary_resource(mimetype_magic):
        return data_identifier

    encoding = chardet.detect(data)['encoding']
    data_identifier['simhash'] = get_simhash(encoding, data)
    data_identifier['size_stripped'] = len(data.strip())
    data_identifier['loc'] = len(data.splitlines())
    data_identifier['encoding'] = encoding

    try:
        normalized_data, normalized_loc = normalize_jsdata(
            data.decode(encoding=encoding, errors="replace"))
    except Exception:
        # Normalization is best effort: any failure (including a missing
        # or unusable encoding) simply leaves the normalized_* entries
        # unset.
        normalized_data = None
        normalized_loc = 0

    if normalized_data is None:
        return data_identifier

    normalized_magic_desc = _magic_description(normalized_data)
    normalized_encoding = chardet.detect(normalized_data)['encoding']

    data_identifier['normalized_encoding'] = normalized_encoding
    data_identifier['normalized_description'] = normalized_magic_desc
    data_identifier['normalized_size'] = len(normalized_data)
    data_identifier['normalized_loc'] = normalized_loc
    data_identifier['normalized_mimetype_magic'] = magic.from_buffer(
        normalized_data, mime=True)
    data_identifier['normalized_md5'] = hashlib.md5(
        normalized_data).digest()
    data_identifier['normalized_sha1'] = hashlib.sha1(
        normalized_data).digest()
    data_identifier['normalized_sha256'] = hashlib.sha256(
        normalized_data).digest()
    data_identifier['normalized_simhash'] = get_simhash(
        normalized_encoding, normalized_data)
    return data_identifier
def get_file_identifiers(path, data=None):
    """Collect the file identifiers and data identifiers for one file.

    Args:
        path: Path of the file. It is used for the ``filename``, ``path``
            and ``mimetype`` entries and, when ``data`` is ``None``, the
            content is read from it in binary mode.
        data: Optional content of the file as ``bytes``.

    Returns:
        A dict holding ``filename``, ``path``, ``mimetype`` (the result of
        ``mimetypes.guess_type``), every data identifier of the content,
        and the same identifiers prefixed with ``dec_`` for the gunzipped
        content. The ``dec_`` values stay ``None`` unless the description
        of the content starts with ``gzip``.
    """
    # Data identifier names, in the order they appear in the result.
    identifier_keys = (
        'mimetype_magic', 'md5', 'sha1', 'sha256', 'simhash', 'size',
        'size_stripped', 'loc', 'description', 'encoding',
        'normalized_encoding', 'normalized_description', 'normalized_size',
        'normalized_loc', 'normalized_mimetype_magic', 'normalized_md5',
        'normalized_sha1', 'normalized_sha256', 'normalized_simhash',
    )
    dec_data_identifier = dict.fromkeys(identifier_keys)

    if data is None:
        with open(path, 'rb') as fileobj:
            data = fileobj.read()

    data_identifier = get_data_identifiers(data)

    if data_identifier['description'].startswith('gzip'):
        try:
            # MAX_WBITS | 16 selects the gzip container format; the output
            # is capped at 100 times the compressed size.
            inflater = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data = inflater.decompress(
                data, 100 * data_identifier['size'])
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as err:
            dec_data_identifier['description'] = (
                "Exception during compression (likely zip-bomb:" + str(err))

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    for key in identifier_keys:
        file_identifier[key] = data_identifier[key]
    for key in identifier_keys:
        file_identifier['dec_' + key] = dec_data_identifier[key]
    return file_identifier