Refactoring: moved the generic file identifiers into their own module.

This commit is contained in:
Achim D. Brucker 2017-09-16 17:19:36 +01:00
parent e98f58fff8
commit 4cf41e2e4f
2 changed files with 141 additions and 120 deletions

View File

@ -20,25 +20,18 @@
import gc
import glob
import hashlib
import logging
import mimetypes
import os
import re
import zlib
from functools import partial, reduce
from io import StringIO
from multiprocessing import Pool
import csv
import sys
import cchardet as chardet
import dateutil.parser
import git
import magic
from ExtensionCrawler.js_mincer import mince_js
from ExtensionCrawler.file_identifiers import get_file_identifiers
def get_add_date(git_path, filename):
"""Method for getting the initial add/commit date of a file."""
@ -104,118 +97,6 @@ def hackish_pull_list_changed_files(git_path):
files.add(changed_file.strip())
return list(files)
def normalize_jsdata(str_data):
    """Return the whitespace-stripped code of a JavaScript source string.

    The text is passed through ``mince_js``; only blocks for which
    ``is_code()`` is true are kept. Every line of those blocks is stripped
    of leading/trailing whitespace and the pieces are concatenated without
    any separator.

    Returns a tuple ``(normalized_bytes, loc)``: the concatenated text
    encoded with ``str.encode()`` and the number of lines taken from code
    blocks (blank lines inside code blocks are counted as well).
    """
    stripped_lines = []
    with StringIO(str_data) as source:
        for block in mince_js(source):
            if not block.is_code():
                continue
            stripped_lines.extend(
                line.strip() for line in block.content.splitlines())
    return "".join(stripped_lines).encode(), len(stripped_lines)
def get_data_identifiers(data):
    """Compute size, line count, type guesses and hashes for a bytes blob.

    The returned dict contains the raw md5/sha1/sha256 digests, ``size``
    (``len(data)``), ``loc`` (number of lines), ``description`` (from
    ``magic.from_buffer``) and ``encoding`` (from ``chardet.detect``).

    It also contains ``normalized_loc`` and ``normalized_md5`` /
    ``normalized_sha1`` / ``normalized_sha256``, computed from the output of
    ``normalize_jsdata`` on the decoded text. If decoding or normalization
    raises for any reason, all four normalized entries are ``None``.
    """
    hashers = (
        ('md5', hashlib.md5),
        ('sha1', hashlib.sha1),
        ('sha256', hashlib.sha256),
    )

    identifiers = {algo: hasher(data).digest() for algo, hasher in hashers}
    identifiers['size'] = len(data)
    identifiers['loc'] = len(data.splitlines())
    identifiers['description'] = magic.from_buffer(data)
    identifiers['encoding'] = chardet.detect(data)['encoding']

    # Best effort: an undetected encoding (None), a decode error or a
    # failure inside the normalizer all just mean "no normalized data".
    try:
        text = data.decode(identifiers['encoding'])
        normalized_data, normalized_loc = normalize_jsdata(text)
    except Exception:
        normalized_data = None

    has_normalized = normalized_data is not None
    identifiers['normalized_loc'] = normalized_loc if has_normalized else None
    for algo, hasher in hashers:
        identifiers['normalized_' + algo] = (
            hasher(normalized_data).digest() if has_normalized else None)
    return identifiers
def get_file_identifiers(path):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    The file at ``path`` is read completely in binary mode and passed to
    ``get_data_identifiers``. If the resulting description starts with
    ``'gzip'``, the content is additionally decompressed (output capped at
    100 times the compressed size) and identified a second time; those
    values are reported under the same keys prefixed with ``dec_``. For
    files that are not gzip, or if decompression fails, the ``dec_*`` values
    are ``None`` (on failure ``dec_description`` carries the error text).

    Returns a dict with ``filename``, ``path``, ``mimetype`` (the tuple
    returned by ``mimetypes.guess_type``), the data identifier keys and
    their ``dec_``-prefixed counterparts.
    """
    identifier_keys = (
        'md5', 'sha1', 'sha256', 'size', 'loc', 'description', 'encoding',
        'normalized_loc', 'normalized_md5', 'normalized_sha1',
        'normalized_sha256',
    )
    # Defaults used when the file is not gzip or cannot be decompressed.
    dec_data_identifier = dict.fromkeys(identifier_keys)

    with open(path, 'rb') as fileobj:
        data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            # zlib decompression objects are not context managers. The
            # previous ``with zlib.decompressobj(...) as dec:`` raised inside
            # this try block, so the handler below reported every gzip file
            # as failed and the decompressed identifiers were never computed.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # Cap the output size to bound memory use on zip bombs.
            # NOTE(review): when the cap is hit the output is truncated
            # silently (the rest stays in ``dec.unconsumed_tail``), so the
            # dec_* hashes then describe a prefix only.
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            del data
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            dec_data_identifier['description'] = (
                "Exception during compression (likely zip-bomb:" + str(e))
    else:
        del data
    # Release the potentially large buffers before returning.
    gc.collect()

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    for key in identifier_keys:
        file_identifier[key] = data_identifier[key]
    for key in identifier_keys:
        file_identifier['dec_' + key] = dec_data_identifier[key]
    return file_identifier
def path_to_list(path):
"""Convert a path (string) to a list of folders/files."""
plist = []

View File

@ -0,0 +1,140 @@
#!/usr/bin/env python3.5
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
""" Module for obtaining (normalized) md5/sha1/sha256 hashes for files."""
import gc
import hashlib
import mimetypes
import os
import zlib
from io import StringIO
import cchardet as chardet
import magic
from ExtensionCrawler.js_mincer import mince_js
def normalize_jsdata(str_data):
    """Return the whitespace-stripped code of a JavaScript source string.

    The text is passed through ``mince_js``; only blocks for which
    ``is_code()`` is true are kept. Every line of those blocks is stripped
    of leading/trailing whitespace and the pieces are concatenated without
    any separator.

    Returns a tuple ``(normalized_bytes, loc)``: the concatenated text
    encoded with ``str.encode()`` and the number of lines taken from code
    blocks (blank lines inside code blocks are counted as well).
    """
    stripped_lines = []
    with StringIO(str_data) as source:
        for block in mince_js(source):
            if not block.is_code():
                continue
            stripped_lines.extend(
                line.strip() for line in block.content.splitlines())
    return "".join(stripped_lines).encode(), len(stripped_lines)
def get_data_identifiers(data):
    """Compute size, line count, type guesses and hashes for a bytes blob.

    The returned dict contains the raw md5/sha1/sha256 digests, ``size``
    (``len(data)``), ``loc`` (number of lines), ``description`` (from
    ``magic.from_buffer``) and ``encoding`` (from ``chardet.detect``).

    It also contains ``normalized_loc`` and ``normalized_md5`` /
    ``normalized_sha1`` / ``normalized_sha256``, computed from the output of
    ``normalize_jsdata`` on the decoded text. If decoding or normalization
    raises for any reason, all four normalized entries are ``None``.
    """
    hashers = (
        ('md5', hashlib.md5),
        ('sha1', hashlib.sha1),
        ('sha256', hashlib.sha256),
    )

    identifiers = {algo: hasher(data).digest() for algo, hasher in hashers}
    identifiers['size'] = len(data)
    identifiers['loc'] = len(data.splitlines())
    identifiers['description'] = magic.from_buffer(data)
    identifiers['encoding'] = chardet.detect(data)['encoding']

    # Best effort: an undetected encoding (None), a decode error or a
    # failure inside the normalizer all just mean "no normalized data".
    try:
        text = data.decode(identifiers['encoding'])
        normalized_data, normalized_loc = normalize_jsdata(text)
    except Exception:
        normalized_data = None

    has_normalized = normalized_data is not None
    identifiers['normalized_loc'] = normalized_loc if has_normalized else None
    for algo, hasher in hashers:
        identifiers['normalized_' + algo] = (
            hasher(normalized_data).digest() if has_normalized else None)
    return identifiers
def get_file_identifiers(path):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    The file at ``path`` is read completely in binary mode and passed to
    ``get_data_identifiers``. If the resulting description starts with
    ``'gzip'``, the content is additionally decompressed (output capped at
    100 times the compressed size) and identified a second time; those
    values are reported under the same keys prefixed with ``dec_``. For
    files that are not gzip, or if decompression fails, the ``dec_*`` values
    are ``None`` (on failure ``dec_description`` carries the error text).

    Returns a dict with ``filename``, ``path``, ``mimetype`` (the tuple
    returned by ``mimetypes.guess_type``), the data identifier keys and
    their ``dec_``-prefixed counterparts.
    """
    identifier_keys = (
        'md5', 'sha1', 'sha256', 'size', 'loc', 'description', 'encoding',
        'normalized_loc', 'normalized_md5', 'normalized_sha1',
        'normalized_sha256',
    )
    # Defaults used when the file is not gzip or cannot be decompressed.
    dec_data_identifier = dict.fromkeys(identifier_keys)

    with open(path, 'rb') as fileobj:
        data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            # zlib decompression objects are not context managers. The
            # previous ``with zlib.decompressobj(...) as dec:`` raised inside
            # this try block, so the handler below reported every gzip file
            # as failed and the decompressed identifiers were never computed.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # Cap the output size to bound memory use on zip bombs.
            # NOTE(review): when the cap is hit the output is truncated
            # silently (the rest stays in ``dec.unconsumed_tail``), so the
            # dec_* hashes then describe a prefix only.
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            del data
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            dec_data_identifier['description'] = (
                "Exception during compression (likely zip-bomb:" + str(e))
    else:
        del data
    # Release the potentially large buffers before returning.
    gc.collect()

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    for key in identifier_keys:
        file_identifier[key] = data_identifier[key]
    for key in identifier_keys:
        file_identifier['dec_' + key] = dec_data_identifier[key]
    return file_identifier