Refactoring: moved generic file identifiers into own module.

2017-09-16 17:19:36 +01:00 · 2017-09-16 17:19:36 +01:00 · 4cf41e2e4f
parent e98f58fff8
commit 4cf41e2e4f
2 changed files with 141 additions and 120 deletions
--- a/ExtensionCrawler/cdnjs_git.py
+++ b/ExtensionCrawler/cdnjs_git.py
@ -20,25 +20,18 @@
 import gc
 import glob
 import hashlib
 import logging
 import mimetypes
 import os
 import re
 import zlib
 from functools import partial, reduce
 from io import StringIO
 from multiprocessing import Pool
 import csv
 import sys
 import cchardet as chardet
 import dateutil.parser
 import git
 import magic
 from ExtensionCrawler.js_mincer import mince_js
 from ExtensionCrawler.file_identifiers import get_file_identifiers
 def get_add_date(git_path, filename):
    """Method for getting the initial add/commit date of a file."""
@ -104,118 +97,6 @@ def hackish_pull_list_changed_files(git_path):
                files.add(changed_file.strip())
    return list(files)
 def normalize_jsdata(str_data):
    """Compute the normalized code of a JavaScript source string.

    The text is passed through ``mince_js``; only blocks for which
    ``is_code()`` is true are kept. Every line of those blocks is stripped
    of leading and trailing whitespace and the results are concatenated
    without any separator.

    Args:
        str_data: JavaScript source as ``str``.

    Returns:
        A tuple ``(normalized, loc)`` where ``normalized`` is the
        concatenated text encoded with the default ``str.encode()`` codec
        and ``loc`` is the number of lines found in code blocks (blank
        lines included).
    """
    stripped_lines = []
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if block.is_code():
                # Collect the pieces and join once at the end instead of
                # growing a string with ``+=`` inside the loop.
                stripped_lines.extend(
                    line.strip() for line in block.content.splitlines())
    return "".join(stripped_lines).encode(), len(stripped_lines)
 def get_data_identifiers(data):
    """Collect size, line count, hashes and normalized hashes of raw data.

    Args:
        data: File content as ``bytes``.

    Returns:
        A dict with the raw digests (``md5``, ``sha1``, ``sha256``), the
        ``size`` in bytes, the ``loc`` line count, the ``description``
        returned by ``magic.from_buffer``, the ``encoding`` reported by
        ``chardet.detect`` and the ``normalized_*`` counterparts computed
        from ``normalize_jsdata``. The ``normalized_*`` entries are
        ``None`` when decoding or normalization raises.
    """
    hash_names = ('md5', 'sha1', 'sha256')

    identifiers = {}
    for name in hash_names:
        identifiers[name] = getattr(hashlib, name)(data).digest()
    identifiers['size'] = len(data)
    identifiers['loc'] = len(data.splitlines())
    identifiers['description'] = magic.from_buffer(data)
    identifiers['encoding'] = chardet.detect(data)['encoding']

    # Normalization is best effort: an unknown encoding or any failure
    # while decoding/normalizing simply yields empty normalized entries.
    try:
        decoded = data.decode(identifiers['encoding'])
        normalized, normalized_loc = normalize_jsdata(decoded)
    except Exception:
        normalized = None

    if normalized is None:
        identifiers['normalized_loc'] = None
        for name in hash_names:
            identifiers['normalized_' + name] = None
    else:
        identifiers['normalized_loc'] = normalized_loc
        for name in hash_names:
            identifiers['normalized_' + name] = getattr(
                hashlib, name)(normalized).digest()
    return identifiers
 def get_file_identifiers(path):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    The file is read as bytes and described with ``get_data_identifiers``.
    When the resulting description starts with ``gzip`` the content is also
    decompressed, with the output capped at 100 times the compressed size,
    and the identifiers of the decompressed data are reported under
    ``dec_``-prefixed keys. For all other files the ``dec_*`` values are
    ``None``.

    Args:
        path: Path of the file to inspect.

    Returns:
        A dict with ``filename``, ``path``, ``mimetype`` (the value returned
        by ``mimetypes.guess_type``, i.e. a ``(type, encoding)`` tuple), the
        identifiers of the raw data and the ``dec_*`` identifiers. If
        decompression or the analysis of the decompressed data raises,
        ``dec_description`` carries the error text and the remaining
        ``dec_*`` values stay ``None``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    identifier_keys = ('md5', 'sha1', 'sha256', 'size', 'loc', 'description',
                       'encoding', 'normalized_loc', 'normalized_md5',
                       'normalized_sha1', 'normalized_sha256')
    dec_data_identifier = dict.fromkeys(identifier_keys)

    with open(path, 'rb') as fileobj:
        data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            # zlib decompression objects are not context managers, so the
            # object is used directly; wrapping it in ``with`` raises before
            # any data is decompressed and marks every gzip file as failed.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # The max_length argument bounds the output to guard against
            # zip bombs.
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            # Drop the compressed copy before analysing the decompressed one
            # to keep peak memory down.
            del data
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            dec_data_identifier[
                'description'] = "Exception during compression (likely zip-bomb:" + str(
                    e)
    else:
        del data
    gc.collect()

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    for key in identifier_keys:
        file_identifier[key] = data_identifier[key]
    for key in identifier_keys:
        file_identifier['dec_' + key] = dec_data_identifier[key]
    return file_identifier
 def path_to_list(path):
    """Convert a path (string) to a list of folders/files."""
    plist = []
--- a/ExtensionCrawler/file_identifiers.py
+++ b/ExtensionCrawler/file_identifiers.py
@ -0,0 +1,140 @@
 #!/usr/bin/env python3.5
 #
 # Copyright (C) 2016,2017 The University of Sheffield, UK
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 """ Module for obtaining (normalized) md5/sha1/sha256 hashes for files."""
 import gc
 import hashlib
 import mimetypes
 import os
 import zlib
 from io import StringIO
 import cchardet as chardet
 import magic
 from ExtensionCrawler.js_mincer import mince_js
 def normalize_jsdata(str_data):
    """Compute the normalized code of a JavaScript source string.

    The text is passed through ``mince_js``; only blocks for which
    ``is_code()`` is true are kept. Every line of those blocks is stripped
    of leading and trailing whitespace and the results are concatenated
    without any separator.

    Args:
        str_data: JavaScript source as ``str``.

    Returns:
        A tuple ``(normalized, loc)`` where ``normalized`` is the
        concatenated text encoded with the default ``str.encode()`` codec
        and ``loc`` is the number of lines found in code blocks (blank
        lines included).
    """
    stripped_lines = []
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if block.is_code():
                # Collect the pieces and join once at the end instead of
                # growing a string with ``+=`` inside the loop.
                stripped_lines.extend(
                    line.strip() for line in block.content.splitlines())
    return "".join(stripped_lines).encode(), len(stripped_lines)
 def get_data_identifiers(data):
    """Collect size, line count, hashes and normalized hashes of raw data.

    Args:
        data: File content as ``bytes``.

    Returns:
        A dict with the raw digests (``md5``, ``sha1``, ``sha256``), the
        ``size`` in bytes, the ``loc`` line count, the ``description``
        returned by ``magic.from_buffer``, the ``encoding`` reported by
        ``chardet.detect`` and the ``normalized_*`` counterparts computed
        from ``normalize_jsdata``. The ``normalized_*`` entries are
        ``None`` when decoding or normalization raises.
    """
    hash_names = ('md5', 'sha1', 'sha256')

    identifiers = {}
    for name in hash_names:
        identifiers[name] = getattr(hashlib, name)(data).digest()
    identifiers['size'] = len(data)
    identifiers['loc'] = len(data.splitlines())
    identifiers['description'] = magic.from_buffer(data)
    identifiers['encoding'] = chardet.detect(data)['encoding']

    # Normalization is best effort: an unknown encoding or any failure
    # while decoding/normalizing simply yields empty normalized entries.
    try:
        decoded = data.decode(identifiers['encoding'])
        normalized, normalized_loc = normalize_jsdata(decoded)
    except Exception:
        normalized = None

    if normalized is None:
        identifiers['normalized_loc'] = None
        for name in hash_names:
            identifiers['normalized_' + name] = None
    else:
        identifiers['normalized_loc'] = normalized_loc
        for name in hash_names:
            identifiers['normalized_' + name] = getattr(
                hashlib, name)(normalized).digest()
    return identifiers
 def get_file_identifiers(path):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    The file is read as bytes and described with ``get_data_identifiers``.
    When the resulting description starts with ``gzip`` the content is also
    decompressed, with the output capped at 100 times the compressed size,
    and the identifiers of the decompressed data are reported under
    ``dec_``-prefixed keys. For all other files the ``dec_*`` values are
    ``None``.

    Args:
        path: Path of the file to inspect.

    Returns:
        A dict with ``filename``, ``path``, ``mimetype`` (the value returned
        by ``mimetypes.guess_type``, i.e. a ``(type, encoding)`` tuple), the
        identifiers of the raw data and the ``dec_*`` identifiers. If
        decompression or the analysis of the decompressed data raises,
        ``dec_description`` carries the error text and the remaining
        ``dec_*`` values stay ``None``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    identifier_keys = ('md5', 'sha1', 'sha256', 'size', 'loc', 'description',
                       'encoding', 'normalized_loc', 'normalized_md5',
                       'normalized_sha1', 'normalized_sha256')
    dec_data_identifier = dict.fromkeys(identifier_keys)

    with open(path, 'rb') as fileobj:
        data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            # zlib decompression objects are not context managers, so the
            # object is used directly; wrapping it in ``with`` raises before
            # any data is decompressed and marks every gzip file as failed.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # The max_length argument bounds the output to guard against
            # zip bombs.
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            # Drop the compressed copy before analysing the decompressed one
            # to keep peak memory down.
            del data
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            dec_data_identifier[
                'description'] = "Exception during compression (likely zip-bomb:" + str(
                    e)
    else:
        del data
    gc.collect()

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    for key in identifier_keys:
        file_identifier[key] = data_identifier[key]
    for key in identifier_keys:
        file_identifier['dec_' + key] = dec_data_identifier[key]
    return file_identifier