Refactoring: moved generic file identifiers into own module.

2017-09-16 17:19:36 +01:00 · 2017-09-16 17:19:36 +01:00 · 4cf41e2e4f
parent e98f58fff8
commit 4cf41e2e4f
2 changed files with 141 additions and 120 deletions
--- a/ExtensionCrawler/cdnjs_git.py
+++ b/ExtensionCrawler/cdnjs_git.py
@ -20,25 +20,18 @@

 import gc
 import glob
-import hashlib
 import logging
-import mimetypes
 import os
 import re
-import zlib
 from functools import partial, reduce
-from io import StringIO
 from multiprocessing import Pool
 import csv
 import sys

-import cchardet as chardet
 import dateutil.parser
 import git
-import magic
-
-from ExtensionCrawler.js_mincer import mince_js

+from ExtensionCrawler.file_identifiers import get_file_identifiers

 def get_add_date(git_path, filename):
    """Method for getting the initial add/commit date of a file."""
@ -104,118 +97,6 @@ def hackish_pull_list_changed_files(git_path):
                files.add(changed_file.strip())
    return list(files)

-
-def normalize_jsdata(str_data):
-    """Compute normalized code blocks of a JavaScript file"""
-    txt = ""
-    loc = 0
-    with StringIO(str_data) as str_obj:
-        for block in mince_js(str_obj):
-            if block.is_code():
-                for line in block.content.splitlines():
-                    txt += line.strip()
-                    loc += 1
-    return txt.encode(), loc
-
-
-def get_data_identifiers(data):
-    """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
-    data_identifier = {
-        'md5': hashlib.md5(data).digest(),
-        'sha1': hashlib.sha1(data).digest(),
-        'sha256': hashlib.sha256(data).digest(),
-        'size': len(data),
-        'loc': len(data.splitlines()),
-        'description': magic.from_buffer(data),
-        'encoding': chardet.detect(data)['encoding'],
-    }
-    try:
-        normalized_data, normalized_loc = normalize_jsdata(
-            data.decode(data_identifier['encoding']))
-    except Exception:
-        normalized_data = None
-
-    if normalized_data is None:
-        data_identifier['normalized_loc'] = None
-        data_identifier['normalized_md5'] = None
-        data_identifier['normalized_sha1'] = None
-        data_identifier['normalized_sha256'] = None
-    else:
-        data_identifier['normalized_loc'] = normalized_loc
-        data_identifier['normalized_md5'] = hashlib.md5(
-            normalized_data).digest()
-        data_identifier['normalized_sha1'] = hashlib.sha1(
-            normalized_data).digest()
-        data_identifier['normalized_sha256'] = hashlib.sha256(
-            normalized_data).digest()
-    return data_identifier
-
-
-def get_file_identifiers(path):
-    """Get basic file identifiers (path, filename, etc.) and data identifiers."""
-    dec_data_identifier = {
-        'md5': None,
-        'sha1': None,
-        'sha256': None,
-        'size': None,
-        'loc': None,
-        'description': None,
-        'encoding': None,
-        'normalized_loc': None,
-        'normalized_md5': None,
-        'normalized_sha1': None,
-        'normalized_sha256': None
-    }
-    with open(path, 'rb') as fileobj:
-        data = fileobj.read()
-
-    data_identifier = get_data_identifiers(data)
-
-    if data_identifier['description'].startswith('gzip'):
-        try:
-            with zlib.decompressobj(zlib.MAX_WBITS | 16) as dec:
-                dec_data = dec.decompress(data, 100 * data_identifier['size'])
-                del data
-            dec_data_identifier = get_data_identifiers(dec_data)
-            del dec_data
-        except Exception as e:
-            dec_data_identifier[
-                'description'] = "Exception during compression (likely zip-bomb:" + str(
-                    e)
-    else:
-        del data
-    gc.collect()
-    file_identifier = {
-        'filename': os.path.basename(path),
-        'path': path,
-        'mimetype': mimetypes.guess_type(path),
-        'md5': data_identifier['md5'],
-        'sha1': data_identifier['sha1'],
-        'sha256': data_identifier['sha256'],
-        'size': data_identifier['size'],
-        'loc': data_identifier['loc'],
-        'description': data_identifier['description'],
-        'encoding': data_identifier['encoding'],
-        'normalized_loc': data_identifier['normalized_loc'],
-        'normalized_md5': data_identifier['normalized_md5'],
-        'normalized_sha1': data_identifier['normalized_sha1'],
-        'normalized_sha256': data_identifier['normalized_sha256'],
-        'dec_md5': dec_data_identifier['md5'],
-        'dec_sha1': dec_data_identifier['sha1'],
-        'dec_sha256': dec_data_identifier['sha256'],
-        'dec_size': dec_data_identifier['size'],
-        'dec_loc': dec_data_identifier['loc'],
-        'dec_description': dec_data_identifier['description'],
-        'dec_encoding': dec_data_identifier['encoding'],
-        'dec_normalized_loc': dec_data_identifier['normalized_loc'],
-        'dec_normalized_md5': dec_data_identifier['normalized_md5'],
-        'dec_normalized_sha1': dec_data_identifier['normalized_sha1'],
-        'dec_normalized_sha256': dec_data_identifier['normalized_sha256']
-    }
-
-    return file_identifier
-
-
 def path_to_list(path):
    """Convert a path (string) to a list of folders/files."""
    plist = []
--- a/ExtensionCrawler/file_identifiers.py
+++ b/ExtensionCrawler/file_identifiers.py
@ -0,0 +1,140 @@
+#!/usr/bin/env python3.5
+#
+# Copyright (C) 2016,2017 The University of Sheffield, UK
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+""" Module for obtaining (normalized) md5/sha1/sha256 hashes for files."""
+
+import gc
+import hashlib
+import mimetypes
+import os
+import zlib
+from io import StringIO
+
+import cchardet as chardet
+import magic
+
+from ExtensionCrawler.js_mincer import mince_js
+
+def normalize_jsdata(str_data):
+    """Compute normalized code blocks of a JavaScript file"""
+    txt = ""
+    loc = 0
+    with StringIO(str_data) as str_obj:
+        for block in mince_js(str_obj):
+            if block.is_code():
+                for line in block.content.splitlines():
+                    txt += line.strip()
+                    loc += 1
+    return txt.encode(), loc
+
+
+def get_data_identifiers(data):
+    """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
+    data_identifier = {
+        'md5': hashlib.md5(data).digest(),
+        'sha1': hashlib.sha1(data).digest(),
+        'sha256': hashlib.sha256(data).digest(),
+        'size': len(data),
+        'loc': len(data.splitlines()),
+        'description': magic.from_buffer(data),
+        'encoding': chardet.detect(data)['encoding'],
+    }
+    try:
+        normalized_data, normalized_loc = normalize_jsdata(
+            data.decode(data_identifier['encoding']))
+    except Exception:
+        normalized_data = None
+
+    if normalized_data is None:
+        data_identifier['normalized_loc'] = None
+        data_identifier['normalized_md5'] = None
+        data_identifier['normalized_sha1'] = None
+        data_identifier['normalized_sha256'] = None
+    else:
+        data_identifier['normalized_loc'] = normalized_loc
+        data_identifier['normalized_md5'] = hashlib.md5(
+            normalized_data).digest()
+        data_identifier['normalized_sha1'] = hashlib.sha1(
+            normalized_data).digest()
+        data_identifier['normalized_sha256'] = hashlib.sha256(
+            normalized_data).digest()
+    return data_identifier
+
+
+def get_file_identifiers(path):
+    """Get basic file identifiers (path, filename, etc.) and data identifiers."""
+    dec_data_identifier = {
+        'md5': None,
+        'sha1': None,
+        'sha256': None,
+        'size': None,
+        'loc': None,
+        'description': None,
+        'encoding': None,
+        'normalized_loc': None,
+        'normalized_md5': None,
+        'normalized_sha1': None,
+        'normalized_sha256': None
+    }
+    with open(path, 'rb') as fileobj:
+        data = fileobj.read()
+
+    data_identifier = get_data_identifiers(data)
+
+    if data_identifier['description'].startswith('gzip'):
+        try:
+            with zlib.decompressobj(zlib.MAX_WBITS | 16) as dec:
+                dec_data = dec.decompress(data, 100 * data_identifier['size'])
+                del data
+            dec_data_identifier = get_data_identifiers(dec_data)
+            del dec_data
+        except Exception as e:
+            dec_data_identifier[
+                'description'] = "Exception during compression (likely zip-bomb:" + str(
+                    e)
+    else:
+        del data
+    gc.collect()
+    file_identifier = {
+        'filename': os.path.basename(path),
+        'path': path,
+        'mimetype': mimetypes.guess_type(path),
+        'md5': data_identifier['md5'],
+        'sha1': data_identifier['sha1'],
+        'sha256': data_identifier['sha256'],
+        'size': data_identifier['size'],
+        'loc': data_identifier['loc'],
+        'description': data_identifier['description'],
+        'encoding': data_identifier['encoding'],
+        'normalized_loc': data_identifier['normalized_loc'],
+        'normalized_md5': data_identifier['normalized_md5'],
+        'normalized_sha1': data_identifier['normalized_sha1'],
+        'normalized_sha256': data_identifier['normalized_sha256'],
+        'dec_md5': dec_data_identifier['md5'],
+        'dec_sha1': dec_data_identifier['sha1'],
+        'dec_sha256': dec_data_identifier['sha256'],
+        'dec_size': dec_data_identifier['size'],
+        'dec_loc': dec_data_identifier['loc'],
+        'dec_description': dec_data_identifier['description'],
+        'dec_encoding': dec_data_identifier['encoding'],
+        'dec_normalized_loc': dec_data_identifier['normalized_loc'],
+        'dec_normalized_md5': dec_data_identifier['normalized_md5'],
+        'dec_normalized_sha1': dec_data_identifier['normalized_sha1'],
+        'dec_normalized_sha256': dec_data_identifier['normalized_sha256']
+    }
+
+    return file_identifier