From ad2af517a3603eb767ccc781e1707e7203c40397 Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Sun, 10 Sep 2017 17:40:30 +0100 Subject: [PATCH] Aggressively try to normalize as many filetypes as possible. --- ExtensionCrawler/cdnjs-git.py | 37 ++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/ExtensionCrawler/cdnjs-git.py b/ExtensionCrawler/cdnjs-git.py index 2cf26ca..74ec4df 100644 --- a/ExtensionCrawler/cdnjs-git.py +++ b/ExtensionCrawler/cdnjs-git.py @@ -18,12 +18,14 @@ """ Module for obtaining md5/sha1/sha256 hashes for all files available at CDNJS.com by mining the cdnjs git repository.""" -import os import hashlib +import mimetypes +import os import cchardet as chardet import dateutil.parser import git +import magic from ExtensionCrawler.js_mincer import mince_js @@ -66,18 +68,35 @@ def get_file_identifiers(path): """Get basic file identifiers (size, hashes, normalized hashes, etc.).""" with open(path, 'rb') as fileobj: data = fileobj.read() - encoding = chardet.detect(data)['encoding'] - normalized_data = normalize_file(path, encoding) - return ({ + + file_identifier = { 'filename': os.path.basename(path), 'path': path, 'md5': hashlib.md5(data).digest(), 'sha1': hashlib.sha1(data).digest(), 'sha256': hashlib.sha256(data).digest(), - 'normalized_md5': hashlib.md5(normalized_data).digest(), - 'normalized_sha1': hashlib.sha1(normalized_data).digest(), - 'normalized_sha256': hashlib.sha256(normalized_data).digest(), 'size': len(data), + 'mimetype': mimetypes.guess_type(path), + 'description': magic.from_file(path), 'encoding': chardet.detect(data)['encoding'], - 'comment': "" - }) + } + + try: + normalized_data = normalize_file(path, file_identifier['encoding']) + except Exception: + normalized_data = None + + if normalized_data is None: + file_identifier['normalized_md5'] = None + file_identifier['normalized_sha1'] = None + file_identifier['normalized_sha256'] = None + else: + normalized_data = 
normalize_file(path, file_identifier['encoding']) + file_identifier['normalized_md5'] = hashlib.md5( + normalized_data).digest() + file_identifier['normalized_sha1'] = hashlib.sha1( + normalized_data).digest() + file_identifier['normalized_sha256'] = hashlib.sha256( + normalized_data).digest() + + return file_identifier