Aggressively try to normalize as many filetypes as possible.

This commit is contained in:
Achim D. Brucker 2017-09-10 17:40:30 +01:00
parent 06ff5f3057
commit ad2af517a3
1 changed files with 28 additions and 9 deletions

View File

@ -18,12 +18,14 @@
""" Module for obtaining md5/sha1/sha256 hashes for all files available
at CDNJS.com by mining the cdnjs git repository."""
import os
import hashlib
import mimetypes
import os
import cchardet as chardet
import dateutil.parser
import git
import magic
from ExtensionCrawler.js_mincer import mince_js
@ -66,18 +68,35 @@ def get_file_identifiers(path):
"""Get basic file identifiers (size, hashes, normalized hashes, etc.)."""
with open(path, 'rb') as fileobj:
data = fileobj.read()
encoding = chardet.detect(data)['encoding']
normalized_data = normalize_file(path, encoding)
return ({
file_identifier = {
'filename': os.path.basename(path),
'path': path,
'md5': hashlib.md5(data).digest(),
'sha1': hashlib.sha1(data).digest(),
'sha256': hashlib.sha256(data).digest(),
'normalized_md5': hashlib.md5(normalized_data).digest(),
'normalized_sha1': hashlib.sha1(normalized_data).digest(),
'normalized_sha256': hashlib.sha256(normalized_data).digest(),
'size': len(data),
'mimetype': mimetypes.guess_type(path),
'description': magic.from_file(path),
'encoding': chardet.detect(data)['encoding'],
'comment': ""
})
}
try:
normalized_data = normalize_file(path, file_identifier['encoding'])
except Exception:
normalized_data = None
if normalized_data is None:
file_identifier['normalized_md5'] = None
file_identifier['normalized_sha1'] = None
file_identifier['normalized_sha256'] = None
else:
normalized_data = normalize_file(path, file_identifier['encoding'])
file_identifier['normalized_md5'] = hashlib.md5(
normalized_data).digest()
file_identifier['normalized_sha1'] = hashlib.sha1(
normalized_data).digest()
file_identifier['normalized_sha256'] = hashlib.sha256(
normalized_data).digest()
return file_identifier