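Reformat code for PEP 8 style: reorder imports, wrap a long line, add spaces around an operator, and drop redundant parentheses in `while True`.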
This commit is contained in:
parent
ea9339bc53
commit
88efe2b8a4
|
@ -21,8 +21,8 @@
|
||||||
import hashlib
|
import hashlib
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
from functools import reduce
|
|
||||||
import zlib
|
import zlib
|
||||||
|
from functools import reduce
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
|
||||||
import cchardet as chardet
|
import cchardet as chardet
|
||||||
|
@ -66,6 +66,7 @@ def normalize_jsdata(str_data):
|
||||||
txt += line.strip()
|
txt += line.strip()
|
||||||
return txt.encode()
|
return txt.encode()
|
||||||
|
|
||||||
|
|
||||||
def get_data_identifiers(data):
|
def get_data_identifiers(data):
|
||||||
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
|
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
|
||||||
data_identifier = {
|
data_identifier = {
|
||||||
|
@ -77,7 +78,8 @@ def get_data_identifiers(data):
|
||||||
'encoding': chardet.detect(data)['encoding'],
|
'encoding': chardet.detect(data)['encoding'],
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
normalized_data = normalize_jsdata(data.decode(data_identifier['encoding']))
|
normalized_data = normalize_jsdata(
|
||||||
|
data.decode(data_identifier['encoding']))
|
||||||
except Exception:
|
except Exception:
|
||||||
normalized_data = None
|
normalized_data = None
|
||||||
|
|
||||||
|
@ -94,16 +96,17 @@ def get_data_identifiers(data):
|
||||||
normalized_data).digest()
|
normalized_data).digest()
|
||||||
return data_identifier
|
return data_identifier
|
||||||
|
|
||||||
|
|
||||||
def get_file_identifiers(path):
|
def get_file_identifiers(path):
|
||||||
"""Get basic file identifiers (path, filename, etc.) and data identifiers."""
|
"""Get basic file identifiers (path, filename, etc.) and data identifiers."""
|
||||||
with open(path, 'rb') as fileobj:
|
with open(path, 'rb') as fileobj:
|
||||||
data = fileobj.read()
|
data = fileobj.read()
|
||||||
|
|
||||||
data_identifier = get_data_identifiers(data)
|
data_identifier = get_data_identifiers(data)
|
||||||
|
|
||||||
if data_identifier['description'].startswith('gzip'):
|
if data_identifier['description'].startswith('gzip'):
|
||||||
with zlib.decompressobj(zlib.MAX_WBITS | 16) as dec:
|
with zlib.decompressobj(zlib.MAX_WBITS | 16) as dec:
|
||||||
dec_data = dec.decompress(data, 30*data_identifier['size'])
|
dec_data = dec.decompress(data, 30 * data_identifier['size'])
|
||||||
dec_data_identifier = get_data_identifiers(dec_data)
|
dec_data_identifier = get_data_identifiers(dec_data)
|
||||||
else:
|
else:
|
||||||
dec_data_identifier = {
|
dec_data_identifier = {
|
||||||
|
@ -119,7 +122,7 @@ def get_file_identifiers(path):
|
||||||
}
|
}
|
||||||
data = None
|
data = None
|
||||||
dec_data = None
|
dec_data = None
|
||||||
|
|
||||||
file_identifier = {
|
file_identifier = {
|
||||||
'filename': os.path.basename(path),
|
'filename': os.path.basename(path),
|
||||||
'path': path,
|
'path': path,
|
||||||
|
@ -150,7 +153,7 @@ def get_file_identifiers(path):
|
||||||
def path_to_list(path):
|
def path_to_list(path):
|
||||||
"""Convert a path (string) to a list of folders/files."""
|
"""Convert a path (string) to a list of folders/files."""
|
||||||
plist = []
|
plist = []
|
||||||
while (True):
|
while True:
|
||||||
(head, tail) = os.path.split(path)
|
(head, tail) = os.path.split(path)
|
||||||
if head == '':
|
if head == '':
|
||||||
if tail == '':
|
if tail == '':
|
||||||
|
|
Loading…
Reference in New Issue