From 933c4d4d118cbf5b43e75a8798c27fb5d7818d1b Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Tue, 12 Sep 2017 23:23:22 +0100 Subject: [PATCH] Determine file description from buffer instead of from file (avoid reading file twice). --- ExtensionCrawler/cdnjs-git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ExtensionCrawler/cdnjs-git.py b/ExtensionCrawler/cdnjs-git.py index 58bc6f9..4ea32fc 100644 --- a/ExtensionCrawler/cdnjs-git.py +++ b/ExtensionCrawler/cdnjs-git.py @@ -22,6 +22,7 @@ import hashlib import mimetypes import os from functools import reduce +import zlib import cchardet as chardet import dateutil.parser @@ -64,7 +65,6 @@ def normalize_file(path, encoding): txt += line.strip() return txt.encode() - def get_file_identifiers(path): """Get basic file identifiers (size, hashes, normalized hashes, etc.).""" with open(path, 'rb') as fileobj: @@ -78,7 +78,7 @@ def get_file_identifiers(path): 'sha256': hashlib.sha256(data).digest(), 'size': len(data), 'mimetype': mimetypes.guess_type(path), - 'description': magic.from_file(path), + 'description': magic.from_buffer(data), 'encoding': chardet.detect(data)['encoding'], }