From d6c7fbd306441cce9905e931e0409c5807032e5a Mon Sep 17 00:00:00 2001 From: "Achim D. Brucker" Date: Mon, 26 Feb 2018 21:23:00 +0000 Subject: [PATCH] Replace undecodable bytes instead of failing during character decoding. --- ExtensionCrawler/file_identifiers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ExtensionCrawler/file_identifiers.py b/ExtensionCrawler/file_identifiers.py index 8a2079a..5c24154 100644 --- a/ExtensionCrawler/file_identifiers.py +++ b/ExtensionCrawler/file_identifiers.py @@ -57,7 +57,7 @@ def get_simhash(encoding, data): """Compute simhash of text.""" str_data = "" if not encoding is None: - str_data = data.decode(encoding) + str_data = data.decode(encoding=encoding,errors="replace") else: str_data = str(data) simhash = Simhash(get_features(str_data)).value @@ -102,7 +102,7 @@ def get_data_identifiers(data): } try: normalized_data, normalized_loc = normalize_jsdata( - data.decode(data_identifier['encoding'])) + data.decode(encoding=data_identifier['encoding'],errors="replace")) except Exception: normalized_data = None