Tolerate invalid bytes during character decoding (replace them instead of raising).

This commit is contained in:
Achim D. Brucker 2018-02-26 21:23:00 +00:00
parent 527fca78bc
commit d6c7fbd306
1 changed file with 2 additions and 2 deletions

View File

@@ -57,7 +57,7 @@ def get_simhash(encoding, data):
"""Compute simhash of text.""" """Compute simhash of text."""
str_data = "" str_data = ""
if not encoding is None: if not encoding is None:
str_data = data.decode(encoding) str_data = data.decode(encoding=encoding,errors="replace")
else: else:
str_data = str(data) str_data = str(data)
simhash = Simhash(get_features(str_data)).value simhash = Simhash(get_features(str_data)).value
@@ -102,7 +102,7 @@ def get_data_identifiers(data):
} }
try: try:
normalized_data, normalized_loc = normalize_jsdata( normalized_data, normalized_loc = normalize_jsdata(
data.decode(data_identifier['encoding'])) data.decode(encoding=data_identifier['encoding'],errors="replace"))
except Exception: except Exception:
normalized_data = None normalized_data = None