Replace invalid bytes (instead of raising an error) during character decoding.

This commit is contained in:
Achim D. Brucker 2018-02-26 21:23:00 +00:00
parent 527fca78bc
commit d6c7fbd306
1 changed file with 2 additions and 2 deletions

View File

@@ -57,7 +57,7 @@ def get_simhash(encoding, data):
"""Compute simhash of text."""
str_data = ""
if not encoding is None:
str_data = data.decode(encoding)
str_data = data.decode(encoding=encoding,errors="replace")
else:
str_data = str(data)
simhash = Simhash(get_features(str_data)).value
@@ -102,7 +102,7 @@ def get_data_identifiers(data):
}
try:
normalized_data, normalized_loc = normalize_jsdata(
data.decode(data_identifier['encoding']))
data.decode(encoding=data_identifier['encoding'],errors="replace"))
except Exception:
normalized_data = None