def get_simhash(encoding, data):
    """Compute the simhash of text.

    Args:
        encoding: Name of the codec used to decode ``data``, or ``None``
            when no encoding is known.
        data: Content to fingerprint. It must provide ``decode`` (e.g.
            ``bytes``/``bytearray``) when ``encoding`` is given; without an
            encoding, bytes are decoded as latin-1 and any other object is
            converted with ``str()``.

    Returns:
        The ``value`` of the ``Simhash`` built from the features of the
        resulting text.

    Raises:
        UnicodeDecodeError: If ``data`` cannot be decoded with ``encoding``.
        LookupError: If ``encoding`` does not name a known codec.
    """
    if encoding is not None:
        text = data.decode(encoding)
    elif isinstance(data, (bytes, bytearray)):
        # str(bytes) would return the "b'...'" repr (and raises BytesWarning
        # under -bb), so the hash would cover repr markup instead of content.
        # latin-1 maps every byte to exactly one character and cannot fail.
        text = data.decode("latin-1")
    else:
        text = str(data)
    return Simhash(get_features(text)).value


# NOTE(review): only the head of compute_difference is visible in this
# excerpt; it is reproduced unchanged.
def compute_difference(hx, hy):
    """Compute difference between two simhashes."""
    assert hx.bit_length() == hy.bit_length()