Compute simhash for string representation of binary data.

This commit is contained in:
Achim D. Brucker 2017-11-21 07:43:49 +00:00
parent 52045ed53d
commit f7cdc03133
1 changed file with 4 additions and 3 deletions

View File

@@ -55,13 +55,14 @@ def get_features(s):
def get_simhash(encoding, data):
    """Compute simhash of text.

    If ``encoding`` is not None, ``data`` is decoded with that encoding.
    Otherwise the ``str()`` representation of ``data`` is used as the text.

    Returns the ``value`` attribute of the ``Simhash`` object built from
    ``get_features`` applied to the resulting text.
    """
    if encoding is not None:
        text = data.decode(encoding)
    else:
        # No encoding supplied: hash the string representation instead.
        # NOTE(review): presumably ``data`` is raw binary (bytes) here, so
        # this is its repr-like form -- confirm against callers.
        text = str(data)
    # Both branches hash the same way, so the computation is done once.
    return Simhash(get_features(text)).value
def compute_difference(hx, hy):
"""Compute difference between two simhashes."""
assert hx.bit_length() == hy.bit_length()