Compute the simhash from the string representation of binary data when no encoding is available.
This commit is contained in:
parent
52045ed53d
commit
f7cdc03133
|
@ -55,13 +55,14 @@ def get_features(s):
|
||||||
|
|
||||||
def get_simhash(encoding, data):
|
def get_simhash(encoding, data):
|
||||||
"""Compute simhash of text."""
|
"""Compute simhash of text."""
|
||||||
simhash = None
|
str_data = ""
|
||||||
if not encoding is None:
|
if not encoding is None:
|
||||||
str_data = data.decode(encoding)
|
str_data = data.decode(encoding)
|
||||||
simhash = Simhash(get_features(str_data)).value
|
else:
|
||||||
|
str_data = str(data)
|
||||||
|
simhash = Simhash(get_features(str_data)).value
|
||||||
return simhash
|
return simhash
|
||||||
|
|
||||||
|
|
||||||
def compute_difference(hx, hy):
|
def compute_difference(hx, hy):
|
||||||
"""Compute difference between two simhashes."""
|
"""Compute difference between two simhashes."""
|
||||||
assert hx.bit_length() == hy.bit_length()
|
assert hx.bit_length() == hy.bit_length()
|
||||||
|
|
Loading…
Reference in New Issue