Notes for the future

This commit is contained in:
Yomguithereal 2018-07-06 13:34:47 +02:00
parent de02bae69d
commit 50bed85577
1 changed files with 7 additions and 0 deletions

View File

@@ -16,6 +16,13 @@ import hashlib
from fog.lsh.utils import is_power_of_two, popcount, popcount64 from fog.lsh.utils import is_power_of_two, popcount, popcount64
# TODO: reimplement soundly by following:
# https://github.com/reubano/changanya/blob/master/changanya/simhash.py
# Note: the Simhash clustering methods I could find are too slow (Mozilla's
# one), and so are even the Hamming space indexing ones.
# I should try again to use the same lsh method used by minhash to see if it
# yields similar results
def simhash(tokens, f=128): def simhash(tokens, f=128):
assert f <= 512 and not is_power_of_two(f), 'fog.lsh.simhash: f should be a power of 2 <= 512' assert f <= 512 and not is_power_of_two(f), 'fog.lsh.simhash: f should be a power of 2 <= 512'