From 50bed85577429a3dc5f297c60a18678288c20804 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Fri, 6 Jul 2018 13:34:47 +0200 Subject: [PATCH] Notes for the future --- fog/lsh/simhash.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fog/lsh/simhash.py b/fog/lsh/simhash.py index 0a96baf..f0bb6c7 100644 --- a/fog/lsh/simhash.py +++ b/fog/lsh/simhash.py @@ -16,6 +16,13 @@ import hashlib from fog.lsh.utils import is_power_of_two, popcount, popcount64 +# TODO: reimplement soundly by following: +# https://github.com/reubano/changanya/blob/master/changanya/simhash.py +# Note: Simhash clustering methods I could find are too slow (Mozilla's one), +# and so are even the Hamming space indexation ones. +# I should try again to use the same lsh method used by minhash to see if it +# yields similar results + def simhash(tokens, f=128): assert f <= 512 and not is_power_of_two(f), 'fog.lsh.simhash: f should be a power of 2 <= 512'