diff --git a/experiments/minhash.py b/experiments/minhash.py index 3b72dd9..bba9b1b 100644 --- a/experiments/minhash.py +++ b/experiments/minhash.py @@ -55,7 +55,7 @@ with open('./data/universities.csv', 'r') as f: key = lambda x: list(ngrams(3, x)) - clusters = list(minhash(universities, precision=8, radius=0.8, key=key)) + clusters = list(minhash(universities, h=256, radius=0.8, key=key)) for cluster in clusters: print(cluster) diff --git a/fog/clustering/minhash.py b/fog/clustering/minhash.py index 730465e..f49f01a 100644 --- a/fog/clustering/minhash.py +++ b/fog/clustering/minhash.py @@ -11,7 +11,7 @@ from collections import defaultdict import math from fog.clustering.utils import merge_buckets_into_clusters -from fog.lsh.minhash import LSBMinHash +from fog.lsh.minhash import LSBMinHash, MinHash # TODO: optimize probability iteratively to find number of bands @@ -29,9 +29,7 @@ def similarity_threshold(h, bands): return (1.0 / bands) ** (1 / (h / bands)) -def guess_bands(precision, radius, probability): - h = precision * 64 - +def guess_bands(h, radius, probability): bands = 1 while bands <= h: @@ -55,12 +53,40 @@ def guess_bands(precision, radius, probability): # TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm -def minhash(data, precision=4, key=None, radius=0.8, probability=0.9): +def minhash(data, h=256, key=None, radius=0.8, probability=0.9): + + bands = guess_bands(h, radius, probability) + rows = h // bands + + print(bands) + print(match_probability(h, bands, radius)) + print(similarity_threshold(h, bands)) + + mh = MinHash(h) + + buckets = defaultdict(list) + + for item in data: + k = item + + if key is not None: + k = key(item) + + signature = mh.create_signature(k) + + for band in range(0, h, rows): + band_key = (band, '%'.join(str(n) for n in signature[band:band + rows])) + buckets[band_key].append(item) + + yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components') + + +def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9): h = precision * 64 # NOTE: it seems we need to divide the bands by 2 because of LSB - bands = max(1, guess_bands(precision, radius, probability) // 2) + bands = max(1, guess_bands(h, radius, probability) // 2) rows = h // bands mh = LSBMinHash(precision=precision) @@ -73,7 +99,7 @@ def minhash(data, precision=4, key=None, radius=0.8, probability=0.9): if key is not None: k = key(item) - signature = mh.hash(k) + signature = mh.create_signature(k) binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])