diff --git a/experiments/minhash.py b/experiments/minhash.py
index bba9b1b..68dae12 100644
--- a/experiments/minhash.py
+++ b/experiments/minhash.py
@@ -53,12 +53,21 @@ with open('./data/universities.csv', 'r') as f:
 
     print('%i universities' % len(universities))
 
-    key = lambda x: list(ngrams(3, x))
+    key = lambda x: list(ngrams(5, x))
 
-    clusters = list(minhash(universities, h=256, radius=0.8, key=key))
+    RADIUS = 0.80
+    clusters = list(minhash(universities, h=240, threshold=RADIUS, key=key))
+
+    c = 0
 
     for cluster in clusters:
-        print(cluster)
+        j = jaccard_similarity(key(cluster[0]), key(cluster[1]))
+
+        if j >= RADIUS:
+            c += 1
+            print(cluster, j)
+
+    print('Count', c)
 
     # for cluster in clusters:
     #     print(cluster)
@@ -66,9 +75,11 @@ with open('./data/universities.csv', 'r') as f:
     # TODO: Compare found items, use ngrams also
     print(distinct_values(clusters))
 
-    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=0.8, key=key))
+    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=RADIUS, key=key))
     print(distinct_values(clusters))
 
     for cluster in clusters:
         print(cluster)
+
+    print('Count', len(clusters))
 
diff --git a/fog/clustering/minhash.py b/fog/clustering/minhash.py
index f49f01a..35f6126 100644
--- a/fog/clustering/minhash.py
+++ b/fog/clustering/minhash.py
@@ -12,14 +12,18 @@ import math
 
 from fog.clustering.utils import merge_buckets_into_clusters
 from fog.lsh.minhash import LSBMinHash, MinHash
+from fog.metrics.jaccard import jaccard_similarity
 
-# TODO: optimize probability iteratively to find number of bands
-# Note than ideally, number of rows should divide 64 evenly
-# TODO: else try also to find precision
-# TODO: step 1 bands option + sane iteration + experiments with threshold to see what gives
+# TODO:
+# * Parallelize
+# * use threshold to find bands (works better) similarity just bring more
+# * possibility to hash the band key
+# * note that we allow uneven bands for fine grained results
+# * double_check with minhash or jaccard or sub similarity even
+# * superminhash to generate signature faster
+# * cheap_hashes
 
 
-# TODO: parallelize
 
 def match_probability(h, bands, similarity):
     return 1.0 - (1.0 - similarity ** (h / bands)) ** bands
@@ -29,38 +33,27 @@ def similarity_threshold(h, bands):
     return (1.0 / bands) ** (1 / (h / bands))
 
 
-def guess_bands(h, radius, probability):
+def guess_bands(h, threshold):
     bands = 1
 
     while bands <= h:
-        p = match_probability(h, bands, radius)
+        t = similarity_threshold(h, bands)
 
-        if p >= probability:
+        if t <= threshold:
             break
 
         bands += 1
 
-    while h % bands != 0:
-        bands += 1
-
     return bands
 
 
-# TODO: double_check with jaccard or minhash, sub similarity or true radius
-# TODO: compute on 64 * precision to avoid modulo issues and filtering out
-# TODO: need to think in bands ^ not bands = precision
-# TODO: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4431368/
-# TODO: superminhash https://arxiv.org/pdf/1706.05698.pdf
-# TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm
+def minhash(data, h=256, key=None, threshold=0.8, bands=None):
 
-def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
+    if bands is None:
+        bands = guess_bands(h, threshold)
 
-    bands = guess_bands(h, radius, probability)
     rows = h // bands
-
-    print(bands)
-    print(match_probability(h, bands, radius))
-    print(similarity_threshold(h, bands))
+    h_upper_bound = bands * rows
 
     mh = MinHash(h)
 
@@ -74,37 +67,19 @@ def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
 
         signature = mh.create_signature(k)
 
-        for band in range(0, h, rows):
+        for band in range(0, h_upper_bound, rows):
             band_key = (band, '%'.join(str(n) for n in signature[band:band + rows]))
             buckets[band_key].append(item)
 
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    def double_check(A, B):
+        # Confirm bucket candidates with their actual Jaccard similarity.
+        if key is not None:
+            return jaccard_similarity(key(A), key(B)) >= threshold
+        return jaccard_similarity(A, B) >= threshold
 
 
-def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):
-
-    h = precision * 64
-
-    # NOTE: it seems we need to divide the bands by 2 because of LSB
-    bands = max(1, guess_bands(h, radius, probability) // 2)
-    rows = h // bands
-
-    mh = LSBMinHash(precision=precision)
-
-    buckets = defaultdict(list)
-
-    for item in data:
-        k = item
-
-        if key is not None:
-            k = key(item)
-
-        signature = mh.create_signature(k)
-
-        binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])
-
-        for band in range(0, h, rows):
-            band_key = (band, binary[band:band + rows])
-            buckets[band_key].append(item)
-
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    yield from merge_buckets_into_clusters(
+        buckets.values(),
+        mode='connected_components',
+        similarity=double_check
+    )
diff --git a/fog/clustering/utils.py b/fog/clustering/utils.py
index 52a9184..42dfaa9 100644
--- a/fog/clustering/utils.py
+++ b/fog/clustering/utils.py
@@ -42,7 +42,7 @@ def make_similarity_function(similarity=None, distance=None, radius=None):
 
 
 def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
-                                mode='fuzzy_clusters'):
+                                mode='fuzzy_clusters', similarity=None):
     """
     Function merging buckets into fuzzy clusters. Each bucket will create
     relations in an undirected graph that is later solved to compose clusters.
@@ -54,6 +54,8 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
             infinity.
         mode (string, optional): 'fuzzy_clusters' or 'connected_components'.
             Defaults to 'fuzzy_clusters'.
+        similarity (callable, optional): similarity function to use to validate
+            matches from buckets.
 
     Yields:
         list: A viable cluster.
@@ -70,8 +72,9 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
             for j in range(i + 1, n):
                 B = bucket[j]
 
-                graph[A].add(B)
-                graph[B].add(A)
+                if similarity is None or similarity(A, B):
+                    graph[A].add(B)
+                    graph[B].add(A)
 
     # TODO: leader mode
     if mode == 'fuzzy_clusters':