From bbc65cfc06d76c36891b403a22ae19a33db9f3bf Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Tue, 19 Jun 2018 20:55:59 +0200
Subject: [PATCH] Better minhash clustering

---
 experiments/minhash.py    | 19 ++++++++--
 fog/clustering/minhash.py | 78 +++++++++++++--------------------------
 fog/clustering/utils.py   |  9 +++--
 3 files changed, 47 insertions(+), 59 deletions(-)

diff --git a/experiments/minhash.py b/experiments/minhash.py
index bba9b1b..68dae12 100644
--- a/experiments/minhash.py
+++ b/experiments/minhash.py
@@ -53,12 +53,21 @@ with open('./data/universities.csv', 'r') as f:
 
     print('%i universities' % len(universities))
 
-    key = lambda x: list(ngrams(3, x))
+    key = lambda x: list(ngrams(5, x))
 
-    clusters = list(minhash(universities, h=256, radius=0.8, key=key))
+    RADIUS = 0.80
+    clusters = list(minhash(universities, h=240, threshold=RADIUS, key=key))
+
+    c = 0
 
     for cluster in clusters:
-        print(cluster)
+        j = jaccard_similarity(key(cluster[0]), key(cluster[1]))
+
+        if j >= RADIUS:
+            c += 1
+            print(cluster, j)
+
+    print('Count', c)
 
 # for cluster in clusters:
 #     print(cluster)
@@ -66,9 +75,11 @@ with open('./data/universities.csv', 'r') as f:
 # TODO: Compare found items, use ngrams also
     print(distinct_values(clusters))
 
-    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=0.8, key=key))
+    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=RADIUS, key=key))
 
     print(distinct_values(clusters))
 
     for cluster in clusters:
         print(cluster)
+
+    print('Count', len(clusters))
diff --git a/fog/clustering/minhash.py b/fog/clustering/minhash.py
index f49f01a..35f6126 100644
--- a/fog/clustering/minhash.py
+++ b/fog/clustering/minhash.py
@@ -12,14 +12,18 @@ import math
 
 from fog.clustering.utils import merge_buckets_into_clusters
 from fog.lsh.minhash import LSBMinHash, MinHash
+from fog.metrics.jaccard import jaccard_similarity
 
-# TODO: optimize probability iteratively to find number of bands
-# Note than ideally, number of rows should divide 64 evenly
-# TODO: else try also to find precision
-# TODO: step 1 bands option + sane iteration + experiments with threshold to see what gives
+# TODO:
+# * Parallelize
+# * use threshold to find bands (works better; similarity just brings more)
+# * possibility to hash the band key
+# * note that we allow uneven bands for fine-grained results
+# * double_check with minhash, jaccard or even a sub-similarity
+# * superminhash to generate signatures faster
+# * cheap_hashes
 
-# TODO: parallelize
 
 
 def match_probability(h, bands, similarity):
     return 1.0 - (1.0 - similarity ** (h / bands)) ** bands
@@ -29,38 +33,27 @@
 def similarity_threshold(h, bands):
     return (1.0 / bands) ** (1 / (h / bands))
 
 
-def guess_bands(h, radius, probability):
+def guess_bands(h, threshold):
     bands = 1
 
     while bands <= h:
-        p = match_probability(h, bands, radius)
+        t = similarity_threshold(h, bands)
 
-        if p >= probability:
+        if t <= threshold:
             break
 
         bands += 1
 
-    while h % bands != 0:
-        bands += 1
-
     return bands
 
 
-# TODO: double_check with jaccard or minhash, sub similarity or true radius
-# TODO: compute on 64 * precision to avoid modulo issues and filtering out
-# TODO: need to think in bands ^ not bands = precision
-# TODO: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4431368/
-# TODO: superminhash https://arxiv.org/pdf/1706.05698.pdf
-# TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm
+def minhash(data, h=256, key=None, threshold=0.8, bands=None):
 
-def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
+    if bands is None:
+        bands = guess_bands(h, threshold)
 
-    bands = guess_bands(h, radius, probability)
     rows = h // bands
-
-    print(bands)
-    print(match_probability(h, bands, radius))
-    print(similarity_threshold(h, bands))
+    h_upper_bound = bands * rows
 
     mh = MinHash(h)
@@ -74,37 +67,18 @@
 
         signature = mh.create_signature(k)
 
-        for band in range(0, h, rows):
+        for band in range(0, h_upper_bound, rows):
             band_key = (band, '%'.join(str(n) for n in signature[band:band + rows]))
             buckets[band_key].append(item)
 
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    def double_check(A, B):
+        if key is not None:
+            return jaccard_similarity(key(A), key(B)) >= threshold
 
+        return jaccard_similarity(A, B) >= threshold
 
-def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):
-
-    h = precision * 64
-
-    # NOTE: it seems we need to divide the bands by 2 because of LSB
-    bands = max(1, guess_bands(h, radius, probability) // 2)
-    rows = h // bands
-
-    mh = LSBMinHash(precision=precision)
-
-    buckets = defaultdict(list)
-
-    for item in data:
-        k = item
-
-        if key is not None:
-            k = key(item)
-
-        signature = mh.create_signature(k)
-
-        binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])
-
-        for band in range(0, h, rows):
-            band_key = (band, binary[band:band + rows])
-            buckets[band_key].append(item)
-
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    yield from merge_buckets_into_clusters(
+        buckets.values(),
+        mode='connected_components',
+        similarity=double_check
+    )
diff --git a/fog/clustering/utils.py b/fog/clustering/utils.py
index 52a9184..42dfaa9 100644
--- a/fog/clustering/utils.py
+++ b/fog/clustering/utils.py
@@ -42,7 +42,7 @@ def make_similarity_function(similarity=None, distance=None, radius=None):
 
 
 def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
-                                mode='fuzzy_clusters'):
+                                mode='fuzzy_clusters', similarity=None):
     """
     Function merging buckets into fuzzy clusters. Each bucket will create
     relations in an undirected graph that is later solved to compose clusters.
@@ -54,6 +54,8 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
            infinity.
         mode (string, optional): 'fuzzy_clusters' or 'connected_components'.
             Defaults to 'fuzzy_clusters'.
+        similarity (callable, optional): similarity function used to validate
+            matches from buckets.
 
     Yields:
         list: A viable cluster.
@@ -70,8 +72,9 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
         for j in range(i + 1, n):
             B = bucket[j]
 
-            graph[A].add(B)
-            graph[B].add(A)
+            if similarity is None or similarity(A, B):
+                graph[A].add(B)
+                graph[B].add(A)
 
     # TODO: leader mode
     if mode == 'fuzzy_clusters':
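
A note on the banding math above: guess_bands now grows the number of bands until the
configuration's similarity threshold drops to the requested one, instead of targeting a
match probability, and uneven bands are tolerated (the signature is simply truncated to
bands * rows hashes). Below is a minimal standalone sketch of that logic, reusing the two
formulas from the patch; the values in the comments are computed from those formulas, not
taken from the patch.

def match_probability(h, bands, similarity):
    # Probability that two sets with the given Jaccard similarity agree on
    # all rows = h / bands minhashes in at least one band.
    return 1.0 - (1.0 - similarity ** (h / bands)) ** bands


def similarity_threshold(h, bands):
    # Approximate similarity at which the S-curve above starts matching,
    # i.e. (1.0 / bands) ** (bands / h).
    return (1.0 / bands) ** (1 / (h / bands))


def guess_bands(h, threshold):
    # Smallest number of bands whose threshold is at or below the target.
    bands = 1

    while bands <= h:
        if similarity_threshold(h, bands) <= threshold:
            break

        bands += 1

    return bands


bands = guess_bands(256, 0.8)  # -> 20 bands
rows = 256 // bands            # -> 12 rows, so 20 * 12 = 240 hashes are banded

With h=256 and threshold=0.8 this settles on 20 bands of 12 rows, leaving 16 of the 256
hashes unused, which is presumably why the experiment script switches to h=240.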
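
For completeness, a hypothetical usage sketch of the patched clustering routine, mirroring
the experiment script. The import paths and the ngrams/minhash signatures are assumed from
the code above, and the sample strings are invented for illustration.

from fog.clustering import minhash
from fog.tokenizers import ngrams

# Invented sample data.
universities = [
    'University of North Carolina',
    'University of North Carolina at Chapel Hill',
    'North Carolina State University',
]

key = lambda x: list(ngrams(5, x))

# bands is now optional: when omitted it is derived from the Jaccard
# similarity threshold, and every bucketed pair is double-checked against
# the true Jaccard similarity before entering the cluster graph.
clusters = list(minhash(universities, h=256, threshold=0.8, key=key))

for cluster in clusters:
    print(cluster)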