Better minhash clustering

Author: Yomguithereal
Date:   2018-06-19 20:55:59 +02:00
Parent: 3112ecdfc3
Commit: bbc65cfc06

3 changed files with 47 additions and 59 deletions


@@ -53,12 +53,21 @@ with open('./data/universities.csv', 'r') as f:
     print('%i universities' % len(universities))
 
-    key = lambda x: list(ngrams(3, x))
+    key = lambda x: list(ngrams(5, x))
 
-    clusters = list(minhash(universities, h=256, radius=0.8, key=key))
+    RADIUS = 0.80
+    clusters = list(minhash(universities, h=240, threshold=RADIUS, key=key))
 
+    c = 0
     for cluster in clusters:
-        print(cluster)
+        j = jaccard_similarity(key(cluster[0]), key(cluster[1]))
+
+        if j >= RADIUS:
+            c += 1
+            print(cluster, j)
+
+    print('Count', c)
+
+    # for cluster in clusters:
+    #     print(cluster)
@@ -66,9 +75,11 @@ with open('./data/universities.csv', 'r') as f:
     # TODO: Compare found items, use ngrams also
     print(distinct_values(clusters))
 
-    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=0.8, key=key))
+    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=RADIUS, key=key))
 
     print(distinct_values(clusters))
 
     for cluster in clusters:
         print(cluster)
+
+    print('Count', len(clusters))
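
For context, a minimal self-contained sketch of what the experiment's key function and validation step compute: character 5-grams and their Jaccard similarity. This is plain Python, independent of fog's ngrams helper, and the sample strings are invented.

def char_ngrams(n, string):
    # Distinct character n-grams of `string`.
    return {string[i:i + n] for i in range(len(string) - n + 1)}

def jaccard(a, b):
    # |A & B| / |A | B| over two n-gram sets.
    return len(a & b) / len(a | b)

A = char_ngrams(5, 'University of Oxford')      # 16 distinct 5-grams
B = char_ngrams(5, 'The University of Oxford')  # 20 distinct 5-grams

j = jaccard(A, B)   # 16 shared grams / 20 in the union = 0.8
print(j >= 0.80)    # True: this pair would pass the RADIUS check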

fog/clustering/minhash.py

@@ -12,14 +12,18 @@ import math
 from fog.clustering.utils import merge_buckets_into_clusters
 from fog.lsh.minhash import LSBMinHash, MinHash
+from fog.metrics.jaccard import jaccard_similarity
 
-# TODO: optimize probability iteratively to find the number of bands
-# NOTE: ideally, the number of rows should divide 64 evenly
-# TODO: else, also try to find precision
-# TODO: 1-band step option + sane iteration + experiments with threshold to see what it gives
-# TODO: parallelize
+# TODO:
+#   * Parallelize
+#   * use threshold to find bands (works better); similarity just brings more
+#   * possibility to hash the band key
+#   * note that we allow uneven bands for fine-grained results
+#   * double_check with minhash, jaccard, or even a sub-similarity
+#   * superminhash to generate signatures faster
+#   * cheap_hashes
 
 def match_probability(h, bands, similarity):
     return 1.0 - (1.0 - similarity ** (h / bands)) ** bands
@@ -29,38 +33,27 @@ def similarity_threshold(h, bands):
     return (1.0 / bands) ** (1 / (h / bands))
 
-def guess_bands(h, radius, probability):
+def guess_bands(h, threshold):
     bands = 1
 
     while bands <= h:
-        p = match_probability(h, bands, radius)
+        t = similarity_threshold(h, bands)
 
-        if p >= probability:
+        if t <= threshold:
             break
 
         bands += 1
 
-    while h % bands != 0:
-        bands += 1
-
     return bands
 
-# TODO: double_check with jaccard or minhash, sub-similarity or true radius
-# TODO: compute on 64 * precision to avoid modulo issues and filtering out
-# TODO: need to think in bands ^ not bands = precision
-# TODO: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4431368/
-# TODO: superminhash https://arxiv.org/pdf/1706.05698.pdf
-# TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm
 
-def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
-    bands = guess_bands(h, radius, probability)
+def minhash(data, h=256, key=None, threshold=0.8, bands=None):
+    if bands is None:
+        bands = guess_bands(h, threshold)
+
     rows = h // bands
-
-    print(bands)
-    print(match_probability(h, bands, radius))
-    print(similarity_threshold(h, bands))
+    h_upper_bound = bands * rows
 
     mh = MinHash(h)
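
As a sanity check on the new heuristic, here is a self-contained re-implementation of the two helpers (mirroring the diff above, not imported from fog) with the values the experiment's h=240, threshold=0.8 call would produce:

def similarity_threshold(h, bands):
    # Approximate similarity at which the LSH S-curve crosses 50%:
    # t = (1/b) ** (1/r), with r = h/b rows per band.
    return (1.0 / bands) ** (1 / (h / bands))

def guess_bands(h, threshold):
    # Smallest number of bands whose implied threshold drops below
    # the requested one.
    bands = 1
    while bands <= h:
        if similarity_threshold(h, bands) <= threshold:
            break
        bands += 1
    return bands

h = 240
bands = guess_bands(h, 0.8)   # -> 19 (18 bands still implies ~0.805)
rows = h // bands             # -> 12
h_upper_bound = bands * rows  # -> 228: with uneven bands, the last 12 hashes go unused
print(bands, rows, h_upper_bound)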
@@ -74,37 +67,18 @@ def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
         signature = mh.create_signature(k)
 
-        for band in range(0, h, rows):
+        for band in range(0, h_upper_bound, rows):
             band_key = (band, '%'.join(str(n) for n in signature[band:band + rows]))
             buckets[band_key].append(item)
 
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    def double_check(A, B):
+        if key is not None:
+            return jaccard_similarity(key(A), key(B)) >= threshold
+
+        return jaccard_similarity(A, B) >= threshold
+
+    yield from merge_buckets_into_clusters(
+        buckets.values(),
+        mode='connected_components',
+        similarity=double_check
+    )
 
-def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):
-    h = precision * 64
-
-    # NOTE: it seems we need to divide the bands by 2 because of LSB
-    bands = max(1, guess_bands(h, radius, probability) // 2)
-    rows = h // bands
-
-    mh = LSBMinHash(precision=precision)
-
-    buckets = defaultdict(list)
-
-    for item in data:
-        k = item
-
-        if key is not None:
-            k = key(item)
-
-        signature = mh.create_signature(k)
-        binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])
-
-        for band in range(0, h, rows):
-            band_key = (band, binary[band:band + rows])
-            buckets[band_key].append(item)
-
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
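
To see the candidate/validation split in action, here is a toy sketch of the scheme the new code implements: band keys propose candidate pairs cheaply, and a true-Jaccard double check discards false positives before clustering. The data and the salted-hash signature are invented stand-ins for fog's MinHash.

from collections import defaultdict
from itertools import combinations

def toy_signature(tokens, h):
    # One cheap salted hash per slot; min over the token set.
    return [min(hash((i, t)) for t in tokens) for i in range(h)]

def jaccard(a, b):
    return len(a & b) / len(a | b)

key = lambda s: {s[i:i + 2] for i in range(len(s) - 1)}  # character 2-grams

data = ['abcdef', 'abcdeg', 'uvwxyz']
h, bands, threshold = 32, 16, 0.5
rows = h // bands

buckets = defaultdict(list)
for item in data:
    sig = toy_signature(key(item), h)
    for band in range(0, bands * rows, rows):
        buckets[(band, tuple(sig[band:band + rows]))].append(item)

# Keep only bucketed pairs whose true Jaccard clears the threshold,
# as the new double_check does.
pairs = set()
for bucket in buckets.values():
    for A, B in combinations(bucket, 2):
        if jaccard(key(A), key(B)) >= threshold:
            pairs.add((A, B))

print(pairs)  # almost surely {('abcdef', 'abcdeg')}: 4 shared 2-grams out of 6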

fog/clustering/utils.py

@@ -42,7 +42,7 @@ def make_similarity_function(similarity=None, distance=None, radius=None):
 def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
-                                mode='fuzzy_clusters'):
+                                mode='fuzzy_clusters', similarity=None):
     """
     Function merging buckets into fuzzy clusters. Each bucket will create
     relations in an undirected graph that is later solved to compose clusters.
@@ -54,6 +54,8 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
             infinity.
         mode (string, optional): 'fuzzy_clusters' or 'connected_components'.
             Defaults to 'fuzzy_clusters'.
+        similarity (callable, optional): similarity function used to validate
+            matches from buckets.
 
     Yields:
         list: A viable cluster.
@@ -70,8 +72,9 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
         for j in range(i + 1, n):
             B = bucket[j]
 
-            graph[A].add(B)
-            graph[B].add(A)
+            if similarity is None or similarity(A, B):
+                graph[A].add(B)
+                graph[B].add(A)
 
     # TODO: leader mode
     if mode == 'fuzzy_clusters':
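
Finally, a usage sketch of the new similarity parameter. The buckets and the 0.3 cutoff are invented for illustration; the call itself matches the signature introduced in this diff.

from fog.clustering.utils import merge_buckets_into_clusters

buckets = [
    ['abc', 'abd', 'xyz'],  # 'xyz' is a false positive in this bucket
    ['abd', 'abe'],
]

def validate(A, B):
    # True Jaccard on character 2-grams, with an illustrative 0.3 cutoff.
    a = {A[i:i + 2] for i in range(len(A) - 1)}
    b = {B[i:i + 2] for i in range(len(B) - 1)}
    return len(a & b) / len(a | b) >= 0.3

clusters = list(merge_buckets_into_clusters(
    buckets,
    mode='connected_components',
    similarity=validate
))

# 'xyz' shares no 2-gram with its bucket mates, so no edge reaches it and
# it drops out; 'abc', 'abd' and 'abe' form one connected component.
print(clusters)  # [['abc', 'abd', 'abe']] (member order may vary)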