threshold_algorithm

2020-10-07 14:32:40 +02:00 · 2020-10-07 14:32:40 +02:00 · c57dada0fc
parent 5684d153a2
commit c57dada0fc
3 changed files with 1189 additions and 26 deletions
--- a/data/fagin_k1_ground_truth.csv
+++ b/data/fagin_k1_ground_truth.csv
--- a/experiments/fagin.py
+++ b/experiments/fagin.py
@ -1,6 +1,7 @@
 import csv
+import pytest
 from collections import Counter
-from fog.clustering.fagin import fagin_k1
+from fog.clustering.fagin import fagin_k1, threshold_algorithm_k1
 from fog.tokenizers import ngrams
 from fog.metrics import sparse_cosine_similarity
 from experiments.utils import Timer
@ -13,37 +14,44 @@ with open('./data/fagin_k1_ground_truth.csv') as f:

 VECTORS = [Counter(ngrams(5, chars)) for chars in UNIVERSITIES]

-with Timer('quadratic'):
-    with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
+# with Timer('quadratic'):
+#     with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
 #         writer = csv.writer(f)

-        for i in range(len(VECTORS)):
-            v1 = VECTORS[i]
-            best = None
+#         for i in range(len(VECTORS)):
+#             v1 = VECTORS[i]
+#             best = None

-            for j in range(len(VECTORS)):
-                if i == j:
-                    continue
+#             for j in range(len(VECTORS)):
+#                 if i == j:
+#                     continue

-                v2 = VECTORS[j]
+#                 v2 = VECTORS[j]

-                c = sparse_cosine_similarity(v1, v2)
+#                 c = sparse_cosine_similarity(v1, v2)

-                # NOTE: this is stable and lower index wins
-                if best is None or c > best[0]:
-                    best = (c, j)
+#                 # NOTE: this is stable and lower index wins
+#                 if best is None or c > best[0]:
+#                     best = (c, j)

-            # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
+#             # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
 #             writer.writerow([i, best[1], str(best[0])])

-with Timer('Fagin'):
-    for i, candidates in fagin_k1(VECTORS):
-        v = VECTORS[i]
-        j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
+# with Timer('FA'):
+#     for i, candidates in fagin_k1(VECTORS):
+#         v = VECTORS[i]
+#         j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))

-        # print("'%s'" % UNIVERSITIES[i])
-        # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
-        # print("'%s'" % UNIVERSITIES[j])
-        # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
+#         # print("'%s'" % UNIVERSITIES[i])
+#         # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
+#         # print("'%s'" % UNIVERSITIES[j])
+#         # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))

-        assert j == GROUND_TRUTH[i][0]
+#         assert j == GROUND_TRUTH[i][0]
+
+# Run the threshold algorithm over every vector, timing the whole pass, and
+# check each reported neighbour against the precomputed ground truth.
+with Timer('TA'):

+    # TODO: current heap comparison used is not stable
+    # NOTE(review): only similarity values are compared below, not neighbour
+    # indices -- presumably because ties are not broken the same way as in
+    # the ground truth (see TODO above). Confirm before tightening the check.
+    for i, j in threshold_algorithm_k1(VECTORS):
+        # The generator falls back to yielding `i` itself when it has no
+        # other candidate to report; there is nothing to compare then.
+        if i != j:
+            # GROUND_TRUTH[i][1] is presumably the best similarity stored in
+            # the ground truth CSV -- its loading code is not visible here.
+            assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
--- a/fog/clustering/fagin.py
+++ b/fog/clustering/fagin.py
@ -22,6 +22,8 @@
 #
 from collections import defaultdict, Counter

+from fog.metrics.cosine import sparse_cosine_similarity
+

 def fagin_k1(vectors):
    inverted_lists = defaultdict(list)
@ -57,3 +59,67 @@ def fagin_k1(vectors):
                break

            offset += 1
+
+
+def threshold_algorithm_k1(vectors):
+    """
+    Find, for each vector, its most cosine-similar *other* vector using a
+    variant of Fagin's Threshold Algorithm (TA) with k = 1.
+
+    One inverted list is built per dimension and scanned in parallel, one
+    rank at a time, from the highest weight down. After each rank, the
+    weights last read on every list form a "frontier" vector whose cosine
+    similarity with the query is used as the stopping threshold.
+
+    Args:
+        vectors (list of dict): sparse vectors mapping a dimension to its
+            weight.
+
+    Yields:
+        tuple: `(i, j)` with `j` the index of the best neighbour found for
+            vector `i`. `j == i` means no neighbour exists, i.e. the vector
+            is empty or shares no dimension with any other vector.
+
+    """
+    inverted_lists = defaultdict(list)

+    for i, vector in enumerate(vectors):
+        for d, w in vector.items():
+            inverted_lists[d].append((w, i))

+    # TA requires sorted access from the highest weight down, so that the
+    # frontier bounds the entries not read yet. Sorting on the weight only
+    # keeps the sort stable: on equal weights, the lower index comes first.
+    for l in inverted_lists.values():
+        l.sort(key=lambda entry: entry[0], reverse=True)

+    for i, vector in enumerate(vectors):

+        # Seeding `visited` with the query itself excludes it explicitly, so
+        # it can never be reported in place of (or ahead of) a real neighbour
+        visited = {i}
+        offset = 0

+        best = None
+        t_vector = {}

+        while True:
+            stop = True

+            for d in vector:
+                l = inverted_lists[d]

+                # This list is exhausted, its last weight stays in `t_vector`
+                if offset >= len(l):
+                    continue

+                stop = False

+                w, j = l[offset]
+                t_vector[d] = w

+                if j in visited:
+                    continue

+                visited.add(j)
+                cs = sparse_cosine_similarity(vector, vectors[j])

+                # NOTE: strict comparison, first candidate met wins ties
+                if best is None or cs > best[0]:
+                    best = (cs, j)

+            # Every list is exhausted (or the vector is empty)
+            if stop:
+                break

+            t = sparse_cosine_similarity(vector, t_vector)

+            # Early termination: the best candidate reaches the threshold
+            if best is not None and best[0] >= t:
+                break

+            offset += 1

+        # Fall back to the query's own index when no neighbour was found
+        yield i, (i if best is None else best[1])