More experiments

This commit is contained in:
Yomguithereal 2020-10-07 18:31:02 +02:00
parent 18159bdef2
commit a5ecda14f5
2 changed files with 66 additions and 36 deletions

View File

@@ -1,6 +1,6 @@
import csv
import pytest
from collections import Counter
from collections import Counter, defaultdict
from fog.clustering.fagin import (
fagin_k1,
threshold_algorithm_k1,
@@ -19,53 +19,53 @@ with open('./data/fagin_k1_ground_truth.csv') as f:
VECTORS = [Counter(ngrams(2, chars)) for chars in UNIVERSITIES]
with Timer('quadratic'):
# with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
# writer = csv.writer(f)
# with Timer('quadratic'):
# # with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
# # writer = csv.writer(f)
for i in range(len(VECTORS)):
v1 = VECTORS[i]
best = None
# for i in range(len(VECTORS)):
# v1 = VECTORS[i]
# best = None
for j in range(len(VECTORS)):
if i == j:
continue
# for j in range(len(VECTORS)):
# if i == j:
# continue
v2 = VECTORS[j]
# v2 = VECTORS[j]
c = sparse_cosine_similarity(v1, v2)
# c = sparse_cosine_similarity(v1, v2)
# NOTE: this is stable and lower index wins
if best is None or c > best[0]:
best = (c, j)
# # NOTE: this is stable and lower index wins
# if best is None or c > best[0]:
# best = (c, j)
# print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
# writer.writerow([i, best[1], str(best[0])])
# # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
# # writer.writerow([i, best[1], str(best[0])])
with Timer('FA'):
for i, candidates in fagin_k1(VECTORS):
v = VECTORS[i]
j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
# with Timer('FA'):
# for i, candidates in fagin_k1(VECTORS):
# v = VECTORS[i]
# j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
# print("'%s'" % UNIVERSITIES[i])
# print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
# print("'%s'" % UNIVERSITIES[j])
# print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
# # print("'%s'" % UNIVERSITIES[i])
# # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
# # print("'%s'" % UNIVERSITIES[j])
# # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
assert j == GROUND_TRUTH[i][0]
# assert j == GROUND_TRUTH[i][0]
with Timer('TA'):
# with Timer('TA'):
# TODO: current heap comparison used is not stable
for i, j in threshold_algorithm_k1(VECTORS):
if i != j:
assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
# # TODO: current heap comparison used is not stable
# for i, j in threshold_algorithm_k1(VECTORS):
# if i != j:
# assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
with Timer('naive cosine pairs'):
pairs = list(naive_cosine_pairs(VECTORS))
n = len(VECTORS) * (len(VECTORS) - 1) // 2
# with Timer('naive cosine pairs'):
# pairs = list(naive_cosine_pairs(VECTORS))
# n = len(VECTORS) * (len(VECTORS) - 1) // 2
print(len(pairs), n, len(pairs) / n)
# print(len(pairs), n, len(pairs) / n)
with Timer('sqrt indices'):
pairs = list(sqrt_indexation_pairs(VECTORS))
@@ -74,3 +74,33 @@ with Timer('sqrt indices'):
print(len(pairs), n, len(pairs) / n)
print(len(set(i for i, _ in pairs) | set(j for _, j in pairs)), len(VECTORS))
neighbors = defaultdict(lambda: (0, None))
for _ in range(5):
pairs = sqrt_indexation_pairs(VECTORS)
for i, j in pairs:
cs = sparse_cosine_similarity(VECTORS[i], VECTORS[j])
if cs > neighbors[i][0]:
neighbors[i] = (cs, j)
if cs > neighbors[j][0]:
neighbors[j] = (cs, i)
T = 0
P = 0
for i in range(len(VECTORS)):
if GROUND_TRUTH[i][1] < 0.8:
continue
T += 1
if len(neighbors[i]) < 1:
continue
if neighbors[i][0] == pytest.approx(GROUND_TRUTH[i][1]):
P += 1
print(P / T)

View File

@@ -162,7 +162,7 @@ def sqrt_indexation_pairs(vectors):
proximities = defaultdict(list)
for i, v in enumerate(vectors):
leader = max(leaders, key=lambda x: sparse_cosine_similarity(v, vectors[x]))
leader = min(leaders, key=lambda x: 1.0 - sparse_cosine_similarity(v, vectors[x]))
l = proximities[leader]