diff --git a/experiments/fagin.py b/experiments/fagin.py
index 5b939c9..4295efe 100644
--- a/experiments/fagin.py
+++ b/experiments/fagin.py
@@ -1,6 +1,6 @@
 import csv
 import pytest
-from collections import Counter
+from collections import Counter, defaultdict
 from fog.clustering.fagin import (
     fagin_k1,
     threshold_algorithm_k1,
@@ -19,53 +19,53 @@ with open('./data/fagin_k1_ground_truth.csv') as f:
 
 VECTORS = [Counter(ngrams(2, chars)) for chars in UNIVERSITIES]
 
-with Timer('quadratic'):
-    # with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
-    #     writer = csv.writer(f)
+# with Timer('quadratic'):
+#     # with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
+#     #     writer = csv.writer(f)
 
-    for i in range(len(VECTORS)):
-        v1 = VECTORS[i]
-        best = None
+#     for i in range(len(VECTORS)):
+#         v1 = VECTORS[i]
+#         best = None
 
-        for j in range(len(VECTORS)):
-            if i == j:
-                continue
+#         for j in range(len(VECTORS)):
+#             if i == j:
+#                 continue
 
-            v2 = VECTORS[j]
+#             v2 = VECTORS[j]
 
-            c = sparse_cosine_similarity(v1, v2)
+#             c = sparse_cosine_similarity(v1, v2)
 
-            # NOTE: this is stable and lower index wins
-            if best is None or c > best[0]:
-                best = (c, j)
+#             # NOTE: this is stable and lower index wins
+#             if best is None or c > best[0]:
+#                 best = (c, j)
 
-        # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
-        # writer.writerow([i, best[1], str(best[0])])
+#         # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
+#         # writer.writerow([i, best[1], str(best[0])])
 
-with Timer('FA'):
-    for i, candidates in fagin_k1(VECTORS):
-        v = VECTORS[i]
-        j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
+# with Timer('FA'):
+#     for i, candidates in fagin_k1(VECTORS):
+#         v = VECTORS[i]
+#         j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
 
-        # print("'%s'" % UNIVERSITIES[i])
-        # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
-        # print("'%s'" % UNIVERSITIES[j])
-        # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
+#         # print("'%s'" % UNIVERSITIES[i])
+#         # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
+#         # print("'%s'" % UNIVERSITIES[j])
+#         # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
 
-        assert j == GROUND_TRUTH[i][0]
+#         assert j == GROUND_TRUTH[i][0]
 
-with Timer('TA'):
+# with Timer('TA'):
 
-    # TODO: current heap comparison used is not stable
-    for i, j in threshold_algorithm_k1(VECTORS):
-        if i != j:
-            assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
+#     # TODO: current heap comparison used is not stable
+#     for i, j in threshold_algorithm_k1(VECTORS):
+#         if i != j:
+#             assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
 
-with Timer('naive cosine pairs'):
-    pairs = list(naive_cosine_pairs(VECTORS))
-    n = len(VECTORS) * (len(VECTORS) - 1) // 2
+# with Timer('naive cosine pairs'):
+#     pairs = list(naive_cosine_pairs(VECTORS))
+#     n = len(VECTORS) * (len(VECTORS) - 1) // 2
 
-    print(len(pairs), n, len(pairs) / n)
+#     print(len(pairs), n, len(pairs) / n)
 
 with Timer('sqrt indices'):
     pairs = list(sqrt_indexation_pairs(VECTORS))
@@ -74,3 +74,33 @@ with Timer('sqrt indices'):
     print(len(pairs), n, len(pairs) / n)
 
     print(len(set(i for i, _ in pairs) | set(j for _, j in pairs)), len(VECTORS))
+
+    neighbors = defaultdict(lambda: (0, None))
+
+    for _ in range(5):
+        pairs = sqrt_indexation_pairs(VECTORS)
+
+        for i, j in pairs:
+            cs = sparse_cosine_similarity(VECTORS[i], VECTORS[j])
+
+            if cs > neighbors[i][0]:
+                neighbors[i] = (cs, j)
+
+            if cs > neighbors[j][0]:
+                neighbors[j] = (cs, i)
+
+    T = 0
+    P = 0
+    for i in range(len(VECTORS)):
+        if GROUND_TRUTH[i][1] < 0.8:
+            continue
+
+        T += 1
+
+        if len(neighbors[i]) < 1:
+            continue
+
+        if neighbors[i][0] == pytest.approx(GROUND_TRUTH[i][1]):
+            P += 1
+
+    print(P / T)
diff --git a/fog/clustering/fagin.py b/fog/clustering/fagin.py
index fc0bccd..5182a41 100644
--- a/fog/clustering/fagin.py
+++ b/fog/clustering/fagin.py
@@ -162,7 +162,7 @@ def sqrt_indexation_pairs(vectors):
     proximities = defaultdict(list)
 
     for i, v in enumerate(vectors):
-        leader = max(leaders, key=lambda x: sparse_cosine_similarity(v, vectors[x]))
+        leader = min(leaders, key=lambda x: 1.0 - sparse_cosine_similarity(v, vectors[x]))
 
         l = proximities[leader]