More experiments

This commit is contained in:
Yomguithereal 2020-10-07 18:31:02 +02:00
parent 18159bdef2
commit a5ecda14f5
2 changed files with 66 additions and 36 deletions

View File

@@ -1,6 +1,6 @@
import csv
import pytest
from collections import Counter
from collections import Counter, defaultdict
from fog.clustering.fagin import (
fagin_k1,
threshold_algorithm_k1,
@@ -19,53 +19,53 @@ with open('./data/fagin_k1_ground_truth.csv') as f:
VECTORS = [Counter(ngrams(2, chars)) for chars in UNIVERSITIES]
with Timer('quadratic'):
# with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
# writer = csv.writer(f)
# with Timer('quadratic'):
# # with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
# # writer = csv.writer(f)
for i in range(len(VECTORS)):
v1 = VECTORS[i]
best = None
# for i in range(len(VECTORS)):
# v1 = VECTORS[i]
# best = None
for j in range(len(VECTORS)):
if i == j:
continue
# for j in range(len(VECTORS)):
# if i == j:
# continue
v2 = VECTORS[j]
# v2 = VECTORS[j]
c = sparse_cosine_similarity(v1, v2)
# c = sparse_cosine_similarity(v1, v2)
# NOTE: this is stable and lower index wins
if best is None or c > best[0]:
best = (c, j)
# # NOTE: this is stable and lower index wins
# if best is None or c > best[0]:
# best = (c, j)
# print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
# writer.writerow([i, best[1], str(best[0])])
# # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
# # writer.writerow([i, best[1], str(best[0])])
with Timer('FA'):
for i, candidates in fagin_k1(VECTORS):
v = VECTORS[i]
j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
# with Timer('FA'):
# for i, candidates in fagin_k1(VECTORS):
# v = VECTORS[i]
# j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
# print("'%s'" % UNIVERSITIES[i])
# print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
# print("'%s'" % UNIVERSITIES[j])
# print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
# # print("'%s'" % UNIVERSITIES[i])
# # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
# # print("'%s'" % UNIVERSITIES[j])
# # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
assert j == GROUND_TRUTH[i][0]
# assert j == GROUND_TRUTH[i][0]
with Timer('TA'):
# with Timer('TA'):
# TODO: current heap comparison used is not stable
for i, j in threshold_algorithm_k1(VECTORS):
if i != j:
assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
# # TODO: current heap comparison used is not stable
# for i, j in threshold_algorithm_k1(VECTORS):
# if i != j:
# assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
with Timer('naive cosine pairs'):
pairs = list(naive_cosine_pairs(VECTORS))
n = len(VECTORS) * (len(VECTORS) - 1) // 2
# with Timer('naive cosine pairs'):
# pairs = list(naive_cosine_pairs(VECTORS))
# n = len(VECTORS) * (len(VECTORS) - 1) // 2
print(len(pairs), n, len(pairs) / n)
# print(len(pairs), n, len(pairs) / n)
with Timer('sqrt indices'):
pairs = list(sqrt_indexation_pairs(VECTORS))
@@ -74,3 +74,33 @@ with Timer('sqrt indices'):
print(len(pairs), n, len(pairs) / n)
print(len(set(i for i, _ in pairs) | set(j for _, j in pairs)), len(VECTORS))
neighbors = defaultdict(lambda: (0, None))
for _ in range(5):
pairs = sqrt_indexation_pairs(VECTORS)
for i, j in pairs:
cs = sparse_cosine_similarity(VECTORS[i], VECTORS[j])
if cs > neighbors[i][0]:
neighbors[i] = (cs, j)
if cs > neighbors[j][0]:
neighbors[j] = (cs, i)
T = 0
P = 0
for i in range(len(VECTORS)):
if GROUND_TRUTH[i][1] < 0.8:
continue
T += 1
if len(neighbors[i]) < 1:
continue
if neighbors[i][0] == pytest.approx(GROUND_TRUTH[i][1]):
P += 1
print(P / T)

View File

@@ -162,7 +162,7 @@ def sqrt_indexation_pairs(vectors):
proximities = defaultdict(list)
for i, v in enumerate(vectors):
leader = max(leaders, key=lambda x: sparse_cosine_similarity(v, vectors[x]))
leader = min(leaders, key=lambda x: 1.0 - sparse_cosine_similarity(v, vectors[x]))
l = proximities[leader]