mirror of https://github.com/Yomguithereal/fog.git
More experiments
This commit is contained in:
parent 18159bdef2
commit a5ecda14f5
@@ -1,6 +1,6 @@
 import csv
 import pytest
-from collections import Counter
+from collections import Counter, defaultdict
 from fog.clustering.fagin import (
     fagin_k1,
     threshold_algorithm_k1,
@@ -19,53 +19,53 @@ with open('./data/fagin_k1_ground_truth.csv') as f:
 
 VECTORS = [Counter(ngrams(2, chars)) for chars in UNIVERSITIES]
 
-with Timer('quadratic'):
-    # with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
-    #     writer = csv.writer(f)
+# with Timer('quadratic'):
+#     # with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
+#     #     writer = csv.writer(f)
 
-    for i in range(len(VECTORS)):
-        v1 = VECTORS[i]
-        best = None
+#     for i in range(len(VECTORS)):
+#         v1 = VECTORS[i]
+#         best = None
 
-        for j in range(len(VECTORS)):
-            if i == j:
-                continue
+#         for j in range(len(VECTORS)):
+#             if i == j:
+#                 continue
 
-            v2 = VECTORS[j]
+#             v2 = VECTORS[j]
 
-            c = sparse_cosine_similarity(v1, v2)
+#             c = sparse_cosine_similarity(v1, v2)
 
-            # NOTE: this is stable and lower index wins
-            if best is None or c > best[0]:
-                best = (c, j)
+#             # NOTE: this is stable and lower index wins
+#             if best is None or c > best[0]:
+#                 best = (c, j)
 
-        # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
-        # writer.writerow([i, best[1], str(best[0])])
+#         # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
+#         # writer.writerow([i, best[1], str(best[0])])
 
-with Timer('FA'):
-    for i, candidates in fagin_k1(VECTORS):
-        v = VECTORS[i]
-        j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
+# with Timer('FA'):
+#     for i, candidates in fagin_k1(VECTORS):
+#         v = VECTORS[i]
+#         j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
 
-        # print("'%s'" % UNIVERSITIES[i])
-        # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
-        # print("'%s'" % UNIVERSITIES[j])
-        # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
+#         # print("'%s'" % UNIVERSITIES[i])
+#         # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
+#         # print("'%s'" % UNIVERSITIES[j])
+#         # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
 
-        assert j == GROUND_TRUTH[i][0]
+#         assert j == GROUND_TRUTH[i][0]
 
-with Timer('TA'):
+# with Timer('TA'):
 
-    # TODO: current heap comparison used is not stable
-    for i, j in threshold_algorithm_k1(VECTORS):
-        if i != j:
-            assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
+#     # TODO: current heap comparison used is not stable
+#     for i, j in threshold_algorithm_k1(VECTORS):
+#         if i != j:
+#             assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
 
-with Timer('naive cosine pairs'):
-    pairs = list(naive_cosine_pairs(VECTORS))
-    n = len(VECTORS) * (len(VECTORS) - 1) // 2
+# with Timer('naive cosine pairs'):
+#     pairs = list(naive_cosine_pairs(VECTORS))
+#     n = len(VECTORS) * (len(VECTORS) - 1) // 2
 
-    print(len(pairs), n, len(pairs) / n)
+#     print(len(pairs), n, len(pairs) / n)
 
 with Timer('sqrt indices'):
     pairs = list(sqrt_indexation_pairs(VECTORS))
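Note: the quantity timed in the blocks above is a cosine similarity computed directly on sparse Counter vectors of character 2-grams. A minimal, self-contained sketch of such a similarity follows; fog's own sparse_cosine_similarity may be implemented differently, this is only an illustration of the measure being compared.

import math
from collections import Counter

def sparse_cosine(v1, v2):
    # Iterate over the smaller vector's keys; Counter returns 0 for missing keys.
    if len(v2) < len(v1):
        v1, v2 = v2, v1
    dot = sum(w * v2[k] for k, w in v1.items())
    norm1 = math.sqrt(sum(w * w for w in v1.values()))
    norm2 = math.sqrt(sum(w * w for w in v2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)

# Two toy 2-gram profiles sharing two bigrams out of three.
a = Counter(['un', 'ni', 'iv'])
b = Counter(['un', 'ni', 'it'])
print(sparse_cosine(a, b))  # 2 / 3 ~= 0.667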
@@ -74,3 +74,33 @@ with Timer('sqrt indices'):
     print(len(pairs), n, len(pairs) / n)
 
+    print(len(set(i for i, _ in pairs) | set(j for _, j in pairs)), len(VECTORS))
+
+neighbors = defaultdict(lambda: (0, None))
+
+for _ in range(5):
+    pairs = sqrt_indexation_pairs(VECTORS)
+
+    for i, j in pairs:
+        cs = sparse_cosine_similarity(VECTORS[i], VECTORS[j])
+
+        if cs > neighbors[i][0]:
+            neighbors[i] = (cs, j)
+
+        if cs > neighbors[j][0]:
+            neighbors[j] = (cs, i)
+
+T = 0
+P = 0
+for i in range(len(VECTORS)):
+    if GROUND_TRUTH[i][1] < 0.8:
+        continue
+
+    T += 1
+
+    if len(neighbors[i]) < 1:
+        continue
+
+    if neighbors[i][0] == pytest.approx(GROUND_TRUTH[i][1]):
+        P += 1
+
+print(P / T)
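Note: the block added above estimates how often a handful of randomised sqrt_indexation_pairs passes recovers the true nearest neighbour for the easy cases (ground-truth similarity of at least 0.8). The accumulation step can be read as the following hypothetical helper (not part of fog); a missing neighbour stays at the default (0, None), so it is best detected through the tuple's second element.

from collections import defaultdict

def best_neighbors(pair_batches, similarity):
    # For every index, keep the most similar partner seen across all batches.
    neighbors = defaultdict(lambda: (0, None))

    for pairs in pair_batches:
        for i, j in pairs:
            s = similarity(i, j)
            if s > neighbors[i][0]:
                neighbors[i] = (s, j)
            if s > neighbors[j][0]:
                neighbors[j] = (s, i)

    return neighbors

With such a helper, the printed ratio P / T is simply the share of those easy cases whose stored similarity matches the ground-truth similarity up to pytest.approx.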
@@ -162,7 +162,7 @@ def sqrt_indexation_pairs(vectors):
     proximities = defaultdict(list)
 
     for i, v in enumerate(vectors):
-        leader = max(leaders, key=lambda x: sparse_cosine_similarity(v, vectors[x]))
+        leader = min(leaders, key=lambda x: 1.0 - sparse_cosine_similarity(v, vectors[x]))
 
         l = proximities[leader]
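Note: both the old and the new line attach each vector to the leader it is most similar to; taking the max over the similarity and the min over one minus the similarity select the same element up to floating-point rounding. For context, a rough sketch of the leader-bucketing idea the function name suggests (sample roughly sqrt(n) leaders, bucket every vector under its nearest leader, only compare vectors within a bucket); fog's actual sqrt_indexation_pairs may differ in its details.

import math
import random
from itertools import combinations

def sqrt_bucket_pairs(vectors, similarity, rng=random):
    # Hypothetical sketch, not fog's implementation.
    n = len(vectors)
    k = max(1, int(math.sqrt(n)))
    leaders = rng.sample(range(n), k)  # ~sqrt(n) randomly chosen leader indices

    buckets = {}
    for i, v in enumerate(vectors):
        # Attach each vector to its most similar leader.
        leader = min(leaders, key=lambda x: 1.0 - similarity(v, vectors[x]))
        buckets.setdefault(leader, []).append(i)

    # Candidate pairs are only generated within a bucket.
    for bucket in buckets.values():
        yield from combinations(bucket, 2)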