threshold_algorithm

This commit is contained in:
Yomguithereal 2020-10-07 14:32:40 +02:00
parent 5684d153a2
commit c57dada0fc
3 changed files with 1189 additions and 26 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
import csv
import pytest
from collections import Counter
from fog.clustering.fagin import fagin_k1
from fog.clustering.fagin import fagin_k1, threshold_algorithm_k1
from fog.tokenizers import ngrams
from fog.metrics import sparse_cosine_similarity
from experiments.utils import Timer
@ -13,37 +14,44 @@ with open('./data/fagin_k1_ground_truth.csv') as f:
VECTORS = [Counter(ngrams(5, chars)) for chars in UNIVERSITIES]
with Timer('quadratic'):
with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
# writer = csv.writer(f)
# with Timer('quadratic'):
# with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
# writer = csv.writer(f)
for i in range(len(VECTORS)):
v1 = VECTORS[i]
best = None
# for i in range(len(VECTORS)):
# v1 = VECTORS[i]
# best = None
for j in range(len(VECTORS)):
if i == j:
continue
# for j in range(len(VECTORS)):
# if i == j:
# continue
v2 = VECTORS[j]
# v2 = VECTORS[j]
c = sparse_cosine_similarity(v1, v2)
# c = sparse_cosine_similarity(v1, v2)
# NOTE: this is stable and lower index wins
if best is None or c > best[0]:
best = (c, j)
# # NOTE: this is stable and lower index wins
# if best is None or c > best[0]:
# best = (c, j)
# print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
# writer.writerow([i, best[1], str(best[0])])
# # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
# writer.writerow([i, best[1], str(best[0])])
with Timer('Fagin'):
for i, candidates in fagin_k1(VECTORS):
v = VECTORS[i]
j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
# with Timer('FA'):
# for i, candidates in fagin_k1(VECTORS):
# v = VECTORS[i]
# j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
# print("'%s'" % UNIVERSITIES[i])
# print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
# print("'%s'" % UNIVERSITIES[j])
# print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
# # print("'%s'" % UNIVERSITIES[i])
# # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
# # print("'%s'" % UNIVERSITIES[j])
# # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
assert j == GROUND_TRUTH[i][0]
# assert j == GROUND_TRUTH[i][0]
# Time the threshold-algorithm variant and check its output against the
# GROUND_TRUTH table.
with Timer('TA'):
# TODO: current heap comparison used is not stable
for i, j in threshold_algorithm_k1(VECTORS):
# Pairs where a vector is matched with itself are skipped, not checked.
if i != j:
# Compares similarity values (with a tolerance) rather than neighbour indices.
# NOTE(review): presumably GROUND_TRUTH[i][1] is the best similarity stored
# for vector i; its loading code is not visible here, so confirm.
assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])

View File

@ -22,6 +22,8 @@
#
from collections import defaultdict, Counter
from fog.metrics.cosine import sparse_cosine_similarity
def fagin_k1(vectors):
inverted_lists = defaultdict(list)
@ -57,3 +59,67 @@ def fagin_k1(vectors):
break
offset += 1
def threshold_algorithm_k1(vectors):
    """Yield, for each vector, the index of a candidate nearest neighbour.

    An inverted list of ``(weight, index)`` pairs is built per dimension.
    For each query vector, the lists of its own dimensions are scanned in
    lockstep (one position per round). Every newly seen vector is scored
    with ``sparse_cosine_similarity`` and the two best scores are tracked.
    The scan stops once the second best score reaches the threshold
    computed from the weights at the current position, or once every list
    is exhausted.

    Args:
        vectors: indexable, re-iterable collection of mappings from
            dimension to weight (e.g. ``collections.Counter``).

    Yields:
        ``(i, j)`` tuples, where ``i`` is the index of the query vector and
        ``j`` the index of the selected vector. ``j`` can be equal to ``i``,
        e.g. when no other vector shares a dimension with ``vectors[i]``
        or when the vector is empty, so callers needing a distinct
        neighbour must check for it.
    """
    inverted_lists = defaultdict(list)

    for i, vector in enumerate(vectors):
        for d, w in vector.items():
            inverted_lists[d].append((w, i))

    # NOTE(review): lists are sorted in ascending (weight, index) order; the
    # textbook threshold algorithm scans by decreasing score -- confirm that
    # this ordering is the intended one.
    for postings in inverted_lists.values():
        postings.sort()

    for i, vector in enumerate(vectors):

        # A vector without any dimension has no inverted list to scan, hence
        # nothing would ever be scored: report the vector itself instead of
        # failing on an empty result.
        if not vector:
            yield i, i
            continue

        visited = set()
        offset = 0

        # Best and second best (similarity, index) pairs seen so far. The
        # vector itself is part of its own inverted lists, which is why the
        # runner-up is the one reported when available.
        top = None
        second = None

        # Last weight read in each scanned dimension, used as the threshold
        # vector.
        t_vector = {}

        while True:
            stop = True

            for d in vector:
                postings = inverted_lists[d]

                if offset >= len(postings):
                    continue

                stop = False

                w, j = postings[offset]
                t_vector[d] = w

                if j in visited:
                    continue

                cs = sparse_cosine_similarity(vector, vectors[j])
                visited.add(j)

                if top is None:
                    top = (cs, j)
                elif cs > top[0]:
                    second = top
                    top = (cs, j)
                elif second is None or cs > second[0]:
                    second = (cs, j)

            # Final break + return self if best cos is 0.0
            if stop:
                yield i, second[1] if second is not None else top[1]
                break

            t = sparse_cosine_similarity(vector, t_vector)

            # Early exit: the runner-up already reaches the threshold.
            if second is not None and second[0] >= t:
                yield i, second[1]
                break

            offset += 1