mirror of https://github.com/Yomguithereal/fog.git
threshold_algorithm
This commit is contained in:
parent
5684d153a2
commit
c57dada0fc
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,7 @@
|
|||
import csv
|
||||
import pytest
|
||||
from collections import Counter
|
||||
from fog.clustering.fagin import fagin_k1
|
||||
from fog.clustering.fagin import fagin_k1, threshold_algorithm_k1
|
||||
from fog.tokenizers import ngrams
|
||||
from fog.metrics import sparse_cosine_similarity
|
||||
from experiments.utils import Timer
|
||||
|
@ -13,37 +14,44 @@ with open('./data/fagin_k1_ground_truth.csv') as f:
|
|||
|
||||
VECTORS = [Counter(ngrams(5, chars)) for chars in UNIVERSITIES]
|
||||
|
||||
with Timer('quadratic'):
|
||||
with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
|
||||
# writer = csv.writer(f)
|
||||
# with Timer('quadratic'):
|
||||
# with open('./data/fagin_k1_ground_truth.csv', 'w') as f:
|
||||
# writer = csv.writer(f)
|
||||
|
||||
for i in range(len(VECTORS)):
|
||||
v1 = VECTORS[i]
|
||||
best = None
|
||||
# for i in range(len(VECTORS)):
|
||||
# v1 = VECTORS[i]
|
||||
# best = None
|
||||
|
||||
for j in range(len(VECTORS)):
|
||||
if i == j:
|
||||
continue
|
||||
# for j in range(len(VECTORS)):
|
||||
# if i == j:
|
||||
# continue
|
||||
|
||||
v2 = VECTORS[j]
|
||||
# v2 = VECTORS[j]
|
||||
|
||||
c = sparse_cosine_similarity(v1, v2)
|
||||
# c = sparse_cosine_similarity(v1, v2)
|
||||
|
||||
# NOTE: this is stable and lower index wins
|
||||
if best is None or c > best[0]:
|
||||
best = (c, j)
|
||||
# # NOTE: this is stable and lower index wins
|
||||
# if best is None or c > best[0]:
|
||||
# best = (c, j)
|
||||
|
||||
# print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
|
||||
# writer.writerow([i, best[1], str(best[0])])
|
||||
# # print(UNIVERSITIES[i], UNIVERSITIES[best[1]])
|
||||
# writer.writerow([i, best[1], str(best[0])])
|
||||
|
||||
with Timer('Fagin'):
|
||||
for i, candidates in fagin_k1(VECTORS):
|
||||
v = VECTORS[i]
|
||||
j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
|
||||
# with Timer('FA'):
|
||||
# for i, candidates in fagin_k1(VECTORS):
|
||||
# v = VECTORS[i]
|
||||
# j = max(candidates, key=lambda c: sparse_cosine_similarity(v, VECTORS[c]))
|
||||
|
||||
# print("'%s'" % UNIVERSITIES[i])
|
||||
# print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
|
||||
# print("'%s'" % UNIVERSITIES[j])
|
||||
# print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
|
||||
# # print("'%s'" % UNIVERSITIES[i])
|
||||
# # print("'%s'" % UNIVERSITIES[GROUND_TRUTH[i][0]])
|
||||
# # print("'%s'" % UNIVERSITIES[j])
|
||||
# # print(i, j, len(candidates), GROUND_TRUTH[i], sparse_cosine_similarity(v, VECTORS[j]))
|
||||
|
||||
assert j == GROUND_TRUTH[i][0]
|
||||
# assert j == GROUND_TRUTH[i][0]
|
||||
|
||||
with Timer('TA'):
|
||||
|
||||
# TODO: current heap comparison used is not stable
|
||||
for i, j in threshold_algorithm_k1(VECTORS):
|
||||
if i != j:
|
||||
assert sparse_cosine_similarity(VECTORS[i], VECTORS[j]) == pytest.approx(GROUND_TRUTH[i][1])
|
||||
|
|
|
@ -22,6 +22,8 @@
|
|||
#
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
from fog.metrics.cosine import sparse_cosine_similarity
|
||||
|
||||
|
||||
def fagin_k1(vectors):
|
||||
inverted_lists = defaultdict(list)
|
||||
|
@ -57,3 +59,67 @@ def fagin_k1(vectors):
|
|||
break
|
||||
|
||||
offset += 1
|
||||
|
||||
|
||||
def threshold_algorithm_k1(vectors):
    """
    Generator yielding one `(i, j)` pair per input vector, where `j` is the
    index of the neighbor selected for vector `i` by scanning per-dimension
    inverted lists in lockstep and stopping early once a similarity threshold
    is reached.

    Args:
        vectors (list): sparse vectors given as mappings from dimension to
            weight (anything exposing `.items()` and iteration over keys).

    Yields:
        tuple: `(i, j)`. `j` is the second best candidate seen when at least
            two candidates were scored, else the single candidate seen, else
            `i` itself when the vector has no dimension at all.

    """
    inverted_lists = defaultdict(list)

    # Indexing every vector by dimension as (weight, vector index) postings
    for i, vector in enumerate(vectors):
        for d, w in vector.items():
            inverted_lists[d].append((w, i))

    # NOTE(review): postings are sorted in ascending (weight, index) order.
    # The classic threshold algorithm scans highest scores first; confirm
    # this ordering is the intended one.
    for postings in inverted_lists.values():
        postings.sort()

    for i, vector in enumerate(vectors):
        visited = set()
        offset = 0

        # Two best (similarity, index) candidates seen so far.
        # NOTE(review): the second best is the one reported, presumably
        # because the vector itself is expected to rank first - confirm.
        first = None
        second = None

        # Weights read at the current depth, used to compute the threshold.
        # Exhausted dimensions keep the last weight read from their list.
        t_vector = {}

        while True:
            exhausted = True

            for d in vector:
                postings = inverted_lists[d]

                if offset >= len(postings):
                    continue

                exhausted = False

                w, j = postings[offset]
                t_vector[d] = w

                # Each candidate is scored only once
                if j in visited:
                    continue

                cs = sparse_cosine_similarity(vector, vectors[j])
                visited.add(j)

                # Strict comparisons: on ties, the first candidate seen wins
                if first is None:
                    first = (cs, j)
                elif cs > first[0]:
                    second = first
                    first = (cs, j)
                elif second is None or cs > second[0]:
                    second = (cs, j)

            # Every relevant list was fully read: report what we have
            if exhausted:
                if second is not None:
                    yield i, second[1]
                elif first is not None:
                    yield i, first[1]
                else:
                    # A vector without any dimension never scores a
                    # candidate: fall back to itself rather than crashing
                    yield i, i
                break

            t = sparse_cosine_similarity(vector, t_vector)

            # Early exit once the reported candidate reaches the threshold
            if second is not None and second[0] >= t:
                yield i, second[1]
                break

            offset += 1
|
||||
|
|
Loading…
Reference in New Issue