mirror of https://github.com/Yomguithereal/fog.git
Fixing bench
parent 9b461ec41d
commit c394dd5ccf

@ -14,45 +14,45 @@ with open('./data/universities.csv', 'r') as f:
print('Universities: %i' % len(universities))

# start = timer()
# clusters = list(pairwise_leader(universities, distance=levenshtein, radius=2))
# print('Leader (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_leader(universities, distance=levenshtein, radius=2))
print('Leader (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(pairwise_fuzzy_clusters(universities, distance=levenshtein, radius=2))
# print('Fuzzy clusters (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_fuzzy_clusters(universities, distance=levenshtein, radius=2))
print('Fuzzy clusters (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(pairwise_connected_components(universities, distance=levenshtein, radius=2))
# print('Connected components (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_connected_components(universities, distance=levenshtein, radius=2))
print('Connected components (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(vp_tree(universities, distance=levenshtein, radius=2))
# print('VPTree (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(vp_tree(universities, distance=levenshtein, radius=2))
print('VPTree (%i):' % len(clusters), timer() - start)

start = timer()
clusters = list(quickjoin(universities, distance=levenshtein, radius=2))
print('QuickJoin (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(nn_descent(universities, distance=levenshtein, radius=2))
# print('NN-Descent (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(nn_descent(universities, distance=levenshtein, radius=2))
print('NN-Descent (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
# print('Blocking (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
print('Blocking (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2))
# print('SNM Omission (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2))
print('SNM Omission (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2))
# print('SNM Skeleton (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2))
print('SNM Skeleton (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
# print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)

print()
with open('./data/musicians.csv', 'r') as f:

@ -62,63 +62,54 @@ with open('./data/musicians.csv', 'r') as f:
print('Artists: %i' % len(artists))

# start = timer()
# clusters = list(key_collision(artists, keys=lambda x: ngrams(12, x), merge=True))
# print('12-grams key collision (%i)' % len(clusters), timer() - start)

# start = timer()
# clusters = list(key_collision(artists, key=fingerprint))
# print('Fingerprint key collision (%i)' % len(clusters), timer() - start)

# start = timer()
# clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8))
# print('Blocking (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2))
# print('SNM Omission (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2))
# print('SNM Skeleton (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
# print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)

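# The wrapper below counts every distance computation in the global c so that it
# can be compared against the exhaustive pairwise total g = n * (n - 1) / 2.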
c = 0
g = len(artists) * (len(artists) - 1) / 2

def counting_levenshtein(a, b):
    global c
    c += 1

    return levenshtein(a, b)

start = timer()
clusters = list(key_collision(artists, keys=lambda x: ngrams(12, x), merge=True))
print('12-grams key collision (%i)' % len(clusters), timer() - start)

start = timer()
clusters = list(quickjoin(artists, distance=counting_levenshtein, radius=2, processes=8))
clusters = list(key_collision(artists, key=fingerprint))
print('Fingerprint key collision (%i)' % len(clusters), timer() - start)

start = timer()
clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8))
print('Blocking (%i):' % len(clusters), timer() - start)

start = timer()
clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2))
print('SNM Omission (%i):' % len(clusters), timer() - start)

start = timer()
clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2))
print('SNM Skeleton (%i):' % len(clusters), timer() - start)

start = timer()
clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)

start = timer()
clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8))
print('QuickJoin (%i):' % len(clusters), timer() - start)
print('c', c, 'vs.', g, 'ratio', c / g)

# start = timer()
# clusters = list(nn_descent(artists, distance=levenshtein, radius=2))
# print('NN-Descent (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(nn_descent(artists, distance=levenshtein, radius=2))
print('NN-Descent (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True))
# print('MinHash (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True))
print('MinHash (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(jaccard_intersection_index(artists, radius=0.8, key=lambda x: list(ngrams(5, x))))
# print('Jaccard Intersection Index (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(jaccard_intersection_index(artists, radius=0.8, key=lambda x: list(ngrams(5, x))))
print('Jaccard Intersection Index (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8))
# print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8))
print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8))
# print('Parallel connected components (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8))
print('Parallel connected components (%i):' % len(clusters), timer() - start)

# start = timer()
# clusters = list(vp_tree(artists, distance=levenshtein, radius=2))
# print('VPTree clusters (%i)' % len(clusters), timer() - start)
start = timer()
clusters = list(vp_tree(artists, distance=levenshtein, radius=2))
print('VPTree clusters (%i)' % len(clusters), timer() - start)

@ -16,6 +16,17 @@
# Applications. SISAP 2013. Lecture Notes in Computer Science, vol 8199.
# Springer, Berlin, Heidelberg
#
# [Notes]:
# From what I could gather so far, the methods of Fredriksson & Braithwaite
# to improve the algorithm don't really work with my use case. For instance,
# the book-keeping of the join_pivots methods takes more time than the
# saved distance computations, even with an eta parameter set to a high value.
# I will need to test examples where the distance is more expensive (e.g.,
# with the quite tiny strings tested here, the Levenshtein distance is not
# really prohibitive right now).
#
# Using a Vantage Point Tree does not yield faster results either.
#
import dill
import random
from multiprocessing import Pool
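
A side note on the [Notes] block in the hunk above: the bench already contains the simplest way to check whether an indexing scheme (quickjoin, vp_tree, ...) actually saves work, namely wrapping the distance function so that every call is counted, then comparing the count to the exhaustive n * (n - 1) / 2 pairwise total (see counting_levenshtein, c and g). Below is a minimal, self-contained sketch of that idea; count_calls, naive_all_pairs and toy_distance are illustrative stand-ins invented for this sketch, not part of fog's API, and the exhaustive join is only there to make the example runnable.

from timeit import default_timer as timer

def count_calls(distance):
    # Wrap a distance function so that every call increments a shared counter.
    calls = [0]

    def wrapped(a, b):
        calls[0] += 1
        return distance(a, b)

    return wrapped, calls

def naive_all_pairs(data, distance, radius):
    # Exhaustive O(n^2) join used as the baseline: every pair gets compared.
    for i, a in enumerate(data):
        for b in data[i + 1:]:
            if distance(a, b) <= radius:
                yield a, b

def toy_distance(a, b):
    # Crude edit-distance proxy, only here to keep the sketch dependency-free.
    return sum(x != y for x, y in zip(a, b)) + abs(len(a) - len(b))

data = ['university of oxford', 'university of oxfrod', 'mit', 'm.i.t.']
counting_distance, calls = count_calls(toy_distance)

start = timer()
pairs = list(naive_all_pairs(data, counting_distance, radius=2))
full = len(data) * (len(data) - 1) / 2
print('pairs:', len(pairs), 'time:', timer() - start)
print('calls:', calls[0], 'vs.', full, 'ratio', calls[0] / full)

Swapping naive_all_pairs for one of the methods benchmarked above (and the toy distance for a real Levenshtein) gives the same kind of c vs. g ratio the bench prints, which is the measurement the notes lean on when judging whether the join_pivots book-keeping pays for itself.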