From c394dd5ccf6570433f3731b3552ab0acd2b78233 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 16 Jul 2018 17:04:40 +0200 Subject: [PATCH] Fixing bench --- experiments/benchmark.py | 147 +++++++++++++++++------------------- fog/clustering/quickjoin.py | 11 +++ 2 files changed, 80 insertions(+), 78 deletions(-) diff --git a/experiments/benchmark.py b/experiments/benchmark.py index dd704a9..3e51b27 100644 --- a/experiments/benchmark.py +++ b/experiments/benchmark.py @@ -14,45 +14,45 @@ with open('./data/universities.csv', 'r') as f: print('Universities: %i' % len(universities)) - # start = timer() - # clusters = list(pairwise_leader(universities, distance=levenshtein, radius=2)) - # print('Leader (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(pairwise_leader(universities, distance=levenshtein, radius=2)) + print('Leader (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(pairwise_fuzzy_clusters(universities, distance=levenshtein, radius=2)) - # print('Fuzzy clusters (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(pairwise_fuzzy_clusters(universities, distance=levenshtein, radius=2)) + print('Fuzzy clusters (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(pairwise_connected_components(universities, distance=levenshtein, radius=2)) - # print('Connected components (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(pairwise_connected_components(universities, distance=levenshtein, radius=2)) + print('Connected components (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(vp_tree(universities, distance=levenshtein, radius=2)) - # print('VPTree (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(vp_tree(universities, distance=levenshtein, radius=2)) + print('VPTree (%i):' % len(clusters), timer() - start) start = timer() clusters = list(quickjoin(universities, 
distance=levenshtein, radius=2)) print('QuickJoin (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(nn_descent(universities, distance=levenshtein, radius=2)) - # print('NN-Descent (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(nn_descent(universities, distance=levenshtein, radius=2)) + print('NN-Descent (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) - # print('Blocking (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) + print('Blocking (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2)) - # print('SNM Omission (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2)) + print('SNM Omission (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2)) - # print('SNM Skeleton (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2)) + print('SNM Skeleton (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) - # print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) + print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) print() with open('./data/musicians.csv', 'r') as f: 
@@ -62,63 +62,54 @@ with open('./data/musicians.csv', 'r') as f: print('Artists: %i' % len(artists)) - # start = timer() - # clusters = list(key_collision(artists, keys=lambda x: ngrams(12, x), merge=True)) - # print('12-grams key collision (%i)' % len(clusters), timer() - start) - - # start = timer() - # clusters = list(key_collision(artists, key=fingerprint)) - # print('Fingerprint key collision (%i)' % len(clusters), timer() - start) - - # start = timer() - # clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8)) - # print('Blocking (%i):' % len(clusters), timer() - start) - - # start = timer() - # clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2)) - # print('SNM Omission (%i):' % len(clusters), timer() - start) - - # start = timer() - # clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2)) - # print('SNM Skeleton (%i):' % len(clusters), timer() - start) - - # start = timer() - # clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) - # print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) - - c = 0 - g = len(artists) * (len(artists) - 1) / 2 - def counting_levenshtein(a, b): - global c - c += 1 - - return levenshtein(a, b) + start = timer() + clusters = list(key_collision(artists, keys=lambda x: ngrams(12, x), merge=True)) + print('12-grams key collision (%i)' % len(clusters), timer() - start) start = timer() - clusters = list(quickjoin(artists, distance=counting_levenshtein, radius=2, processes=8)) + clusters = list(key_collision(artists, key=fingerprint)) + print('Fingerprint key collision (%i)' % len(clusters), timer() - start) + + start = timer() + clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8)) + print('Blocking (%i):' % len(clusters), timer() - start) + + start = timer() + clusters = 
list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2)) + print('SNM Omission (%i):' % len(clusters), timer() - start) + + start = timer() + clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2)) + print('SNM Skeleton (%i):' % len(clusters), timer() - start) + + start = timer() + clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) + print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) + + start = timer() + clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8)) print('QuickJoin (%i):' % len(clusters), timer() - start) - print('c', c, 'vs.', g, 'ratio', c / g) - # start = timer() - # clusters = list(nn_descent(artists, distance=levenshtein, radius=2)) - # print('NN-Descent (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(nn_descent(artists, distance=levenshtein, radius=2)) + print('NN-Descent (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True)) - # print('MinHash (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True)) + print('MinHash (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(jaccard_intersection_index(artists, radius=0.8, key=lambda x: list(ngrams(5, x)))) - # print('Jaccard Intersection Index (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(jaccard_intersection_index(artists, radius=0.8, key=lambda x: list(ngrams(5, x)))) + print('Jaccard Intersection Index (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) - # print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) + 
start = timer() + clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) + print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8)) - # print('Parallel connected components (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8)) + print('Parallel connected components (%i):' % len(clusters), timer() - start) - # start = timer() - # clusters = list(vp_tree(artists, distance=levenshtein, radius=2)) - # print('VPTree clusters (%i)' % len(clusters), timer() - start) + start = timer() + clusters = list(vp_tree(artists, distance=levenshtein, radius=2)) + print('VPTree clusters (%i)' % len(clusters), timer() - start) diff --git a/fog/clustering/quickjoin.py b/fog/clustering/quickjoin.py index 1701a72..0484312 100644 --- a/fog/clustering/quickjoin.py +++ b/fog/clustering/quickjoin.py @@ -16,6 +16,17 @@ # Applications. SISAP 2013. Lecture Notes in Computer Science, vol 8199. # Springer, Berlin, Heidelberg # +# [Notes]: +# From what I could gather right now, Fredriksson K., Braithwaite B. methods +# to improve the algorithm don't really work with my use-case. For instance, +# the book-keeping of the join_pivots methods takes more time than the +# saved distance computations, even with an eta parameter set to a high value. +# I will need to test examples where the distance is more expensive (e.g., +# testing with quite tiny strings, the Levenshtein distance is not really +# prohibitive right now). +# +# Using a Vantage Point Tree does not yield faster results either. +# import dill import random from multiprocessing import Pool