From 827ebd3d2aab5cea2861ea821f88dfb8da8f07bd Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 16 Jul 2018 16:45:23 +0200 Subject: [PATCH] Fixing edge case --- experiments/benchmark.py | 147 +++++++++++++++++++----------------- fog/clustering/quickjoin.py | 2 +- 2 files changed, 79 insertions(+), 70 deletions(-) diff --git a/experiments/benchmark.py b/experiments/benchmark.py index 3e51b27..dd704a9 100644 --- a/experiments/benchmark.py +++ b/experiments/benchmark.py @@ -14,45 +14,45 @@ with open('./data/universities.csv', 'r') as f: print('Universities: %i' % len(universities)) - start = timer() - clusters = list(pairwise_leader(universities, distance=levenshtein, radius=2)) - print('Leader (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(pairwise_leader(universities, distance=levenshtein, radius=2)) + # print('Leader (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(pairwise_fuzzy_clusters(universities, distance=levenshtein, radius=2)) - print('Fuzzy clusters (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(pairwise_fuzzy_clusters(universities, distance=levenshtein, radius=2)) + # print('Fuzzy clusters (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(pairwise_connected_components(universities, distance=levenshtein, radius=2)) - print('Connected components (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(pairwise_connected_components(universities, distance=levenshtein, radius=2)) + # print('Connected components (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(vp_tree(universities, distance=levenshtein, radius=2)) - print('VPTree (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(vp_tree(universities, distance=levenshtein, radius=2)) + # print('VPTree (%i):' % len(clusters), timer() - start) start = timer() clusters = list(quickjoin(universities, distance=levenshtein, radius=2)) print('QuickJoin (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(nn_descent(universities, distance=levenshtein, radius=2)) - print('NN-Descent (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(nn_descent(universities, distance=levenshtein, radius=2)) + # print('NN-Descent (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) - print('Blocking (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) + # print('Blocking (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2)) - print('SNM Omission (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2)) + # print('SNM Omission (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2)) - print('SNM Skeleton (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2)) + # print('SNM Skeleton (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) - print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) + # print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) print() with open('./data/musicians.csv', 'r') as f: @@ -62,54 +62,63 @@ with open('./data/musicians.csv', 'r') as f: print('Artists: %i' % len(artists)) - start = timer() - clusters = list(key_collision(artists, keys=lambda x: ngrams(12, x), merge=True)) - print('12-grams key collision (%i)' % len(clusters), timer() - start) + # start = timer() + # clusters = list(key_collision(artists, keys=lambda x: ngrams(12, x), merge=True)) + # print('12-grams key collision (%i)' % len(clusters), timer() - start) + + # start = timer() + # clusters = list(key_collision(artists, key=fingerprint)) + # print('Fingerprint key collision (%i)' % len(clusters), timer() - start) + + # start = timer() + # clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8)) + # print('Blocking (%i):' % len(clusters), timer() - start) + + # start = timer() + # clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2)) + # print('SNM Omission (%i):' % len(clusters), timer() - start) + + # start = timer() + # clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2)) + # print('SNM Skeleton (%i):' % len(clusters), timer() - start) + + # start = timer() + # clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) + # print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) + + c = 0 + g = len(artists) * (len(artists) - 1) / 2 + def counting_levenshtein(a, b): + global c + c += 1 + + return levenshtein(a, b) start = timer() - clusters = list(key_collision(artists, key=fingerprint)) - print('Fingerprint key collision (%i)' % len(clusters), timer() - start) - - start = timer() - clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8)) - print('Blocking (%i):' % len(clusters), timer() - start) - - start = timer() - clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2)) - print('SNM Omission (%i):' % len(clusters), timer() - start) - - start = timer() - clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2)) - print('SNM Skeleton (%i):' % len(clusters), timer() - start) - - start = timer() - clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2)) - print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start) - - start = timer() - clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8)) + clusters = list(quickjoin(artists, distance=counting_levenshtein, radius=2, processes=8)) print('QuickJoin (%i):' % len(clusters), timer() - start) + print('c', c, 'vs.', g, 'ratio', c / g) - start = timer() - clusters = list(nn_descent(artists, distance=levenshtein, radius=2)) - print('NN-Descent (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(nn_descent(artists, distance=levenshtein, radius=2)) + # print('NN-Descent (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True)) - print('MinHash (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True)) + # print('MinHash (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(jaccard_intersection_index(artists, radius=0.8, key=lambda x: list(ngrams(5, x)))) - print('Jaccard Intersection Index (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(jaccard_intersection_index(artists, radius=0.8, key=lambda x: list(ngrams(5, x)))) + # print('Jaccard Intersection Index (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) - print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) + # print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8)) - print('Parallel connected components (%i):' % len(clusters), timer() - start) + # start = timer() + # clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8)) + # print('Parallel connected components (%i):' % len(clusters), timer() - start) - start = timer() - clusters = list(vp_tree(artists, distance=levenshtein, radius=2)) - print('VPTree clusters (%i)' % len(clusters), timer() - start) + # start = timer() + # clusters = list(vp_tree(artists, distance=levenshtein, radius=2)) + # print('VPTree clusters (%i)' % len(clusters), timer() - start) diff --git a/fog/clustering/quickjoin.py b/fog/clustering/quickjoin.py index e20cc24..c16c086 100644 --- a/fog/clustering/quickjoin.py +++ b/fog/clustering/quickjoin.py @@ -150,7 +150,7 @@ def quickjoin_join_pivots(S1, S2, distance, radius): f = False for l in range(k): - if P[l * N2 + j] - D[l] > radius: + if abs(P[l * N2 + j] - D[l]) > radius: f = True break