diff --git a/experiments/benchmark.py b/experiments/benchmark.py
index 0776585..3e51b27 100644
--- a/experiments/benchmark.py
+++ b/experiments/benchmark.py
@@ -50,6 +50,10 @@ with open('./data/universities.csv', 'r') as f:
     clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2))
     print('SNM Skeleton (%i):' % len(clusters), timer() - start)
 
+    start = timer()
+    clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
+    print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)
+
 print()
 with open('./data/musicians.csv', 'r') as f:
     reader = csv.DictReader(f)
@@ -78,6 +82,10 @@ with open('./data/musicians.csv', 'r') as f:
     clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2))
     print('SNM Skeleton (%i):' % len(clusters), timer() - start)
 
+    start = timer()
+    clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
+    print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)
+
     start = timer()
     clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8))
     print('QuickJoin (%i):' % len(clusters), timer() - start)
diff --git a/fog/clustering/sorted_neighborhood.py b/fog/clustering/sorted_neighborhood.py
index 6cf624e..8ff2dcf 100644
--- a/fog/clustering/sorted_neighborhood.py
+++ b/fog/clustering/sorted_neighborhood.py
@@ -19,10 +19,10 @@
 from collections import defaultdict
 from fog.clustering.utils import make_similarity_function, clusters_from_pairs
 
-# TODO: multi-pass, adaptive etc.
+# TODO: adaptive etc.
 
 
-def sorted_neighborhood(data, key=None, similarity=None, distance=None,
+def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=None,
                         radius=None, window=10, min_size=2, max_size=float('inf'),
                         mode='connected_components'):
     """
@@ -45,6 +45,8 @@ def sorted_neighborhood(data, key=None, similarity=None, distance=None,
         data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
         key (callable, optional): key on which to sort the data.
+        keys (iterable, optional): list of keys on which to sort for multipass
+            sorted neighborhood method.
         similarity (callable): If radius is specified, a function returning
            the similarity between two points. Else, a function returning
            whether two points should be deemed similar. Alternatively, one can
@@ -72,25 +74,31 @@ def sorted_neighborhood(data, key=None, similarity=None, distance=None,
     similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius)
 
     # Iterating over sorted data
-    S = sorted(data, key=key)
-    n = len(S)
-
-    graph = defaultdict(list)
-
     def clustering():
-        for i in range(n):
-            A = S[i]
+        multipass_keys = keys if keys is not None else [key]
+
+        # Multipass sorts the data once per pass, so one-shot iterables
+        # ("data" is documented as fully consumed) are materialized first.
+        items = data if keys is None else list(data)
 
-            for j in range(i + 1, min(n, i + window)):
-                B = S[j]
+        for k in multipass_keys:
+            S = sorted(items, key=k)
+            n = len(S)
 
-                if similarity(A, B):
-                    yield (A, B)
+            for i in range(n):
+                A = S[i]
+
+                for j in range(i + 1, min(n, i + window)):
+                    B = S[j]
+
+                    if similarity(A, B):
+                        yield (A, B)
 
     # Building clusters
     yield from clusters_from_pairs(
         clustering(),
         min_size=min_size,
         max_size=max_size,
-        mode=mode
+        mode=mode,
+        fuzzy=keys is not None
     )