diff --git a/experiments/pairwise_benchmark.py b/experiments/benchmark.py similarity index 85% rename from experiments/pairwise_benchmark.py rename to experiments/benchmark.py index e6ffa81..77fcdbb 100644 --- a/experiments/pairwise_benchmark.py +++ b/experiments/benchmark.py @@ -3,7 +3,7 @@ from functools import partial from timeit import default_timer as timer from fog.clustering import * from fog.tokenizers import ngrams -from fog.key import fingerprint +from fog.key import fingerprint, omission_key from Levenshtein import distance as levenshtein with open('./data/universities.csv', 'r') as f: @@ -33,6 +33,10 @@ with open('./data/universities.csv', 'r') as f: clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) print('Blocking (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2)) + print('SNM Omission (%i):' % len(clusters), timer() - start) + print() with open('./data/musicians.csv', 'r') as f: reader = csv.DictReader(f) @@ -53,6 +57,10 @@ with open('./data/musicians.csv', 'r') as f: clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8)) print('Blocking (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2)) + print('SNM Omission (%i):' % len(clusters), timer() - start) + start = timer() clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) diff --git a/fog/clustering/__init__.py b/fog/clustering/__init__.py index edcbcc0..0eaea75 100644 --- a/fog/clustering/__init__.py +++ b/fog/clustering/__init__.py @@ -10,4 +10,5 @@ from fog.clustering.pairwise import ( pairwise_fuzzy_clusters, pairwise_connected_components ) +from 
fog.clustering.sorted_neighborhood import sorted_neighborhood from fog.clustering.vp_tree import vp_tree diff --git a/fog/clustering/blocking.py b/fog/clustering/blocking.py index b46ad90..6773a43 100644 --- a/fog/clustering/blocking.py +++ b/fog/clustering/blocking.py @@ -44,8 +44,8 @@ def blocking_worker(payload): def blocking(data, block=None, blocks=None, similarity=None, distance=None, radius=None, min_size=2, max_size=float('inf'), processes=1): """ - Function returning an iterator over found clusters using the leader - algorithm. + Function returning an iterator over found clusters using the blocking + method. It works by dispatching given items into one or more buckets before computing pairwise comparisons on each bucket. diff --git a/fog/clustering/sorted_neighborhood.py b/fog/clustering/sorted_neighborhood.py new file mode 100644 index 0000000..419a1a3 --- /dev/null +++ b/fog/clustering/sorted_neighborhood.py @@ -0,0 +1,84 @@ +# ============================================================================= +# Fog Sorted Neighborhood Clustering +# ============================================================================= +# +# Implementation of the Sorted Neighborhood method. +# +from collections import defaultdict +from fog.clustering.utils import make_similarity_function + + +def sorted_neighborhood(data, key=None, similarity=None, distance=None, + radius=None, window=10, min_size=2, max_size=float('inf')): + """ + Function returning an iterator over found clusters using the sorted + neighborhood method. + + It works by first sorting the data according to a key which could, if + cleverly chosen, put similar items next to one another in the result. + + We then attempt to find clusters by computing pairwise similarity/distances + in small blocks of constant size in the sorted list. + + Omission key & skeleton keys by Pollock & Zamora are a good choice of + sorting key if you try to find misspellings, for instance. 
+ + Args: + data (iterable): Arbitrary iterable containing data points to gather + into clusters. Will be fully consumed. + key (callable, optional): key on which to sort the data. + similarity (callable): If radius is specified, a function returning + the similarity between two points. Else, a function returning + whether two points should be deemed similar. Alternatively, one can + specify `distance` instead. + distance (callable): If radius is specified, a function returning + the distance between two points. Else, a function returning + whether two points should not be deemed similar. Alternatively, one + can specify `similarity` instead. + radius (number, optional): produced clusters' radius. + window (number, optional): Size of the window in which to look for + matches. Defaults to 10. + min_size (number, optional): minimum number of items in a cluster for + it to be considered viable. Defaults to 2. + max_size (number, optional): maximum number of items in a cluster for + it to be considered viable. Defaults to infinity. + + Yields: + list: A viable cluster. 
+ + """ + + # Formatting similarity + similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius) + + # Iterating over sorted data + S = sorted(data, key=key) + n = len(S) + + graph = defaultdict(list) + + for i in range(n): + A = S[i] + + for j in range(i + 1, min(n, i + window + 1)): + B = S[j] + + if similarity(A, B): + graph[i].append(j) + graph[j].append(i) + + # Building clusters + visited = set() + for i, neighbors in graph.items(): + if i in visited: + continue + + if len(neighbors) + 1 < min_size: + continue + if len(neighbors) + 1 > max_size: + continue + + visited.update(neighbors) + + cluster = [S[i]] + [S[j] for j in neighbors] + yield cluster diff --git a/fog/key/omission.py b/fog/key/omission.py index 952f370..96287ac 100644 --- a/fog/key/omission.py +++ b/fog/key/omission.py @@ -19,8 +19,6 @@ UNDESIRABLES_RE = re.compile(r'[^A-Z]') CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR' VOWELS = set('AEIOU') -# TODO: omission/skeleton key clustering for distance = 1 - def omission_key(string): """ diff --git a/test/clustering/blocking_test.py b/test/clustering/blocking_test.py index d473a4c..c1a2c13 100644 --- a/test/clustering/blocking_test.py +++ b/test/clustering/blocking_test.py @@ -5,7 +5,6 @@ import csv from test.clustering.utils import Clusters from Levenshtein import distance as levenshtein from fog.clustering import blocking -from fog.tokenizers import ngrams DATA = [ 'Abelard', diff --git a/test/clustering/sorted_neighborhood_test.py b/test/clustering/sorted_neighborhood_test.py new file mode 100644 index 0000000..00056a8 --- /dev/null +++ b/test/clustering/sorted_neighborhood_test.py @@ -0,0 +1,31 @@ +# ============================================================================= +# Fog Sorted Neighborhood Unit Tests +# ============================================================================= +import csv +from test.clustering.utils import Clusters +from Levenshtein import distance as levenshtein +from fog.clustering 
import sorted_neighborhood + +DATA = [ + 'Abelard', + 'Abelar', + 'Atrium', + 'Atrides', + 'Belgian', + 'Belgia', + 'Telgia' +] + +CLUSTERS = Clusters([ + ('Abelard', 'Abelar'), + ('Belgian', 'Belgia') +]) + + +class TestSortedNeighborhood(object): + def test_basics(self): + + # Sorting alphabetically + clusters = Clusters(sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=1)) + + assert clusters == CLUSTERS