diff --git a/experiments/benchmark.py b/experiments/benchmark.py index 730476e..d2f0a22 100644 --- a/experiments/benchmark.py +++ b/experiments/benchmark.py @@ -29,6 +29,10 @@ with open('./data/universities.csv', 'r') as f: clusters = list(vp_tree(universities, distance=levenshtein, radius=2)) print('VPTree (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(quickjoin(universities, distance=levenshtein, radius=2)) + print('QuickJoin (%i):' % len(clusters), timer() - start) + start = timer() clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) print('Blocking (%i):' % len(clusters), timer() - start) @@ -69,6 +73,10 @@ with open('./data/musicians.csv', 'r') as f: clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2)) print('SNM Skeleton (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(quickjoin(artists, distance=levenshtein, radius=2)) + print('QuickJoin (%i):' % len(clusters), timer() - start) + start = timer() clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) diff --git a/fog/clustering/__init__.py b/fog/clustering/__init__.py index 0eaea75..ffdf6d9 100644 --- a/fog/clustering/__init__.py +++ b/fog/clustering/__init__.py @@ -10,5 +10,6 @@ from fog.clustering.pairwise import ( pairwise_fuzzy_clusters, pairwise_connected_components ) +from fog.clustering.quickjoin import quickjoin from fog.clustering.sorted_neighborhood import sorted_neighborhood from fog.clustering.vp_tree import vp_tree diff --git a/fog/clustering/blocking.py b/fog/clustering/blocking.py index 388e61c..c96522a 100644 --- a/fog/clustering/blocking.py +++ b/fog/clustering/blocking.py @@ -11,6 +11,7 @@ from multiprocessing import Pool from fog.clustering.utils import make_similarity_function, clusters_from_pairs # TODO: max_block_size to avoid 
ngrams with high DF +# TODO: worker using a VPTree def blocking_worker(payload): diff --git a/fog/clustering/pairwise.py b/fog/clustering/pairwise.py index 36e2772..ad0099c 100644 --- a/fog/clustering/pairwise.py +++ b/fog/clustering/pairwise.py @@ -363,6 +363,7 @@ def pairwise_connected_components(data, similarity=None, distance=None, radius=N for i, j in matches: sets.union(i, j) + # TODO: Should really be using the sparse version for component in sets.components(min_size=min_size, max_size=max_size): yield [data[i] for i in component] diff --git a/fog/clustering/quickjoin.py b/fog/clustering/quickjoin.py new file mode 100644 index 0000000..c8e0b7e --- /dev/null +++ b/fog/clustering/quickjoin.py @@ -0,0 +1,177 @@ +# ============================================================================= +# Fog QuickJoin Clustering +# ============================================================================= +# +# Implementation of the Quick Join algorithm that works by recursively +# partitioning the given data with regard to the triangle inequality in +# order to reduce the amount of necessary distance computations. +# +# [Reference]: +# Jacox, Edwin H., and Hanan Samet. « Metric Space Similarity Joins ». +# ACM Transactions on Database Systems 33, no. 2 (1 June 2008): 1‑38. +# https://doi.org/10.1145/1366102.1366104. +# +# Fredriksson K., Braithwaite B. (2013) Quicker Similarity Joins in Metric +# Spaces. In: Brisaboa N., Pedreira O., Zezula P. (eds) Similarity Search and +# Applications. SISAP 2013. Lecture Notes in Computer Science, vol 8199. 
# Springer, Berlin, Heidelberg
#
import itertools
import random

# Maximum number of pivot draws tried on a subproblem before giving up on
# partitioning it and resolving it by brute force instead.
PIVOT_ATTEMPTS = 8


def partition(S, distance, p, radius, rho):
    """
    Split `S` in two around the pivot `p`, and collect the items of each side
    lying close enough to the boundary to possibly match across it.

    Args:
        S (list): Items to partition.
        distance (callable): The distance function, called as
            `distance(p, item)`.
        p (any): The pivot every item is compared to.
        radius (number): Join radius, used as the width of the windows.
        rho (number): Boundary distance separating both sides.

    Returns:
        tuple: `(L, G, Lw, Gw)` where `L` holds items with `d < rho`, `G`
            items with `d >= rho`, `Lw` the items of `L` with
            `d >= rho - radius` and `Gw` the items of `G` with
            `d <= rho + radius`, `d` being the distance to the pivot.

    """
    L = []
    G = []
    Lw = []
    Gw = []

    window_start = rho - radius
    window_end = rho + radius

    for item in S:
        d = distance(p, item)

        if d < rho:
            L.append(item)

            if window_start <= d:
                Lw.append(item)
        else:
            G.append(item)

            if d <= window_end:
                Gw.append(item)

    return L, G, Lw, Gw


def quickjoin_bruteforce(S1, S2, distance, radius):
    """
    Yield every pair `(A, B)`, with `A` from `S1` and `B` from `S2`, whose
    distance is at most `radius`. Every combination is tested.
    """
    for A in S1:
        for B in S2:
            if distance(A, B) <= radius:
                yield (A, B)


def quickjoin_self_bruteforce(S, distance, radius):
    """
    Yield every pair `(A, B)` of items of `S`, `A` coming before `B`, whose
    distance is at most `radius`. Every unordered pair is tested once.
    """
    for A, B in itertools.combinations(S, 2):
        if distance(A, B) <= radius:
            yield (A, B)


def _draw_pivot_indices(rng, n):
    """
    Draw two different indices in `[0, n)`. Requires `n >= 2`, else no second
    index can be found.
    """
    first = rng.randint(0, n - 1)
    second = rng.randint(0, n - 1)

    while second == first:
        second = rng.randint(0, n - 1)

    return first, second


def quickjoin_pairs(data, distance, radius, block_size=500, seed=None):
    """
    Function returning an iterator over the pairs of items found within the
    given radius using the QuickJoin algorithm.

    Args:
        data (iterable): Arbitrary iterable containing data points. Will be
            fully consumed.
        distance (callable): The distance function to use.
        radius (number): Maximum distance between both items of a pair.
        block_size (number, optional): Size under which a subproblem is
            resolved by brute force. Defaults to 500.
        seed (number, optional): Seed for RNG. Defaults to None.

    Yields:
        tuple: A pair of items `(A, B)`.

    """
    rng = random.Random(seed)

    if not isinstance(data, list):
        data = list(data)

    # Explicit stack instead of recursion. `(S, None)` is a self-join over S
    # while `(S1, S2)` is a join between two windows.
    stack = [(data, None)]

    while stack:
        S1, S2 = stack.pop()

        # QuickJoin procedure
        if S2 is None:
            S = S1
            N = len(S)

            # Nothing to pair, and two distinct pivots could not be drawn
            if N < 2:
                continue

            if N <= block_size:
                yield from quickjoin_self_bruteforce(S, distance, radius)
                continue

            for _ in range(PIVOT_ATTEMPTS):
                i, j = _draw_pivot_indices(rng, N)
                pivot = S[i]
                rho = distance(pivot, S[j])

                L, G, Lw, Gw = partition(S, distance, pivot, radius, rho)

                # Both sides must be non-empty for the subproblems to shrink
                if L and G:
                    break
            else:
                # The pivots keep failing to split S (e.g. S is full of
                # duplicates): pushing S again would loop forever.
                yield from quickjoin_self_bruteforce(S, distance, radius)
                continue

            # Recursion
            stack.append((G, None))
            stack.append((L, None))
            stack.append((Lw, Gw))

        # QuickJoinWin procedure
        else:
            N1 = len(S1)
            N2 = len(S2)

            # An empty side cannot produce any pair
            if N1 == 0 or N2 == 0:
                continue

            N = N1 + N2

            if N <= block_size:
                yield from quickjoin_bruteforce(S1, S2, distance, radius)
                continue

            for _ in range(PIVOT_ATTEMPTS):
                # Pivots are drawn from the concatenation of both windows
                i, j = _draw_pivot_indices(rng, N)
                pivot = S1[i] if i < N1 else S2[i - N1]
                other = S1[j] if j < N1 else S2[j - N1]

                rho = distance(pivot, other)

                L1, G1, Lw1, Gw1 = partition(S1, distance, pivot, radius, rho)
                L2, G2, Lw2, Gw2 = partition(S2, distance, pivot, radius, rho)

                children = ((L1, L2), (G1, G2), (Lw1, Gw2), (Gw1, Lw2))

                # A child as large as its parent is the parent itself
                if all(len(A) + len(B) < N for A, B in children):
                    break
            else:
                # No draw managed to shrink the problem: resolve it directly
                # rather than pushing the very same windows again.
                yield from quickjoin_bruteforce(S1, S2, distance, radius)
                continue

            stack.extend(children)


def quickjoin(data, distance, radius, block_size=500,
              min_size=2, max_size=float('inf'),
              mode='connected_components',
              seed=None):
    """
    Function returning an iterator over found clusters using the QuickJoin
    algorithm.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        distance (callable): The distance function to use. Must be a true
            metric, e.g. the Levenshtein distance.
        radius (number): produced clusters' radius.
        block_size (number, optional): block size where the algorithm will
            switch to brute. Defaults to 500.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.
        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
            Defaults to 'connected_components'.
        seed (number, optional): Seed for RNG. Defaults to None.

    Yields:
        list: A viable cluster.

    """

    # Local import: only this step needs the pair-merging helper
    from fog.clustering.utils import clusters_from_pairs

    pairs = quickjoin_pairs(
        data,
        distance,
        radius,
        block_size=block_size,
        seed=seed
    )

    yield from clusters_from_pairs(
        pairs,
        min_size=min_size,
        max_size=max_size,
        mode=mode,
        fuzzy=True  # TODO: Reconsider when using SparseSets
    )
import csv
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import quickjoin

# Small hand-written sample containing two pairs of near-duplicate names
DATA = [
    'Mister Hyde',
    'Mister Hide',
    'Claudia Loc',
    'Cladia Loc'
]

# Clusters expected from DATA with a radius of 1
CLUSTERS = Clusters([
    ('Mister Hyde', 'Mister Hide'),
    ('Claudia Loc', 'Cladia Loc')
])

# Clusters expected from the universities file with a radius of 1
UNIVERSITY_CLUSTERS = Clusters([
    ('Universidad De Manila', 'Universidad de Manila'),
    ('DePaul University', 'DePauw University'),
    ('Seton Hall University', 'Seton Hill University'),
    ('Baylor University', 'Taylor University')
])

# Distinct values of the `university` column, read once at import time
with open('./data/universities.csv', 'r') as f:
    UNIVERSITIES = {row['university'] for row in csv.DictReader(f)}


class TestQuickJoin(object):
    """Checks quickjoin output against the expected clusters."""

    def test_basics(self):
        """The hand-written sample yields its two expected clusters."""
        found = Clusters(quickjoin(DATA, distance=levenshtein, radius=1))

        assert found == CLUSTERS

    def test_universities(self):
        """The universities dataset yields the expected clusters."""
        found = Clusters(
            quickjoin(UNIVERSITIES, distance=levenshtein, radius=1)
        )

        assert found == UNIVERSITY_CLUSTERS