diff --git a/Makefile b/Makefile
index efd48ed..52cd2fa 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ clean:
 lint:
 	@echo Linting source code using pep8...
-	pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test
+	pycodestyle --ignore E501,E722,E731,E741,W503,W504 $(SOURCE) test
 	@echo
 
 unit:
diff --git a/experiments/benchmark.py b/experiments/benchmark.py
index cb03edd..0776585 100644
--- a/experiments/benchmark.py
+++ b/experiments/benchmark.py
@@ -2,6 +2,7 @@ import csv
 from functools import partial
 from timeit import default_timer as timer
 from fog.clustering import *
+from fog.metrics import jaccard_similarity
 from fog.tokenizers import ngrams
 from fog.key import fingerprint, omission_key, skeleton_key
 from Levenshtein import distance as levenshtein
@@ -33,6 +34,10 @@ with open('./data/universities.csv', 'r') as f:
     clusters = list(quickjoin(universities, distance=levenshtein, radius=2))
     print('QuickJoin (%i):' % len(clusters), timer() - start)
 
+    start = timer()
+    clusters = list(nn_descent(universities, distance=levenshtein, radius=2))
+    print('NN-Descent (%i):' % len(clusters), timer() - start)
+
     start = timer()
     clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
     print('Blocking (%i):' % len(clusters), timer() - start)
@@ -77,6 +82,10 @@ with open('./data/musicians.csv', 'r') as f:
     clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8))
     print('QuickJoin (%i):' % len(clusters), timer() - start)
 
+    start = timer()
+    clusters = list(nn_descent(artists, distance=levenshtein, radius=2))
+    print('NN-Descent (%i):' % len(clusters), timer() - start)
+
     start = timer()
     clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True))
     print('MinHash (%i):' % len(clusters), timer() - start)
diff --git a/fog/clustering/__init__.py b/fog/clustering/__init__.py
index ffdf6d9..f9dfac3 100644
--- a/fog/clustering/__init__.py
+++ b/fog/clustering/__init__.py
@@ -4,6 +4,7 @@ from fog.clustering.jaccard_intersection_index import (
 )
 from fog.clustering.key_collision import key_collision
 from fog.clustering.minhash import minhash
+from fog.clustering.nn_descent import nn_descent
 from fog.clustering.pairwise import (
     pairwise,
     pairwise_leader,
diff --git a/fog/clustering/blocking.py b/fog/clustering/blocking.py
index d899f06..21ad015 100644
--- a/fog/clustering/blocking.py
+++ b/fog/clustering/blocking.py
@@ -12,6 +12,7 @@ from fog.clustering.utils import make_similarity_function, clusters_from_pairs
 
 # TODO: max_block_size to avoid ngrams with high DF
 # TODO: worker using a VPTree
+# TODO: custom inner algorithm
 
 
 def blocking_worker(payload):
diff --git a/fog/clustering/nn_descent.py b/fog/clustering/nn_descent.py
new file mode 100644
index 0000000..c1ce560
--- /dev/null
+++ b/fog/clustering/nn_descent.py
@@ -0,0 +1,163 @@
+# =============================================================================
+# Fog NN-Descent Clustering
+# =============================================================================
+#
+# Implementation of the probabilistic NN-Descent algorithm able to build
+# an approximate k-nn graph from a dataset in subquadratic time.
+#
+# [Reference]:
+# Dong, Wei, Charikar Moses, and Kai Li. "Efficient K-Nearest Neighbor Graph
+# Construction for Generic Similarity Measures", p. 577. ACM Press, 2011.
+# https://doi.org/10.1145/1963405.1963487.
+#
+import heapq
+import random
+from fog.clustering.utils import clusters_from_pairs
+
+# TODO: implement the "full" version
+
+
+def sample(rng, N, k, i):
+    """
+    Function sampling k distinct indices from the range [0, N), excluding
+    the given index i. Note that this requires k < N to terminate.
+
+    """
+
+    S = set()
+
+    while len(S) < k:
+        random_index = rng.randint(0, N - 1)
+
+        if random_index == i:
+            continue
+
+        S.add(random_index)
+
+    return list(S)
+
+
+def reverse(B):
+    """
+    Returns the list of in-neighbors from the list of out-neighbors.
+
+    """
+    R = [[] for _ in range(len(B))]
+
+    for i, neighbors in enumerate(B):
+        for _, j in neighbors:
+            R[j].append(i)
+
+    return R
+
+
+def nn_descent(data, similarity=None, distance=None, k=5, radius=None,
+               min_size=2, max_size=float('inf'),
+               mode='connected_components',
+               seed=None):
+    """
+    Function returning an iterator over found clusters using the NN-Descent
+    algorithm.
+
+    The caveat of this algorithm is that one needs to increase k to improve
+    recall, but increasing k also drives the time complexity back towards
+    O(n^2).
+
+    Args:
+        data (iterable): Arbitrary iterable containing data points to gather
+            into clusters. Will be fully consumed.
+        k (number, optional): number of nearest neighbors to find per item.
+            Defaults to 5.
+        similarity (callable): If radius is specified, a function returning
+            the similarity between two points. Else, a function returning
+            whether two points should be deemed similar. Alternatively, one can
+            specify `distance` instead.
+        distance (callable): If radius is specified, a function returning
+            the distance between two points. Else, a function returning
+            whether two points should not be deemed similar. Alternatively, one
+            can specify `similarity` instead.
+        radius (number): produced clusters' radius. Required by the current
+            implementation.
+        min_size (number, optional): minimum number of items in a cluster for
+            it to be considered viable. Defaults to 2.
+        max_size (number, optional): maximum number of items in a cluster for
+            it to be considered viable. Defaults to infinity.
+        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
+            Defaults to 'connected_components'.
+        seed (number, optional): Seed for RNG. Defaults to None.
+
+    Yields:
+        list: A viable cluster.
+
+    """
+
+    # Seeding rng
+    rng = random.Random(seed)
+
+    # Inverting distance into a similarity, so that the rest of the code
+    # can assume that greater always means closer
+    if distance is not None:
+        similarity = lambda x, y: -distance(x, y)
+        radius = -radius
+
+    # Making data set into indexable list
+    if type(data) is not list:
+        data = list(data)
+
+    # Note that B & R could be flat arrays
+    V = data
+    B = []
+    N = len(V)
+
+    # Initial random samples. B[i] is a min-heap of (similarity, j) pairs,
+    # meaning the worst current neighbor of i always sits at B[i][0]
+    for i, item in enumerate(V):
+        neighbors = [(similarity(item, V[j]), j) for j in sample(rng, N, k, i)]
+        heapq.heapify(neighbors)
+        B.append(neighbors)
+
+    # c counts the updates made during an iteration. We stop descending
+    # as soon as the k-nn graph no longer changes.
+    c = 1
+
+    while c != 0:
+        R = reverse(B)
+        C = []
+
+        c = 0
+
+        # Gathering the general neighborhood of each node, i.e. the union
+        # of its out-neighbors and in-neighbors
+        for i, item in enumerate(V):
+            candidates = set(j for _, j in B[i])
+            candidates.update(R[i])
+
+            C.append(list(candidates))
+
+        # Local join: probing the neighborhoods of i's own neighborhood
+        # for closer candidates
+        for i in range(N):
+            BA = C[i]
+
+            for ii in BA:
+                BB = C[ii]
+
+                for jj in BB:
+
+                    if i == jj:
+                        continue
+
+                    s = similarity(V[i], V[jj])
+
+                    # Note: jj might already be among i's neighbors, in which
+                    # case a duplicate entry will consume one of the k slots
+                    if s > B[i][0][0]:
+                        c += 1
+                        heapq.heapreplace(B[i], (s, jj))
+
+    def clustering():
+        for i, neighbors in enumerate(B):
+            for s, j in neighbors:
+                if s >= radius:
+                    yield (i, j)
+
+    gen = clusters_from_pairs(
+        clustering(),
+        min_size=min_size,
+        max_size=max_size,
+        mode=mode,
+        fuzzy=True
+    )
+
+    for cluster in gen:
+        yield [V[i] for i in cluster]
diff --git a/fog/clustering/quickjoin.py b/fog/clustering/quickjoin.py
index 449f279..cf4f3f9 100644
--- a/fog/clustering/quickjoin.py
+++ b/fog/clustering/quickjoin.py
@@ -22,6 +22,8 @@ from multiprocessing import Pool
 
 from fog.clustering.utils import clusters_from_pairs
 
+# TODO: use a VPTree
+
 
 def partition(S, distance, p, radius, rho):
     L = []
@@ -92,6 +94,9 @@ def quickjoin(data, distance, radius, block_size=500,
     Function returning an iterator over found clusters using the QuickJoin
     algorithm.
 
+    Note that this algorithm returns the same results as an exhaustive
+    pairwise computation would.
+
     Args:
         data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
@@ -207,6 +212,7 @@
             for pairs in pool.imap_unordered(worker, pool_iter):
                 yield from pairs
 
+    # TODO: I think we need `fuzzy=True` here but cannot be sure
     yield from clusters_from_pairs(
         clustering() if processes == 1 else clustering_parallel(),
         min_size=min_size,
diff --git a/test/clustering/nn_descent_test.py b/test/clustering/nn_descent_test.py
new file mode 100644
index 0000000..a89f601
--- /dev/null
+++ b/test/clustering/nn_descent_test.py
@@ -0,0 +1,45 @@
+# =============================================================================
+# Fog NN-Descent Clustering Unit Tests
+# =============================================================================
+import csv
+from test.clustering.utils import Clusters
+from Levenshtein import distance as levenshtein
+from fog.clustering import nn_descent
+
+DATA = [
+    'Mister Hyde',
+    'Mister Hide',
+    'Claudia Loc',
+    'Cladia Loc'
+]
+
+CLUSTERS = Clusters([
+    ('Mister Hyde', 'Mister Hide'),
+    ('Claudia Loc', 'Cladia Loc')
+])
+
+UNIVERSITY_CLUSTERS = Clusters([
+    ('Universidad De Manila', 'Universidad de Manila'),
+    ('DePaul University', 'DePauw University'),
+    ('Seton Hall University', 'Seton Hill University'),
+    ('Baylor University', 'Taylor University')
+])
+
+with open('./data/universities.csv', 'r') as f:
+    UNIVERSITIES = set([line['university'] for line in csv.DictReader(f)])
+
+
+class TestNNDescent(object):
+    def test_basics(self):
+        clusters = Clusters(nn_descent(DATA, k=1, distance=levenshtein, radius=1, seed=123))
+
+        assert clusters == CLUSTERS
+
+    def test_universities(self):
+        clusters = Clusters(nn_descent(UNIVERSITIES, distance=levenshtein, radius=1, seed=123))
+
+        assert clusters == UNIVERSITY_CLUSTERS
+
+        # Running the algorithm again with the same seed should be
+        # deterministic and yield the very same clusters
+        same_seed_clusters = Clusters(nn_descent(UNIVERSITIES, distance=levenshtein, radius=1, seed=123))
+
+        assert same_seed_clusters == UNIVERSITY_CLUSTERS
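
For reference, a minimal usage sketch of the new `nn_descent` function, assembled from the unit test above; the expected clusters are the ones asserted in test_basics. Note that k must stay below len(data) for the initial sampling to terminate:

    from Levenshtein import distance as levenshtein
    from fog.clustering import nn_descent

    data = ['Mister Hyde', 'Mister Hide', 'Claudia Loc', 'Cladia Loc']

    # Yields lists of items lying within Levenshtein distance `radius`
    # of each other, found through an approximate k-nn graph rather than
    # exhaustive pairwise comparisons
    for cluster in nn_descent(data, k=1, distance=levenshtein, radius=1, seed=123):
        print(cluster)

    # Expected, per the unit tests:
    #   ['Mister Hyde', 'Mister Hide']
    #   ['Claudia Loc', 'Cladia Loc']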
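
Two small pieces of the implementation's logic, restated in isolation as a sanity check (a standalone sketch, not code from the repository): the distance-to-similarity inversion and the min-heap bookkeeping that keeps the k best neighbors.

    import heapq

    # (1) Thresholding a distance is the same as thresholding its negation:
    # d(x, y) <= radius  <=>  -d(x, y) >= -radius. This is why nn_descent
    # can internally treat any distance as a similarity.
    def distance(a, b):
        return abs(a - b)

    def similarity(a, b):
        return -distance(a, b)

    radius = -2  # negated alongside the distance

    assert (distance(3, 6) <= 2) == (similarity(3, 6) >= radius)  # both False
    assert (distance(3, 4) <= 2) == (similarity(3, 4) >= radius)  # both True

    # (2) A min-heap of (similarity, index) pairs keeps the k best neighbors
    # seen so far: the worst sits at neighbors[0] and is evicted whenever a
    # better candidate appears, exactly as in the descent loop.
    neighbors = [(-5, 0), (-3, 1), (-4, 2)]  # k = 3 initial samples
    heapq.heapify(neighbors)

    candidate = (-1, 3)
    if candidate[0] > neighbors[0][0]:
        heapq.heapreplace(neighbors, candidate)

    assert sorted(neighbors) == [(-4, 2), (-3, 1), (-1, 3)]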