Drafting NN-Descent

This commit is contained in:
Yomguithereal 2018-07-12 15:01:35 +02:00
parent bfc4241b3e
commit 7014dccf32
7 changed files with 226 additions and 1 deletion

View File

@@ -11,7 +11,7 @@ clean:
lint:
	@echo Linting source code using pep8...
-	pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test
+	pycodestyle --ignore E501,E722,E731,E741,W503,W504 $(SOURCE) test
	@echo

unit:

View File

@@ -2,6 +2,7 @@ import csv
from functools import partial
from timeit import default_timer as timer
from fog.clustering import *
from fog.metrics import jaccard_similarity
from fog.tokenizers import ngrams
from fog.key import fingerprint, omission_key, skeleton_key
from Levenshtein import distance as levenshtein
@@ -33,6 +34,10 @@ with open('./data/universities.csv', 'r') as f:
    clusters = list(quickjoin(universities, distance=levenshtein, radius=2))
    print('QuickJoin (%i):' % len(clusters), timer() - start)

    start = timer()
    clusters = list(nn_descent(universities, distance=levenshtein, radius=2))
    print('NN-Descent (%i):' % len(clusters), timer() - start)

    start = timer()
    clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
    print('Blocking (%i):' % len(clusters), timer() - start)
@@ -77,6 +82,10 @@ with open('./data/musicians.csv', 'r') as f:
    clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8))
    print('QuickJoin (%i):' % len(clusters), timer() - start)

    start = timer()
    clusters = list(nn_descent(artists, distance=levenshtein, radius=2))
    print('NN-Descent (%i):' % len(clusters), timer() - start)

    start = timer()
    clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True))
    print('MinHash (%i):' % len(clusters), timer() - start)

View File

@@ -4,6 +4,7 @@ from fog.clustering.jaccard_intersection_index import (
)
from fog.clustering.key_collision import key_collision
from fog.clustering.minhash import minhash
from fog.clustering.nn_descent import nn_descent
from fog.clustering.pairwise import (
    pairwise,
    pairwise_leader,

View File

@@ -12,6 +12,7 @@ from fog.clustering.utils import make_similarity_function, clusters_from_pairs
# TODO: max_block_size to avoid ngrams with high DF
# TODO: worker using a VPTree
# TODO: custom inner algorithm


def blocking_worker(payload):

View File

@@ -0,0 +1,163 @@
# =============================================================================
# Fog NN-Descent Clustering
# =============================================================================
#
# Implementation of the probabilistic NN-Descent algorithm, able to build
# an approximate k-nn graph from a dataset in subquadratic time (the paper
# reports an empirical complexity of roughly O(n^1.14)).
#
# [Reference]:
# Dong, Wei, Moses Charikar, and Kai Li. "Efficient K-Nearest Neighbor Graph
# Construction for Generic Similarity Measures." In Proceedings of the 20th
# International Conference on World Wide Web, 577-586. ACM Press, 2011.
# https://doi.org/10.1145/1963405.1963487
#
import heapq
import random

from fog.clustering.utils import make_similarity_function, clusters_from_pairs

# TODO: implement the "full" version
def sample(rng, N, k, i):
    """
    Function sampling k distinct indices from the range [0, N), excluding
    the index i, by simple rejection sampling.
    """
    S = set()

    while len(S) < k:
        # Note that random.Random.randint bounds are inclusive
        random_index = rng.randint(0, N - 1)

        if random_index == i:
            continue

        S.add(random_index)

    return list(S)
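
# For instance, sample(random.Random(123), 10, 3, 7) returns a list of 3
# distinct indices drawn from range(10), none of which is 7.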
def reverse(B):
    """
    Function returning the list of in-neighbors from the list of
    out-neighbors.
    """
    R = [[] for _ in range(len(B))]

    for i, neighbors in enumerate(B):
        for _, j in neighbors:
            R[j].append(i)

    return R
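
# For instance, with k = 1 and out-neighbors B = [[(0.5, 1)], [(0.7, 0)], [(0.2, 0)]],
# node 0 is pointed to by nodes 1 & 2, so that reverse(B) == [[1, 2], [0], []].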
def nn_descent(data, similarity=None, distance=None, k=5, radius=None,
               min_size=2, max_size=float('inf'),
               mode='connected_components',
               seed=None):
    """
    Function returning an iterator over found clusters using the NN-Descent
    algorithm.

    The caveat of this algorithm is that one needs to increase k to improve
    recall, but increasing k also drives the time complexity back towards
    O(n^2).

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        k (number, optional): number of nearest neighbors to find per item.
            Defaults to 5.
        similarity (callable): If radius is specified, a function returning
            the similarity between two points. Else, a function returning
            whether two points should be deemed similar. Alternatively, one
            can specify `distance` instead.
        distance (callable): If radius is specified, a function returning
            the distance between two points. Else, a function returning
            whether two points should not be deemed similar. Alternatively,
            one can specify `similarity` instead.
        radius (number): produced clusters' radius. Note that the current
            draft requires it.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.
        mode (string, optional): 'fuzzy_clusters' or 'connected_components'.
            Defaults to 'connected_components'.
        seed (number, optional): Seed for the RNG. Defaults to None.

    Yields:
        list: A viable cluster.
    """
    # Seeding rng
    rng = random.Random(seed)

    # Inverting the distance if needed: negating a distance yields a
    # similarity inducing the same ordering, hence the radius must also
    # be negated
    if distance is not None:
        similarity = lambda x, y: -distance(x, y)
        radius = -radius

    # Making the data set into an indexable list
    if type(data) is not list:
        data = list(data)

    # Note that B & R could be flat arrays
    V = data
    B = []
    N = len(V)

    # NOTE: this helper is currently unused in this draft
    def min_similarity_key(x):
        return x[1][1]

    # Initial samples
    for i, item in enumerate(V):
        neighbors = [(similarity(item, V[j]), j) for j in sample(rng, N, k, i)]
        heapq.heapify(neighbors)
        B.append(neighbors)
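
    # Each B[i] is hence a min-heap of (similarity, j) tuples, meaning that
    # B[i][0] is always the worst of the current k approximate neighbors and
    # can be cheaply replaced when a better candidate is found below.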
    c = 1

    # Iterating until the knn graph converges, i.e. until no update is made
    while c != 0:
        R = reverse(B)
        C = []
        c = 0

        # Gathering, for each node, its current neighbors & reverse neighbors
        for i, item in enumerate(V):
            candidates = set(j for _, j in B[i])
            candidates.update(R[i])
            C.append(list(candidates))

        # Local join: comparing each node to its neighbors' own neighbors
        for i in range(N):
            BA = C[i]

            for ii in BA:
                BB = C[ii]

                for jj in BB:
                    if i == jj:
                        continue

                    s = similarity(V[i], V[jj])

                    # Better than our current worst neighbor?
                    # TODO: guard against inserting a node already in B[i]
                    if s > B[i][0][0]:
                        c += 1
                        heapq.heapreplace(B[i], (s, jj))
    # Finally emitting pairs of nodes whose similarity clears the radius
    def clustering():
        for i, neighbors in enumerate(B):
            for s, j in neighbors:
                if s >= radius:
                    yield (i, j)

    gen = clusters_from_pairs(
        clustering(),
        min_size=min_size,
        max_size=max_size,
        mode=mode,
        fuzzy=True
    )

    for cluster in gen:
        yield [V[i] for i in cluster]
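
For reference, a minimal usage sketch of the function added above, mirroring the unit tests at the end of this commit (data & parameters are illustrative):

from Levenshtein import distance as levenshtein
from fog.clustering import nn_descent

# Small illustrative dataset, so k = 1 is enough
data = ['Mister Hyde', 'Mister Hide', 'Claudia Loc', 'Cladia Loc']

# Clusters of items whose Levenshtein distance is at most 1
for cluster in nn_descent(data, k=1, distance=levenshtein, radius=1, seed=123):
    print(cluster)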

View File

@@ -22,6 +22,8 @@ from multiprocessing import Pool
from fog.clustering.utils import clusters_from_pairs
# TODO: using vp_tree
def partition(S, distance, p, radius, rho):
    L = []
@@ -92,6 +94,9 @@ def quickjoin(data, distance, radius, block_size=500,
    Function returning an iterator over found clusters using the QuickJoin
    algorithm.

    Note that this algorithm returns the same result as pairwise computations
    would.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
@@ -207,6 +212,7 @@ def quickjoin(data, distance, radius, block_size=500,
            for pairs in pool.imap_unordered(worker, pool_iter):
                yield from pairs

    # TODO: I think we need to have `fuzzy=True` here but cannot be sure
    yield from clusters_from_pairs(
        clustering() if processes == 1 else clustering_parallel(),
        min_size=min_size,
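
As a sanity check, the docstring's claim above can be verified by comparing QuickJoin's output to brute-force pairwise computation; a minimal sketch, assuming fog's pairwise function accepts the same distance/radius keyword arguments as quickjoin:

from Levenshtein import distance as levenshtein
from fog.clustering import pairwise, quickjoin

data = ['Mister Hyde', 'Mister Hide', 'Claudia Loc', 'Cladia Loc']

def normalize(clusters):
    # Clusters are lists of items: sorting them makes the comparison
    # order-insensitive
    return set(tuple(sorted(c)) for c in clusters)

assert normalize(pairwise(data, distance=levenshtein, radius=1)) == \
    normalize(quickjoin(data, distance=levenshtein, radius=1))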

View File

@@ -0,0 +1,45 @@
# =============================================================================
# Fog NN-Descent Clustering Unit Tests
# =============================================================================
import csv

from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein

from fog.clustering import nn_descent

DATA = [
    'Mister Hyde',
    'Mister Hide',
    'Claudia Loc',
    'Cladia Loc'
]

CLUSTERS = Clusters([
    ('Mister Hyde', 'Mister Hide'),
    ('Claudia Loc', 'Cladia Loc')
])

UNIVERSITY_CLUSTERS = Clusters([
    ('Universidad De Manila', 'Universidad de Manila'),
    ('DePaul University', 'DePauw University'),
    ('Seton Hall University', 'Seton Hill University'),
    ('Baylor University', 'Taylor University')
])

with open('./data/universities.csv', 'r') as f:
    UNIVERSITIES = set([line['university'] for line in csv.DictReader(f)])
class TestNNDescent(object):
    def test_basics(self):
        clusters = Clusters(nn_descent(DATA, k=1, distance=levenshtein, radius=1, seed=123))

        assert clusters == CLUSTERS

    def test_universities(self):
        clusters = Clusters(nn_descent(UNIVERSITIES, distance=levenshtein, radius=1, seed=123))

        assert clusters == UNIVERSITY_CLUSTERS