mirror of https://github.com/Yomguithereal/fog.git
Drafting NN-Descent
commit 7014dccf32
parent bfc4241b3e

Makefile | 2 +-
Makefile:

@@ -11,7 +11,7 @@ clean:
 
 lint:
 	@echo Linting source code using pep8...
-	pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test
+	pycodestyle --ignore E501,E722,E731,E741,W503,W504 $(SOURCE) test
 	@echo
 
 unit:
@@ -2,6 +2,7 @@ import csv
 from functools import partial
 from timeit import default_timer as timer
 from fog.clustering import *
+from fog.metrics import jaccard_similarity
 from fog.tokenizers import ngrams
 from fog.key import fingerprint, omission_key, skeleton_key
 from Levenshtein import distance as levenshtein
@@ -33,6 +34,10 @@ with open('./data/universities.csv', 'r') as f:
     clusters = list(quickjoin(universities, distance=levenshtein, radius=2))
     print('QuickJoin (%i):' % len(clusters), timer() - start)
 
+    start = timer()
+    clusters = list(nn_descent(universities, distance=levenshtein, radius=2))
+    print('NN-Descent (%i):' % len(clusters), timer() - start)
+
     start = timer()
     clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
     print('Blocking (%i):' % len(clusters), timer() - start)
@@ -77,6 +82,10 @@ with open('./data/musicians.csv', 'r') as f:
     clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8))
     print('QuickJoin (%i):' % len(clusters), timer() - start)
 
+    start = timer()
+    clusters = list(nn_descent(artists, distance=levenshtein, radius=2))
+    print('NN-Descent (%i):' % len(clusters), timer() - start)
+
     start = timer()
     clusters = list(minhash(artists, radius=0.8, key=lambda x: list(ngrams(5, x)), use_numpy=True))
     print('MinHash (%i):' % len(clusters), timer() - start)
fog/clustering/__init__.py:

@@ -4,6 +4,7 @@ from fog.clustering.jaccard_intersection_index import (
 )
 from fog.clustering.key_collision import key_collision
 from fog.clustering.minhash import minhash
+from fog.clustering.nn_descent import nn_descent
 from fog.clustering.pairwise import (
     pairwise,
     pairwise_leader,
fog/clustering/blocking.py:

@@ -12,6 +12,7 @@ from fog.clustering.utils import make_similarity_function, clusters_from_pairs
 
 # TODO: max_block_size to avoid ngrams with high DF
 # TODO: worker using a VPTree
+# TODO: custom inner algorithm
 
 
 def blocking_worker(payload):
fog/clustering/nn_descent.py (new file):

@@ -0,0 +1,163 @@
+# =============================================================================
+# Fog NN-Descent Clustering
+# =============================================================================
+#
+# Implementation of the probabilistic NN-Descent algorithm able to build
+# an approximate k-nn graph from a dataset in subquadratic time.
+#
+# [Reference]:
+# Dong, Wei, Moses Charikar, and Kai Li. "Efficient K-Nearest Neighbor Graph
+# Construction for Generic Similarity Measures", p. 577. ACM Press, 2011.
+# https://doi.org/10.1145/1963405.1963487
+#
+import heapq
+import random
+from fog.clustering.utils import make_similarity_function, clusters_from_pairs
+
+# TODO: implement the "full" version
+
+
+def sample(rng, N, k, i):
+    """
+    Function sampling k indices from the range 0-N without the i index.
+    """
+    S = set()
+
+    while len(S) < k:
+        random_index = rng.randint(0, N - 1)
+
+        if random_index == i:
+            continue
+
+        S.add(random_index)
+
+    return list(S)
+
+
+def reverse(B):
+    """
+    Returns the list of in-neighbors from the list of out-neighbors.
+    """
+    R = [[] for _ in range(len(B))]
+
+    for i, neighbors in enumerate(B):
+        for _, j in neighbors:
+            R[j].append(i)
+
+    return R
+
+
+def nn_descent(data, similarity=None, distance=None, k=5, radius=None,
+               min_size=2, max_size=float('inf'),
+               mode='connected_components',
+               seed=None):
+    """
+    Function returning an iterator over found clusters using the NN-Descent
+    algorithm.
+
+    The issue with this algorithm is that one needs to increase k to improve
+    recall, but increasing k drives the time complexity towards O(n^2).
+
+    Args:
+        data (iterable): Arbitrary iterable containing data points to gather
+            into clusters. Will be fully consumed.
+        k (number, optional): number of nearest neighbors to find per item.
+            Defaults to 5.
+        similarity (callable): If radius is specified, a function returning
+            the similarity between two points. Else, a function returning
+            whether two points should be deemed similar. Alternatively, one
+            can specify `distance` instead.
+        distance (callable): If radius is specified, a function returning
+            the distance between two points. Else, a function returning
+            whether two points should not be deemed similar. Alternatively,
+            one can specify `similarity` instead.
+        radius (number, optional): produced clusters' radius.
+        min_size (number, optional): minimum number of items in a cluster for
+            it to be considered viable. Defaults to 2.
+        max_size (number, optional): maximum number of items in a cluster for
+            it to be considered viable. Defaults to infinity.
+        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
+            Defaults to 'connected_components'.
+        seed (number, optional): Seed for the RNG. Defaults to None.
+
+    Yields:
+        list: A viable cluster.
+    """
+
+    # Seeding the rng
+    rng = random.Random(seed)
+
+    # Inverting distance if needed, so we can always work with similarities
+    if distance is not None:
+        similarity = lambda x, y: -distance(x, y)
+        radius = -radius
+
+    # Making the data set into an indexable list
+    if type(data) is not list:
+        data = list(data)
+
+    # Note that B & R could be flat arrays
+    V = data
+    B = []
+    N = len(V)
+
+    # NOTE: draft helper, not used yet
+    def min_similarity_key(x):
+        return x[1][1]
+
+    # Initial samples: a random k-nn list per item, kept as a min-heap so the
+    # worst current neighbor always sits at index 0
+    for i, item in enumerate(V):
+        neighbors = [(similarity(item, V[j]), j) for j in sample(rng, N, k, i)]
+        heapq.heapify(neighbors)
+        B.append(neighbors)
+
+    c = 1
+
+    # Descent: iterate until no neighbor list can be improved anymore
+    while c != 0:
+        R = reverse(B)
+        C = []
+
+        c = 0
+
+        for i, item in enumerate(V):
+            candidates = set(j for _, j in B[i])
+            candidates.update(R[i])
+
+            C.append(list(candidates))
+
+        # Local join: neighbors of neighbors are likely to be neighbors
+        for i in range(N):
+            BA = C[i]
+
+            for ii in BA:
+                BB = C[ii]
+
+                for jj in BB:
+
+                    if i == jj:
+                        continue
+
+                    s = similarity(V[i], V[jj])
+
+                    if s > B[i][0][0]:
+                        c += 1
+                        heapq.heapreplace(B[i], (s, jj))
+
+    def clustering():
+        for i, neighbors in enumerate(B):
+            for s, j in neighbors:
+                if s >= radius:
+                    yield (i, j)
+
+    gen = clusters_from_pairs(
+        clustering(),
+        min_size=min_size,
+        max_size=max_size,
+        mode=mode,
+        fuzzy=True
+    )
+
+    for cluster in gen:
+        yield [V[i] for i in cluster]
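A minimal usage sketch of the function added above (hedged: the expected
output is taken from the unit tests further down; python-Levenshtein is
assumed installed, as in the benchmark):

    from Levenshtein import distance as levenshtein
    from fog.clustering import nn_descent

    names = ['Mister Hyde', 'Mister Hide', 'Claudia Loc', 'Cladia Loc']

    # radius applies to the distance here: pairs at Levenshtein distance <= 1
    # are deemed similar. Note that k must be at most len(names) - 1, or the
    # sampling step above cannot terminate (the default k=5 would hang on a
    # 4-item list). seed makes the probabilistic run reproducible.
    for cluster in nn_descent(names, k=2, distance=levenshtein, radius=1, seed=123):
        print(cluster)
    # Expected, per the unit tests: ['Mister Hyde', 'Mister Hide'] and
    # ['Claudia Loc', 'Cladia Loc']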
fog/clustering/quickjoin.py:

@@ -22,6 +22,8 @@ from multiprocessing import Pool
 
 from fog.clustering.utils import clusters_from_pairs
 
+# TODO: using vp_tree
+
 
 def partition(S, distance, p, radius, rho):
     L = []
@@ -92,6 +94,9 @@ def quickjoin(data, distance, radius, block_size=500,
     Function returning an iterator over found clusters using the QuickJoin
     algorithm.
 
+    Note that this algorithm returns the same result as pairwise computations
+    would.
+
     Args:
         data (iterable): Arbitrary iterable containing data points to gather
             into clusters. Will be fully consumed.
@@ -207,6 +212,7 @@ def quickjoin(data, distance, radius, block_size=500,
         for pairs in pool.imap_unordered(worker, pool_iter):
             yield from pairs
 
+    # TODO: I think we need to have `fuzzy=True` here but cannot be sure
     yield from clusters_from_pairs(
         clustering() if processes == 1 else clustering_parallel(),
         min_size=min_size,
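The docstring note added above can be sanity-checked directly. A small
sketch, assuming `pairwise` accepts the same distance/radius arguments as
`quickjoin` (its use alongside the other fog.clustering algorithms suggests
so):

    from fog.clustering import pairwise, quickjoin
    from Levenshtein import distance as levenshtein

    data = ['Mister Hyde', 'Mister Hide', 'Claudia Loc', 'Cladia Loc']

    def normalize(clusters):
        # Order-insensitive comparison: sort members, then sort clusters
        return sorted(tuple(sorted(c)) for c in clusters)

    # QuickJoin is exact, so it should find the same clusters as brute force
    assert normalize(quickjoin(data, distance=levenshtein, radius=1)) == \
        normalize(pairwise(data, distance=levenshtein, radius=1))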
test/clustering/ (new file, NN-Descent unit tests):

@@ -0,0 +1,45 @@
+# =============================================================================
+# Fog NN-Descent Clustering Unit Tests
+# =============================================================================
+import csv
+from test.clustering.utils import Clusters
+from Levenshtein import distance as levenshtein
+from fog.clustering import nn_descent
+
+DATA = [
+    'Mister Hyde',
+    'Mister Hide',
+    'Claudia Loc',
+    'Cladia Loc'
+]
+
+CLUSTERS = Clusters([
+    ('Mister Hyde', 'Mister Hide'),
+    ('Claudia Loc', 'Cladia Loc')
+])
+
+UNIVERSITY_CLUSTERS = Clusters([
+    ('Universidad De Manila', 'Universidad de Manila'),
+    ('DePaul University', 'DePauw University'),
+    ('Seton Hall University', 'Seton Hill University'),
+    ('Baylor University', 'Taylor University')
+])
+
+with open('./data/universities.csv', 'r') as f:
+    UNIVERSITIES = set([line['university'] for line in csv.DictReader(f)])
+
+
+class TestNNDescent(object):
+    def test_basics(self):
+        clusters = Clusters(nn_descent(DATA, k=1, distance=levenshtein, radius=1, seed=123))
+
+        assert clusters == CLUSTERS
+
+    def test_universities(self):
+        clusters = Clusters(nn_descent(UNIVERSITIES, distance=levenshtein, radius=1, seed=123))
+
+        assert clusters == UNIVERSITY_CLUSTERS
+
+        parallel_clusters = Clusters(nn_descent(UNIVERSITIES, distance=levenshtein, radius=1, seed=123))
+
+        assert parallel_clusters == UNIVERSITY_CLUSTERS
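Because NN-Descent is probabilistic, the exact-match assertions above only
hold for a fixed seed. A hedged sketch of how recall against the exact
pairwise result could be measured instead (`pair_set` is an illustrative
helper, not part of fog; DATA is the list from the tests above):

    from fog.clustering import nn_descent, pairwise
    from Levenshtein import distance as levenshtein

    def pair_set(clusters):
        # Flatten clusters into a set of unordered similar pairs
        pairs = set()
        for cluster in clusters:
            for i in range(len(cluster)):
                for j in range(i + 1, len(cluster)):
                    pairs.add(tuple(sorted((cluster[i], cluster[j]))))
        return pairs

    exact = pair_set(pairwise(DATA, distance=levenshtein, radius=1))
    approx = pair_set(nn_descent(DATA, k=2, distance=levenshtein, radius=1))

    # Fraction of true pairs recovered by the approximate k-nn graph
    recall = len(approx & exact) / len(exact) if exact else 1.0
    print('recall =', recall)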