Drafting quickjoin

This commit is contained in:
Yomguithereal 2018-07-11 17:58:25 +02:00
parent 6382d51d30
commit ef95edccd2
6 changed files with 229 additions and 0 deletions

View File

@ -29,6 +29,10 @@ with open('./data/universities.csv', 'r') as f:
clusters = list(vp_tree(universities, distance=levenshtein, radius=2))
print('VPTree (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(quickjoin(universities, distance=levenshtein, radius=2))
print('QuickJoin (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
print('Blocking (%i):' % len(clusters), timer() - start)
@ -69,6 +73,10 @@ with open('./data/musicians.csv', 'r') as f:
clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2))
print('SNM Skeleton (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(quickjoin(artists, distance=levenshtein, radius=2))
print('QuickJoin (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8))
print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start)

View File

@ -10,5 +10,6 @@ from fog.clustering.pairwise import (
pairwise_fuzzy_clusters,
pairwise_connected_components
)
from fog.clustering.quickjoin import quickjoin
from fog.clustering.sorted_neighborhood import sorted_neighborhood
from fog.clustering.vp_tree import vp_tree

View File

@ -11,6 +11,7 @@ from multiprocessing import Pool
from fog.clustering.utils import make_similarity_function, clusters_from_pairs
# TODO: max_block_size to avoid ngrams with high DF
# TODO: worker using a VPTree
def blocking_worker(payload):

View File

@ -363,6 +363,7 @@ def pairwise_connected_components(data, similarity=None, distance=None, radius=N
for i, j in matches:
sets.union(i, j)
# TODO: Should really be using the sparse version
for component in sets.components(min_size=min_size, max_size=max_size):
yield [data[i] for i in component]

177
fog/clustering/quickjoin.py Normal file
View File

@ -0,0 +1,177 @@
# =============================================================================
# Fog QuickJoin Clustering
# =============================================================================
#
# Implementation of the QuickJoin algorithm that works by recursively
# partitioning the given data with regard to the triangle inequality in
# order to reduce the amount of necessary distance computations.
#
# [Reference]:
# Jacox, Edwin H., et Hanan Samet. « Metric Space Similarity Joins ».
# ACM Transactions on Database Systems 33, no 2 (1 juin 2008): 138.
# https://doi.org/10.1145/1366102.1366104.
#
# Fredriksson K., Braithwaite B. (2013) Quicker Similarity Joins in Metric
# Spaces. In: Brisaboa N., Pedreira O., Zezula P. (eds) Similarity Search and
# Applications. SISAP 2013. Lecture Notes in Computer Science, vol 8199.
# Springer, Berlin, Heidelberg
#
import random
from fog.clustering.utils import clusters_from_pairs
def partition(S, distance, p, radius, rho):
    """Split S around pivot p at threshold rho, collecting window sets too.

    Returns (L, G, Lw, Gw): L holds the items strictly closer than rho to p,
    G the remaining ones, while Lw/Gw keep the members of L/G falling within
    `radius` of the rho boundary, i.e. the "window" whose pairs may still
    match across the split.
    """
    inner = []
    outer = []
    inner_window = []
    outer_window = []

    low = rho - radius
    high = rho + radius

    for element in S:
        d = distance(p, element)

        if d < rho:
            inner.append(element)

            # Close enough to the boundary to match something beyond it?
            if d >= low:
                inner_window.append(element)
        else:
            outer.append(element)

            if d <= high:
                outer_window.append(element)

    return inner, outer, inner_window, outer_window
def quickjoin_bruteforce(S1, S2, distance, radius):
    """Yield every cross pair (a, b) from S1 x S2 whose distance <= radius."""
    for a in S1:
        for b in S2:
            if distance(a, b) <= radius:
                yield (a, b)
def quickjoin_self_bruteforce(S, distance, radius):
    """Yield every unordered pair (a, b) from S whose distance <= radius."""
    for i, a in enumerate(S):
        # Only look forward so each pair is produced exactly once.
        for b in S[i + 1:]:
            if distance(a, b) <= radius:
                yield (a, b)
def quickjoin(data, distance, radius, block_size=500,
              min_size=2, max_size=float('inf'),
              mode='connected_components',
              seed=None):
    """
    Function returning an iterator over found clusters using the QuickJoin
    algorithm.

    The algorithm recursively partitions the points around random pivots,
    relying on the triangle inequality to prune distance computations, and
    switches to a quadratic brute-force join on small enough blocks.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        distance (callable): The distance function to use. Must be a true
            metric, e.g. the Levenshtein distance.
        radius (number, optional): produced clusters' radius.
        block_size (number, optional): block size where the algorithm will
            switch to brute force. Defaults to 500.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.
        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
            Defaults to 'connected_components'.
        seed (number, optional): Seed for RNG. Defaults to None.

    Yields:
        list: A viable cluster.

    """

    rng = random.Random(seed)

    if type(data) is not list:
        data = list(data)

    def clustering():
        stack = [(data, None)]

        while len(stack) != 0:
            S1, S2 = stack.pop()

            # QuickJoin procedure
            if S2 is None:
                S = S1
                N = len(S)

                if N <= block_size:
                    yield from quickjoin_self_bruteforce(S, distance, radius)
                    continue

                # Randomly selecting pivots. They must be different
                p1 = rng.randint(0, N - 1)
                p2 = None

                while p2 is None or p1 == p2:
                    p2 = rng.randint(0, N - 1)

                p1 = S[p1]
                p2 = S[p2]

                rho = distance(p1, p2)

                # If the two pivots have equal values (rho == 0), the
                # partition would put every item in G (no metric distance is
                # < 0) and the block would be pushed back unchanged, looping
                # forever on e.g. a block made only of duplicates. Fall back
                # to brute force to guarantee termination.
                if rho == 0:
                    yield from quickjoin_self_bruteforce(S, distance, radius)
                    continue

                L, G, Lw, Gw = partition(S, distance, p1, radius, rho)

                # Recursion
                stack.append((G, None))
                stack.append((L, None))
                stack.append((Lw, Gw))

            # QuickJoinWin procedure
            else:
                N1 = len(S1)
                N2 = len(S2)
                N = N1 + N2

                if N <= block_size:
                    yield from quickjoin_bruteforce(S1, S2, distance, radius)
                    continue

                # Pivots are drawn from the union of both blocks
                p1 = rng.randint(0, N - 1)
                p2 = None

                while p2 is None or p1 == p2:
                    p2 = rng.randint(0, N - 1)

                p1 = S1[p1] if p1 < N1 else S2[p1 - N1]
                p2 = S1[p2] if p2 < N1 else S2[p2 - N1]

                rho = distance(p1, p2)

                # Same guard as above: equal pivots cannot split the blocks.
                if rho == 0:
                    yield from quickjoin_bruteforce(S1, S2, distance, radius)
                    continue

                L1, G1, Lw1, Gw1 = partition(S1, distance, p1, radius, rho)
                L2, G2, Lw2, Gw2 = partition(S2, distance, p1, radius, rho)

                stack.append((L1, L2))
                stack.append((G1, G2))
                stack.append((Lw1, Gw2))
                stack.append((Gw1, Lw2))

    yield from clusters_from_pairs(
        clustering(),
        min_size=min_size,
        max_size=max_size,
        mode=mode,
        fuzzy=True  # TODO: Reconsider when using SparseSets
    )

View File

@ -0,0 +1,41 @@
# =============================================================================
# Fog QuickJoin Clustering Unit Tests
# =============================================================================
import csv
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import quickjoin
# Small toy dataset: two pairs of strings one edit apart.
DATA = [
    'Mister Hyde',
    'Mister Hide',
    'Claudia Loc',
    'Cladia Loc'
]

# Expected clusters for DATA at radius=1.
CLUSTERS = Clusters([
    ('Mister Hyde', 'Mister Hide'),
    ('Claudia Loc', 'Cladia Loc')
])

# Expected clusters for the universities fixture at radius=1.
UNIVERSITY_CLUSTERS = Clusters([
    ('Universidad De Manila', 'Universidad de Manila'),
    ('DePaul University', 'DePauw University'),
    ('Seton Hall University', 'Seton Hill University'),
    ('Baylor University', 'Taylor University')
])

# NOTE: path is relative to the repository root — tests must be run from there.
with open('./data/universities.csv', 'r') as f:
    UNIVERSITIES = set([line['university'] for line in csv.DictReader(f)])
class TestQuickJoin(object):
    """Unit tests for the quickjoin clustering routine."""

    def test_basics(self):
        # Radius 1 should pair each single-edit variant with its sibling.
        clusters = Clusters(quickjoin(DATA, distance=levenshtein, radius=1))
        assert clusters == CLUSTERS

    def test_universities(self):
        # Larger real-world fixture loaded from the CSV file above.
        clusters = Clusters(quickjoin(UNIVERSITIES, distance=levenshtein, radius=1))
        assert clusters == UNIVERSITY_CLUSTERS