mirror of https://github.com/Yomguithereal/fog.git
Drafting quickjoin
This commit is contained in:
parent
6382d51d30
commit
ef95edccd2
|
@ -29,6 +29,10 @@ with open('./data/universities.csv', 'r') as f:
|
|||
clusters = list(vp_tree(universities, distance=levenshtein, radius=2))
|
||||
print('VPTree (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(quickjoin(universities, distance=levenshtein, radius=2))
|
||||
print('QuickJoin (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
|
||||
print('Blocking (%i):' % len(clusters), timer() - start)
|
||||
|
@ -69,6 +73,10 @@ with open('./data/musicians.csv', 'r') as f:
|
|||
clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2))
|
||||
print('SNM Skeleton (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(quickjoin(artists, distance=levenshtein, radius=2))
|
||||
print('QuickJoin (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8))
|
||||
print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start)
|
||||
|
|
|
@ -10,5 +10,6 @@ from fog.clustering.pairwise import (
|
|||
pairwise_fuzzy_clusters,
|
||||
pairwise_connected_components
|
||||
)
|
||||
from fog.clustering.quickjoin import quickjoin
|
||||
from fog.clustering.sorted_neighborhood import sorted_neighborhood
|
||||
from fog.clustering.vp_tree import vp_tree
|
||||
|
|
|
@ -11,6 +11,7 @@ from multiprocessing import Pool
|
|||
from fog.clustering.utils import make_similarity_function, clusters_from_pairs
|
||||
|
||||
# TODO: max_block_size to avoid ngrams with high DF
|
||||
# TODO: worker using a VPTree
|
||||
|
||||
|
||||
def blocking_worker(payload):
|
||||
|
|
|
@ -363,6 +363,7 @@ def pairwise_connected_components(data, similarity=None, distance=None, radius=N
|
|||
for i, j in matches:
|
||||
sets.union(i, j)
|
||||
|
||||
# TODO: Should really be using the sparse version
|
||||
for component in sets.components(min_size=min_size, max_size=max_size):
|
||||
yield [data[i] for i in component]
|
||||
|
||||
|
|
|
@ -0,0 +1,177 @@
|
|||
# =============================================================================
|
||||
# Fog QuickJoin Clustering
|
||||
# =============================================================================
|
||||
#
|
||||
# Implementation of the Quick Join algorithm that works by recursively
|
||||
# partitioning the given data with regard to the triangle inequality in
|
||||
# order to reduce the amount of necessary distance computations.
|
||||
#
|
||||
# [Reference]:
|
||||
# Jacox, Edwin H., and Hanan Samet. "Metric Space Similarity Joins".
|
||||
# ACM Transactions on Database Systems 33, no. 2 (June 1, 2008): 1-38.
|
||||
# https://doi.org/10.1145/1366102.1366104.
|
||||
#
|
||||
# Fredriksson K., Braithwaite B. (2013) Quicker Similarity Joins in Metric
|
||||
# Spaces. In: Brisaboa N., Pedreira O., Zezula P. (eds) Similarity Search and
|
||||
# Applications. SISAP 2013. Lecture Notes in Computer Science, vol 8199.
|
||||
# Springer, Berlin, Heidelberg
|
||||
#
|
||||
import random
|
||||
from fog.clustering.utils import clusters_from_pairs
|
||||
|
||||
|
||||
def partition(S, distance, p, radius, rho):
    """
    Function splitting the given items around a pivot, with regard to their
    distance to this pivot.

    Args:
        S (iterable): Items to split.
        distance (callable): Distance function, called as distance(p, item).
        p (any): The pivot.
        radius (number): Width of the window kept on each side of rho.
        rho (number): Distance threshold separating both sides.

    Returns:
        tuple: (L, G, Lw, Gw) where L holds the items whose distance to the
            pivot is < rho, G holds the other ones, Lw holds the items of L
            whose distance is >= rho - radius and Gw holds the items of G
            whose distance is <= rho + radius.

    """
    inner, outer = [], []
    inner_window, outer_window = [], []

    # Bounds of the window straddling the rho frontier
    lower_bound = rho - radius
    upper_bound = rho + radius

    for candidate in S:
        gap = distance(p, candidate)

        if gap < rho:
            inner.append(candidate)
            window, in_window = inner_window, lower_bound <= gap
        else:
            outer.append(candidate)
            window, in_window = outer_window, gap <= upper_bound

        if in_window:
            window.append(candidate)

    return inner, outer, inner_window, outer_window
|
||||
|
||||
|
||||
def quickjoin_bruteforce(S1, S2, distance, radius):
    """
    Function yielding every pair made of one item from S1 and one item from
    S2 whose distance does not exceed the given radius.

    Args:
        S1 (iterable): First collection of items.
        S2 (iterable): Second collection of items. It is iterated once per
            item of S1 and must therefore be re-iterable (e.g. a list).
        distance (callable): Distance function, called as distance(A, B).
        radius (number): Maximum distance for a pair to be yielded.

    Yields:
        tuple: A matching (A, B) pair, A coming from S1 and B from S2.

    """
    for A in S1:
        for B in S2:
            if distance(A, B) <= radius:
                yield (A, B)
|
||||
|
||||
|
||||
def quickjoin_self_bruteforce(S, distance, radius):
    """
    Function yielding every unordered pair of items from S whose distance
    does not exceed the given radius. An item is never paired with itself
    and each pair is tested only once.

    Args:
        S (iterable): Collection of items.
        distance (callable): Distance function, called as distance(A, B).
        radius (number): Maximum distance for a pair to be yielded.

    Yields:
        tuple: A matching (A, B) pair, A appearing before B in S.

    """
    from itertools import combinations

    # combinations emits (S[i], S[j]) for i < j, in lexicographic index order
    for A, B in combinations(S, 2):
        if distance(A, B) <= radius:
            yield (A, B)
|
||||
|
||||
|
||||
def quickjoin(data, distance, radius, block_size=500,
              min_size=2, max_size=float('inf'),
              mode='connected_components',
              seed=None):
    """
    Function returning an iterator over found clusters using the QuickJoin
    algorithm.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        distance (callable): The distance function to use. Must be a true
            metric, e.g. the Levenshtein distance.
        radius (number): produced clusters' radius.
        block_size (number, optional): block size where the algorithm will
            switch to brute. Values below 2 are treated as 2.
            Defaults to 500.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.
        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
            Defaults to 'connected_components'.
        seed (number, optional): Seed for RNG. Defaults to None.

    Yields:
        list: A viable cluster.

    """

    rng = random.Random(seed)

    if not isinstance(data, list):
        data = list(data)

    # Two distinct pivot positions must exist before partitioning, else the
    # pivot draw below would never end: smaller blocks go to brute force.
    block_size = max(block_size, 2)

    def draw_pivot(S1, S2):
        """
        Randomly pick a pivot among S1 + S2 and the rho threshold to
        partition with. Returned rho is None when every item sits at
        distance 0 from the pivot.
        """
        N1 = len(S1)
        N = N1 + len(S2)

        # Randomly selecting pivots. They must sit at different positions
        i = rng.randint(0, N - 1)
        j = i

        while j == i:
            j = rng.randint(0, N - 1)

        p1 = S1[i] if i < N1 else S2[i - N1]
        p2 = S1[j] if j < N1 else S2[j - N1]

        rho = distance(p1, p2)

        if rho > 0:
            return p1, rho

        # Both pivots are duplicates: partitioning with rho = 0 would send
        # every item to the same side and make no progress. Falling back to
        # the first item actually differing from the pivot.
        for S in (S1, S2):
            for item in S:
                d = distance(p1, item)

                if d > 0:
                    return p1, d

        return p1, None

    def clustering():
        # Explicit stack standing for the recursion. A (S, None) entry is a
        # self-join of S, a (S1, S2) entry is a join between both sets.
        stack = [(data, None)]

        while stack:
            S1, S2 = stack.pop()

            # QuickJoin procedure
            if S2 is None:
                S = S1

                if len(S) <= block_size:
                    yield from quickjoin_self_bruteforce(S, distance, radius)
                    continue

                p, rho = draw_pivot(S, ())

                # Only duplicates of the pivot remain (the distance being a
                # true metric): nothing left to partition
                if rho is None:
                    yield from quickjoin_self_bruteforce(S, distance, radius)
                    continue

                L, G, Lw, Gw = partition(S, distance, p, radius, rho)

                # Recursion
                stack.append((G, None))
                stack.append((L, None))
                stack.append((Lw, Gw))

            # QuickJoinWin procedure
            else:

                # No pair can be drawn from an empty side
                if not S1 or not S2:
                    continue

                if len(S1) + len(S2) <= block_size:
                    yield from quickjoin_bruteforce(S1, S2, distance, radius)
                    continue

                p, rho = draw_pivot(S1, S2)

                if rho is None:
                    yield from quickjoin_bruteforce(S1, S2, distance, radius)
                    continue

                L1, G1, Lw1, Gw1 = partition(S1, distance, p, radius, rho)
                L2, G2, Lw2, Gw2 = partition(S2, distance, p, radius, rho)

                stack.append((L1, L2))
                stack.append((G1, G2))
                stack.append((Lw1, Gw2))
                stack.append((Gw1, Lw2))

    yield from clusters_from_pairs(
        clustering(),
        min_size=min_size,
        max_size=max_size,
        mode=mode,
        fuzzy=True  # TODO: Reconsider when using SparseSets
    )
|
|
@ -0,0 +1,41 @@
|
|||
# =============================================================================
|
||||
# Fog QuickJoin Clustering Unit Tests
|
||||
# =============================================================================
|
||||
import csv
|
||||
from test.clustering.utils import Clusters
|
||||
from Levenshtein import distance as levenshtein
|
||||
from fog.clustering import quickjoin
|
||||
|
||||
# Small sample where each expected cluster gathers two near-identical names
DATA = [
    'Mister Hyde',
    'Mister Hide',
    'Claudia Loc',
    'Cladia Loc'
]

# Clusters expected from DATA
CLUSTERS = Clusters([
    ('Mister Hyde', 'Mister Hide'),
    ('Claudia Loc', 'Cladia Loc')
])

# Clusters expected from the universities dataset
UNIVERSITY_CLUSTERS = Clusters([
    ('Universidad De Manila', 'Universidad de Manila'),
    ('DePaul University', 'DePauw University'),
    ('Seton Hall University', 'Seton Hill University'),
    ('Baylor University', 'Taylor University')
])

with open('./data/universities.csv', 'r') as f:
    # Set comprehension: deduplicates names without building a throwaway list
    UNIVERSITIES = {line['university'] for line in csv.DictReader(f)}
|
||||
|
||||
|
||||
class TestQuickJoin(object):
    """Unit tests of the quickjoin clustering function."""

    def test_basics(self):
        # Hand-written sample, clustered with a Levenshtein radius of 1
        found = quickjoin(DATA, distance=levenshtein, radius=1)

        assert Clusters(found) == CLUSTERS

    def test_universities(self):
        # Same settings, applied to the universities dataset
        found = quickjoin(UNIVERSITIES, distance=levenshtein, radius=1)

        assert Clusters(found) == UNIVERSITY_CLUSTERS
|
Loading…
Reference in New Issue