From ef95edccd20e6c73bafcfab9dfecc1ffa8585c22 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Wed, 11 Jul 2018 17:58:25 +0200 Subject: [PATCH] Drafting quickjoin --- experiments/benchmark.py | 8 ++ fog/clustering/__init__.py | 1 + fog/clustering/blocking.py | 1 + fog/clustering/pairwise.py | 1 + fog/clustering/quickjoin.py | 177 ++++++++++++++++++++++++++++++ test/clustering/quickjoin_test.py | 41 +++++++ 6 files changed, 229 insertions(+) create mode 100644 fog/clustering/quickjoin.py create mode 100644 test/clustering/quickjoin_test.py diff --git a/experiments/benchmark.py b/experiments/benchmark.py index 730476e..d2f0a22 100644 --- a/experiments/benchmark.py +++ b/experiments/benchmark.py @@ -29,6 +29,10 @@ with open('./data/universities.csv', 'r') as f: clusters = list(vp_tree(universities, distance=levenshtein, radius=2)) print('VPTree (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(quickjoin(universities, distance=levenshtein, radius=2)) + print('QuickJoin (%i):' % len(clusters), timer() - start) + start = timer() clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2)) print('Blocking (%i):' % len(clusters), timer() - start) @@ -69,6 +73,10 @@ with open('./data/musicians.csv', 'r') as f: clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2)) print('SNM Skeleton (%i):' % len(clusters), timer() - start) + start = timer() + clusters = list(quickjoin(artists, distance=levenshtein, radius=2)) + print('QuickJoin (%i):' % len(clusters), timer() - start) + start = timer() clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8)) print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start) diff --git a/fog/clustering/__init__.py b/fog/clustering/__init__.py index 0eaea75..ffdf6d9 100644 --- a/fog/clustering/__init__.py +++ b/fog/clustering/__init__.py @@ -10,5 +10,6 @@ from fog.clustering.pairwise import ( 
pairwise_fuzzy_clusters, pairwise_connected_components ) +from fog.clustering.quickjoin import quickjoin from fog.clustering.sorted_neighborhood import sorted_neighborhood from fog.clustering.vp_tree import vp_tree diff --git a/fog/clustering/blocking.py b/fog/clustering/blocking.py index 388e61c..c96522a 100644 --- a/fog/clustering/blocking.py +++ b/fog/clustering/blocking.py @@ -11,6 +11,7 @@ from multiprocessing import Pool from fog.clustering.utils import make_similarity_function, clusters_from_pairs # TODO: max_block_size to avoid ngrams with high DF +# TODO: worker using a VPTree def blocking_worker(payload): diff --git a/fog/clustering/pairwise.py b/fog/clustering/pairwise.py index 36e2772..ad0099c 100644 --- a/fog/clustering/pairwise.py +++ b/fog/clustering/pairwise.py @@ -363,6 +363,7 @@ def pairwise_connected_components(data, similarity=None, distance=None, radius=N for i, j in matches: sets.union(i, j) + # TODO: Should really be using the sparse version for component in sets.components(min_size=min_size, max_size=max_size): yield [data[i] for i in component] diff --git a/fog/clustering/quickjoin.py b/fog/clustering/quickjoin.py new file mode 100644 index 0000000..c8e0b7e --- /dev/null +++ b/fog/clustering/quickjoin.py @@ -0,0 +1,177 @@ +# ============================================================================= +# Fog QuickJoin Clustering +# ============================================================================= +# +# Implementation of the Quick Join algorithm that works by recursively +# partitionning the given data with regards to the triangular inequality in +# order to reduce the amount of necessary distance computations. +# +# [Reference]: +# Jacox, Edwin H., et Hanan Samet. « Metric Space Similarity Joins ». +# ACM Transactions on Database Systems 33, no 2 (1 juin 2008): 1‑38. +# https://doi.org/10.1145/1366102.1366104. +# +# Fredriksson K., Braithwaite B. (2013) Quicker Similarity Joins in Metric +# Spaces. 
# In: Brisaboa N., Pedreira O., Zezula P. (eds) Similarity Search and
# Applications. SISAP 2013. Lecture Notes in Computer Science, vol 8199.
# Springer, Berlin, Heidelberg
#
import random
from itertools import combinations


def partition(S, distance, p, radius, rho):
    """
    Split the items of S around the pivot p at splitting distance rho.

    Returns a 4-tuple (L, G, Lw, Gw) where:
        * L holds the items strictly closer to p than rho,
        * G holds the remaining items (distance >= rho),
        * Lw is the subset of L inside the window [rho - radius, rho),
          i.e. items of L that may still match an item of G,
        * Gw is the subset of G inside the window [rho, rho + radius],
          i.e. items of G that may still match an item of L.
    """
    L = []
    G = []
    Lw = []
    Gw = []

    low = rho - radius
    high = rho + radius

    for item in S:
        d = distance(p, item)

        if d < rho:
            L.append(item)

            if low <= d:
                Lw.append(item)
        else:
            G.append(item)

            if d <= high:
                Gw.append(item)

    return L, G, Lw, Gw


def quickjoin_bruteforce(S1, S2, distance, radius):
    """Yield every pair (A, B), A in S1, B in S2, with distance(A, B) <= radius."""
    for A in S1:
        for B in S2:
            if distance(A, B) <= radius:
                yield (A, B)


def quickjoin_self_bruteforce(S, distance, radius):
    """Yield every unordered pair of items of S with distance <= radius."""
    for A, B in combinations(S, 2):
        if distance(A, B) <= radius:
            yield (A, B)


def quickjoin(data, distance, radius, block_size=500,
              min_size=2, max_size=float('inf'),
              mode='connected_components',
              seed=None):
    """
    Function returning an iterator over found clusters using the QuickJoin
    algorithm.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        distance (callable): The distance function to use. Must be a true
            metric, e.g. the Levenshtein distance.
        radius (number): produced clusters' radius.
        block_size (number, optional): block size where the algorithm will
            switch to brute force. Defaults to 500.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.
        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
            Defaults to 'connected_components'.
        seed (number, optional): Seed for RNG. Defaults to None.

    Yields:
        list: A viable cluster.

    """
    # NOTE(review): imported lazily so the metric-space helpers above stay
    # importable/testable without pulling in the rest of the package.
    from fog.clustering.utils import clusters_from_pairs

    rng = random.Random(seed)

    if type(data) is not list:
        data = list(data)

    def clustering():
        # Each stack frame is either (S, None), meaning "join S with itself"
        # (QuickJoin), or (S1, S2), meaning "join S1 against S2"
        # (QuickJoinWin).
        stack = [(data, None)]

        while stack:
            S1, S2 = stack.pop()

            # QuickJoin procedure
            if S2 is None:
                S = S1
                N = len(S)

                # N < 2 guard: no pair can exist and pivot sampling below
                # requires at least two items (the original rejection loop
                # would spin forever on a singleton when block_size < 2).
                if N <= block_size or N < 2:
                    yield from quickjoin_self_bruteforce(S, distance, radius)
                    continue

                # Randomly select two distinct pivots, retrying a few times
                # to avoid a degenerate split: rho == 0 (both pivots being
                # duplicates) sends every item into G and the recursion
                # would never shrink.
                rho = 0
                i = 0

                for _ in range(16):
                    i = rng.randint(0, N - 1)
                    j = rng.randint(0, N - 2)

                    # Shift j past i so the two indices are always distinct
                    if j >= i:
                        j += 1

                    rho = distance(S[i], S[j])

                    if rho > 0:
                        break

                if rho == 0:
                    # Heavily duplicated data: partitioning cannot make
                    # progress, so fall back to brute force to guarantee
                    # termination.
                    yield from quickjoin_self_bruteforce(S, distance, radius)
                    continue

                p1 = S[i]

                L, G, Lw, Gw = partition(S, distance, p1, radius, rho)

                # Recursion. With rho > 0, L contains at least p1 and G at
                # least the second pivot, so both strictly shrink.
                stack.append((G, None))
                stack.append((L, None))
                stack.append((Lw, Gw))

            # QuickJoinWin procedure
            else:
                N1 = len(S1)
                N2 = len(S2)
                N = N1 + N2

                # If either side is empty no cross pair exists and the
                # brute force yields nothing, so this also guards sampling.
                if N <= block_size or not N1 or not N2:
                    yield from quickjoin_bruteforce(S1, S2, distance, radius)
                    continue

                # Two distinct pivot indices over the concatenation S1 + S2
                i = rng.randint(0, N - 1)
                j = rng.randint(0, N - 2)

                if j >= i:
                    j += 1

                p1 = S1[i] if i < N1 else S2[i - N1]
                p2 = S1[j] if j < N1 else S2[j - N1]

                rho = distance(p1, p2)

                L1, G1, Lw1, Gw1 = partition(S1, distance, p1, radius, rho)
                L2, G2, Lw2, Gw2 = partition(S2, distance, p1, radius, rho)

                # Only recurse on subproblems that actually shrank. An
                # unshrunk one (possible when rho == 0, or when the windows
                # cover everything because rho <= radius) would otherwise
                # be re-pushed forever; solve it by brute force instead.
                for A, B in ((L1, L2), (G1, G2), (Lw1, Gw2), (Gw1, Lw2)):
                    if len(A) + len(B) == N:
                        yield from quickjoin_bruteforce(A, B, distance, radius)
                    else:
                        stack.append((A, B))

    yield from clusters_from_pairs(
        clustering(),
        min_size=min_size,
        max_size=max_size,
        mode=mode,
        fuzzy=True  # TODO: Reconsider when using SparseSets
    )
from Levenshtein import distance as levenshtein

from fog.clustering import quickjoin

# Tiny hand-crafted fixture: two pairs of near-duplicate strings
DATA = [
    'Mister Hyde',
    'Mister Hide',
    'Claudia Loc',
    'Cladia Loc'
]

# Expected clustering of DATA at radius 1
CLUSTERS = Clusters([
    ('Mister Hyde', 'Mister Hide'),
    ('Claudia Loc', 'Cladia Loc')
])

# Expected clustering of the universities dataset at radius 1
UNIVERSITY_CLUSTERS = Clusters([
    ('Universidad De Manila', 'Universidad de Manila'),
    ('DePaul University', 'DePauw University'),
    ('Seton Hall University', 'Seton Hill University'),
    ('Baylor University', 'Taylor University')
])

with open('./data/universities.csv', 'r') as f:
    UNIVERSITIES = {row['university'] for row in csv.DictReader(f)}


class TestQuickJoin(object):
    def test_basics(self):
        # Small controlled dataset
        result = Clusters(quickjoin(DATA, distance=levenshtein, radius=1))

        assert result == CLUSTERS

    def test_universities(self):
        # Real-world dataset of university names
        result = Clusters(quickjoin(UNIVERSITIES, distance=levenshtein, radius=1))

        assert result == UNIVERSITY_CLUSTERS