From 1f2c15a631cd41f412e07aada79ef1cf5d60c202 Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Tue, 31 Jul 2018 16:07:54 +0200
Subject: [PATCH] Adaptive sorted neighborhood

---
 fog/clustering/__init__.py                  |   3 +-
 fog/clustering/sorted_neighborhood.py       | 190 +++++++++++---------
 test/clustering/sorted_neighborhood_test.py |   9 +-
 3 files changed, 119 insertions(+), 83 deletions(-)

diff --git a/fog/clustering/__init__.py b/fog/clustering/__init__.py
index 6baaf04..df2106d 100644
--- a/fog/clustering/__init__.py
+++ b/fog/clustering/__init__.py
@@ -13,6 +13,7 @@ from fog.clustering.pairwise import (
 )
 from fog.clustering.quickjoin import quickjoin
 from fog.clustering.sorted_neighborhood import (
-    sorted_neighborhood
+    sorted_neighborhood,
+    adaptive_sorted_neighborhood
 )
 from fog.clustering.vp_tree import vp_tree
diff --git a/fog/clustering/sorted_neighborhood.py b/fog/clustering/sorted_neighborhood.py
index 9d9d881..b5ebb62 100644
--- a/fog/clustering/sorted_neighborhood.py
+++ b/fog/clustering/sorted_neighborhood.py
@@ -23,7 +23,6 @@ from collections import defaultdict
 from fog.clustering.utils import make_similarity_function, clusters_from_pairs
 
 
-# TODO: adaptive etc.
 # TODO: parallelize
 
 
@@ -105,99 +104,128 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
     )
 
 
-# def full_AA_SNM(sorted_records, similarity, window):
-#     n = len(sorted_records)
-#     w = window
-#     blocks = []
-#     first = 0
-#     last = first + w
+def full_AA_SNM(sorted_records, similarity, window):
+    n = len(sorted_records)
+    w = window
+    step = w - 1
+    first = 0
+    last = first + step
 
-#     while last < n:
-#         block.append(first)
+    while last < n:
+        f = first
 
-#         # Enlargement
-#         while similarity(sorted_records[first], sorted_records[last]):
-#             w = last - first + 1
-#             first = last
-#             last = first + w
+        # Enlargement
+        while similarity(sorted_records[first], sorted_records[last]):
+            w = last - f + 1
+            first = last
+            last = first + w - 1
 
-#         # Retrechment
-#         while w > window:
-#             pass
+        # Retrenchment
+        w = last - f + 1
+
+        while w > window:
+            previous = last - 1
+
+            if similarity(sorted_records[previous], sorted_records[last]):
+                break
+
+            # NOTE: possibility to divide the window by two according to
+            # the paper.
+            w -= 1
+            last = previous
+
+        w = window
+
+        yield (f, last)
+
+        first = last + 1
+        last = first + step
+
+    if first < n:
+        yield (first, n - 1)
 
 
-# def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
-#                                  distance=None, radius=None, window=10,
-#                                  min_size=2, max_size=float('inf'),
-#                                  mode='connected_components'):
-#     """
-#     Function returning an iterator over found clusters using the sorted
-#     neighborhood method.
+def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
+                                 distance=None, radius=None, window=10,
+                                 min_size=2, max_size=float('inf'),
+                                 mode='connected_components'):
+    """
+    Function returning an iterator over found clusters using the adaptive
+    sorted neighborhood method.
 
-#     It works using a improved variant of the Sorted Neighborhood Method (SNM)
-#     called Adaptive Sorted Neighoborhood.
+    It works using an improved variant of the Sorted Neighborhood Method (SNM)
+    called Adaptive Sorted Neighborhood.
 
-#     More specifically, this method implements the "Full-Accumulatively-Adaptative
-#     SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
-#     Record Linkage" paper.
+    More specifically, this method implements the "Full-Accumulatively-Adaptive
+    SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
+    Record Linkage" paper.
-#     Args:
-#         data (iterable): Arbitrary iterable containing data points to gather
-#             into clusters. Will be fully consumed.
-#         key (callable, optional): key on which to sort the data.
-#         keys (iterable, optional): list of keys on which to sort for multipass
-#             sorted neighborhood method.
-#         similarity (callable): If radius is specified, a function returning
-#             the similarity between two points. Else, a function returning
-#             whether two points should be deemed similar. Alternatively, one can
-#             specify `distance` instead.
-#         distance (callable): If radius is specified, a function returning
-#             the distance between two points. Else, a function returning
-#             whether two point should not be deemed similar. Alternatively, one
-#             can specify `similarity` instead.
-#         radius (number, optional): produced clusters' radius.
-#         window (number, optional): Size of the window in which to look for
-#             matches. Defaults to 10.
-#         min_size (number, optional): minimum number of items in a cluster for
-#             it to be considered viable. Defaults to 2.
-#         max_size (number, optional): maximum number of items in a cluster for
-#             it to be considered viable. Defaults to infinity.
-#         mode (string, optional): 'fuzzy_clusters', 'connected_components'.
-#             Defaults to 'connected_components'.
+    Args:
+        data (iterable): Arbitrary iterable containing data points to gather
+            into clusters. Will be fully consumed.
+        key (callable, optional): key on which to sort the data.
+        keys (iterable, optional): list of keys on which to sort for multipass
+            sorted neighborhood method.
+        similarity (callable): If radius is specified, a function returning
+            the similarity between two points. Else, a function returning
+            whether two points should be deemed similar. Alternatively, one can
+            specify `distance` instead.
+        distance (callable): If radius is specified, a function returning
+            the distance between two points. Else, a function returning
+            whether two points should not be deemed similar. Alternatively, one
+            can specify `similarity` instead.
+        radius (number, optional): produced clusters' radius.
+        window (number, optional): Size of the window in which to look for
+            matches. Defaults to 10.
+        min_size (number, optional): minimum number of items in a cluster for
+            it to be considered viable. Defaults to 2.
+        max_size (number, optional): maximum number of items in a cluster for
+            it to be considered viable. Defaults to infinity.
+        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
+            Defaults to 'connected_components'.
 
-#     Yields:
-#         list: A viable cluster.
+    Yields:
+        list: A viable cluster.
-# """ + """ -# # Formatting similarity -# similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius) + # Formatting similarity + similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius) -# # Iterating over sorted data -# def clustering(): -# multipass_keys = keys if keys is not None else [key] + # Iterating over sorted data + def clustering(): + multipass_keys = keys if keys is not None else [key] -# for k in multipass_keys: -# S = sorted(data, key=k) -# n = len(S) + for k in multipass_keys: + S = sorted(data, key=k) -# w = window -# block = [] + for start, end in full_AA_SNM(S, similarity, window): + l = end - start + 1 -# # Building clusters -# yield from clusters_from_pairs( -# clustering(), -# min_size=min_size, -# max_size=max_size, -# mode=mode, -# fuzzy=keys is not None -# ) + if l == 1: + continue -# # Building clusters -# # yield from clusters_from_pairs( -# # clustering(), -# # min_size=min_size, -# # max_size=max_size, -# # mode=mode, -# # fuzzy=keys is not None -# # ) + elif l == 2: + A = S[start] + B = S[end] + + if similarity(A, B): + yield (A, B) + + else: + for i in range(start, end + 1): + for j in range(i + 1, end + 1): + A = S[i] + B = S[j] + + if similarity(A, B): + yield (A, B) + + # Building clusters + yield from clusters_from_pairs( + clustering(), + min_size=min_size, + max_size=max_size, + mode=mode, + fuzzy=keys is not None + ) diff --git a/test/clustering/sorted_neighborhood_test.py b/test/clustering/sorted_neighborhood_test.py index e164bb0..3d75b43 100644 --- a/test/clustering/sorted_neighborhood_test.py +++ b/test/clustering/sorted_neighborhood_test.py @@ -4,7 +4,7 @@ import csv from test.clustering.utils import Clusters from Levenshtein import distance as levenshtein -from fog.clustering import sorted_neighborhood +from fog.clustering import sorted_neighborhood, adaptive_sorted_neighborhood DATA = [ 'Abelard', @@ -29,3 +29,10 @@ class TestSortedNeighborhood(object): clusters = Clusters(sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=2)) assert clusters == CLUSTERS + + def test_adaptive(self): + + # Sorting alphabetically + clusters = Clusters(adaptive_sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=2)) + + assert clusters == CLUSTERS