From 1f2c15a631cd41f412e07aada79ef1cf5d60c202 Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Tue, 31 Jul 2018 16:07:54 +0200
Subject: [PATCH] Adaptive sorted neighborhood

---
 fog/clustering/__init__.py                  |   3 +-
 fog/clustering/sorted_neighborhood.py       | 190 +++++++++++---------
 test/clustering/sorted_neighborhood_test.py |   9 +-
 3 files changed, 119 insertions(+), 83 deletions(-)

diff --git a/fog/clustering/__init__.py b/fog/clustering/__init__.py
index 6baaf04..df2106d 100644
--- a/fog/clustering/__init__.py
+++ b/fog/clustering/__init__.py
@@ -13,6 +13,7 @@ from fog.clustering.pairwise import (
 )
 from fog.clustering.quickjoin import quickjoin
 from fog.clustering.sorted_neighborhood import (
-    sorted_neighborhood
+    sorted_neighborhood,
+    adaptive_sorted_neighborhood
 )
 from fog.clustering.vp_tree import vp_tree
diff --git a/fog/clustering/sorted_neighborhood.py b/fog/clustering/sorted_neighborhood.py
index 9d9d881..b5ebb62 100644
--- a/fog/clustering/sorted_neighborhood.py
+++ b/fog/clustering/sorted_neighborhood.py
@@ -23,7 +23,6 @@ from collections import defaultdict
 from fog.clustering.utils import make_similarity_function, clusters_from_pairs
 
 
-# TODO: adaptive etc.
 # TODO: parallelize
 
 
@@ -105,99 +104,128 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
     )
 
 
-# def full_AA_SNM(sorted_records, similarity, window):
-#     n = len(sorted_records)
-#     w = window
-#     blocks = []
-#     first = 0
-#     last = first + w
+def full_AA_SNM(sorted_records, similarity, window):
+    n = len(sorted_records)
+    w = window
+    step = w - 1
+    first = 0
+    last = first + step
 
-#     while last < n:
-#         block.append(first)
+    while last < n:
+        f = first
 
-#         # Enlargement
-#         while similarity(sorted_records[first], sorted_records[last]):
-#             w = last - first + 1
-#             first = last
-#             last = first + w
+        # Enlargement
+        while similarity(sorted_records[first], sorted_records[last]):
+            w = last - f + 1
+            first = last
+            last = first + w - 1
 
-#         # Retrechment
-#         while w > window:
-#             pass
+        # Retrenchment
+        w = last - f + 1
+
+        while w > window:
+            previous = last - 1
+
+            if similarity(sorted_records[previous], sorted_records[last]):
+                break
+
+            # NOTE: possibility to divide the window by two according to
+            # the paper.
+            w -= 1
+            last = previous
+
+        w = window
+
+        yield (f, last)
+
+        first = last + 1
+        last = first + step
+
+    if first < n:
+        yield (first, n - 1)
 
 
-# def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
-#                                  distance=None, radius=None, window=10,
-#                                  min_size=2, max_size=float('inf'),
-#                                  mode='connected_components'):
-#     """
-#     Function returning an iterator over found clusters using the sorted
-#     neighborhood method.
+def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
+                                 distance=None, radius=None, window=10,
+                                 min_size=2, max_size=float('inf'),
+                                 mode='connected_components'):
+    """
+    Function returning an iterator over found clusters using the adaptive
+    sorted neighborhood method.
 
-#     It works using a improved variant of the Sorted Neighborhood Method (SNM)
-#     called Adaptive Sorted Neighoborhood.
+    It works using an improved variant of the Sorted Neighborhood Method (SNM)
+    called Adaptive Sorted Neighborhood.
 
-#     More specifically, this method implements the "Full-Accumulatively-Adaptative
-#     SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
-#     Record Linkage" paper.
+    More specifically, this method implements the "Full-Accumulatively-Adaptive
+    SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
+    Record Linkage" paper.
-#     Args:
-#         data (iterable): Arbitrary iterable containing data points to gather
-#             into clusters. Will be fully consumed.
-#         key (callable, optional): key on which to sort the data.
-#         keys (iterable, optional): list of keys on which to sort for multipass
-#             sorted neighborhood method.
-#         similarity (callable): If radius is specified, a function returning
-#             the similarity between two points. Else, a function returning
-#             whether two points should be deemed similar. Alternatively, one can
-#             specify `distance` instead.
-#         distance (callable): If radius is specified, a function returning
-#             the distance between two points. Else, a function returning
-#             whether two point should not be deemed similar. Alternatively, one
-#             can specify `similarity` instead.
-#         radius (number, optional): produced clusters' radius.
-#         window (number, optional): Size of the window in which to look for
-#             matches. Defaults to 10.
-#         min_size (number, optional): minimum number of items in a cluster for
-#             it to be considered viable. Defaults to 2.
-#         max_size (number, optional): maximum number of items in a cluster for
-#             it to be considered viable. Defaults to infinity.
-#         mode (string, optional): 'fuzzy_clusters', 'connected_components'.
-#             Defaults to 'connected_components'.
+    Args:
+        data (iterable): Arbitrary iterable containing data points to gather
+            into clusters. Will be fully consumed.
+        key (callable, optional): key on which to sort the data.
+        keys (iterable, optional): list of keys on which to sort for multipass
+            sorted neighborhood method.
+        similarity (callable): If radius is specified, a function returning
+            the similarity between two points. Else, a function returning
+            whether two points should be deemed similar. Alternatively, one can
+            specify `distance` instead.
+        distance (callable): If radius is specified, a function returning
+            the distance between two points. Else, a function returning
+            whether two points should not be deemed similar. Alternatively, one
+            can specify `similarity` instead.
+        radius (number, optional): produced clusters' radius.
+        window (number, optional): Size of the window in which to look for
+            matches. Defaults to 10.
+        min_size (number, optional): minimum number of items in a cluster for
+            it to be considered viable. Defaults to 2.
+        max_size (number, optional): maximum number of items in a cluster for
+            it to be considered viable. Defaults to infinity.
+        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
+            Defaults to 'connected_components'.
 
-#     Yields:
-#         list: A viable cluster.
+    Yields:
+        list: A viable cluster.
-# """ + """ -# # Formatting similarity -# similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius) + # Formatting similarity + similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius) -# # Iterating over sorted data -# def clustering(): -# multipass_keys = keys if keys is not None else [key] + # Iterating over sorted data + def clustering(): + multipass_keys = keys if keys is not None else [key] -# for k in multipass_keys: -# S = sorted(data, key=k) -# n = len(S) + for k in multipass_keys: + S = sorted(data, key=k) -# w = window -# block = [] + for start, end in full_AA_SNM(S, similarity, window): + l = end - start + 1 -# # Building clusters -# yield from clusters_from_pairs( -# clustering(), -# min_size=min_size, -# max_size=max_size, -# mode=mode, -# fuzzy=keys is not None -# ) + if l == 1: + continue -# # Building clusters -# # yield from clusters_from_pairs( -# # clustering(), -# # min_size=min_size, -# # max_size=max_size, -# # mode=mode, -# # fuzzy=keys is not None -# # ) + elif l == 2: + A = S[start] + B = S[end] + + if similarity(A, B): + yield (A, B) + + else: + for i in range(start, end + 1): + for j in range(i + 1, end + 1): + A = S[i] + B = S[j] + + if similarity(A, B): + yield (A, B) + + # Building clusters + yield from clusters_from_pairs( + clustering(), + min_size=min_size, + max_size=max_size, + mode=mode, + fuzzy=keys is not None + ) diff --git a/test/clustering/sorted_neighborhood_test.py b/test/clustering/sorted_neighborhood_test.py index e164bb0..3d75b43 100644 --- a/test/clustering/sorted_neighborhood_test.py +++ b/test/clustering/sorted_neighborhood_test.py @@ -4,7 +4,7 @@ import csv from test.clustering.utils import Clusters from Levenshtein import distance as levenshtein -from fog.clustering import sorted_neighborhood +from fog.clustering import sorted_neighborhood, adaptive_sorted_neighborhood DATA = [ 'Abelard', @@ -29,3 +29,10 @@ class TestSortedNeighborhood(object): clusters = Clusters(sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=2)) assert clusters == CLUSTERS + + def test_adaptive(self): + + # Sorting alphabetically + clusters = Clusters(adaptive_sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=2)) + + assert clusters == CLUSTERS