Adaptive sorted neighborhood

This commit is contained in:
Yomguithereal 2018-07-31 16:07:54 +02:00
parent 7f1ce40ff2
commit 1f2c15a631
3 changed files with 119 additions and 83 deletions

View File

@ -13,6 +13,7 @@ from fog.clustering.pairwise import (
)
from fog.clustering.quickjoin import quickjoin
from fog.clustering.sorted_neighborhood import (
sorted_neighborhood
sorted_neighborhood,
adaptive_sorted_neighborhood
)
from fog.clustering.vp_tree import vp_tree

View File

@ -23,7 +23,6 @@
from collections import defaultdict
from fog.clustering.utils import make_similarity_function, clusters_from_pairs
# TODO: adaptive etc.
# TODO: parallelize
@ -105,99 +104,128 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
)
# def full_AA_SNM(sorted_records, similarity, window):
# n = len(sorted_records)
# w = window
# blocks = []
# first = 0
# last = first + w
def full_AA_SNM(sorted_records, similarity, window):
    """
    Function partitioning sorted records into contiguous blocks using an
    enlargement phase followed by a retrenchment phase (the "full
    accumulatively-adaptive" flavour of the Sorted Neighborhood Method).

    Args:
        sorted_records (list): Records, already sorted, indexable by position.
        similarity (callable): Function taking two records and returning
            whether they should be deemed similar.
        window (int): Initial size of the window. Must be at least 1.

    Yields:
        tuple: Inclusive `(start, end)` indices of a block of
            `sorted_records`. Blocks are yielded in order, do not overlap and
            cover every index.

    Raises:
        ValueError: if `window` is lower than 1 (raised when the generator
            is first advanced).

    """
    if window < 1:
        raise ValueError('full_AA_SNM: `window` should be at least 1.')

    n = len(sorted_records)
    step = window - 1
    first = 0
    last = first + step

    while last < n:
        start = first

        # Enlargement: while both ends of the current window are similar, the
        # window slides to its own end and grows to the accumulated block size.
        # `first < last` prevents a size-1 window from comparing a record with
        # itself forever, and `last` is clamped so that a run of similar
        # records reaching the end of the list cannot index out of bounds.
        while (
            first < last and
            similarity(sorted_records[first], sorted_records[last])
        ):
            size = last - start + 1
            first = last
            last = min(first + size - 1, n - 1)

        # Retrenchment: shrinking the block from its end, one record at a
        # time, until its two last records are similar or until the block is
        # back to the initial window size.
        size = last - start + 1

        while size > window:
            previous = last - 1

            if similarity(sorted_records[previous], sorted_records[last]):
                break

            # NOTE: possibility to divide the window by two according to
            # the paper.
            size -= 1
            last = previous

        yield (start, last)

        first = last + 1
        last = first + step

    # Remaining records not filling a whole window
    if first < n:
        yield (first, n - 1)
# def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
# distance=None, radius=None, window=10,
# min_size=2, max_size=float('inf'),
# mode='connected_components'):
# """
# Function returning an iterator over found clusters using the sorted
# neighborhood method.
def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
                                 distance=None, radius=None, window=10,
                                 min_size=2, max_size=float('inf'),
                                 mode='connected_components'):
    """
    Function returning an iterator over found clusters using the sorted
    neighborhood method.

    It works using an improved variant of the Sorted Neighborhood Method (SNM)
    called Adaptive Sorted Neighborhood.

    More specifically, this method implements the "Full-Accumulatively-Adaptive
    SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
    Record Linkage" paper.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        key (callable, optional): key on which to sort the data.
        keys (iterable, optional): list of keys on which to sort for multipass
            sorted neighborhood method.
        similarity (callable): If radius is specified, a function returning
            the similarity between two points. Else, a function returning
            whether two points should be deemed similar. Alternatively, one can
            specify `distance` instead.
        distance (callable): If radius is specified, a function returning
            the distance between two points. Else, a function returning
            whether two point should not be deemed similar. Alternatively, one
            can specify `similarity` instead.
        radius (number, optional): produced clusters' radius.
        window (number, optional): Initial size of the window in which to look
            for matches. Defaults to 10.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.
        mode (string, optional): 'fuzzy_clusters', 'connected_components'.
            Defaults to 'connected_components'.

    Yields:
        list: A viable cluster.

    """

    # Formatting similarity
    similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius)

    # Iterating over sorted data
    def clustering():
        multipass_keys = keys if keys is not None else [key]

        # Materializing the data once: a one-shot iterator would otherwise be
        # exhausted by the first pass and every subsequent key would sort
        # an empty collection.
        records = list(data)

        for k in multipass_keys:
            S = sorted(records, key=k)

            for start, end in full_AA_SNM(S, similarity, window):

                # Comparing every pair of records within the block. A block
                # of a single record naturally yields no pair.
                for i in range(start, end):
                    A = S[i]

                    for j in range(i + 1, end + 1):
                        B = S[j]

                        if similarity(A, B):
                            yield (A, B)

    # Building clusters
    yield from clusters_from_pairs(
        clustering(),
        min_size=min_size,
        max_size=max_size,
        mode=mode,
        fuzzy=keys is not None
    )

View File

@ -4,7 +4,7 @@
import csv
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import sorted_neighborhood
from fog.clustering import sorted_neighborhood, adaptive_sorted_neighborhood
DATA = [
'Abelard',
@ -29,3 +29,10 @@ class TestSortedNeighborhood(object):
clusters = Clusters(sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=2))
assert clusters == CLUSTERS
def test_adaptive(self):
    # No key is given, so the strings are sorted alphabetically
    found = adaptive_sorted_neighborhood(
        DATA,
        distance=levenshtein,
        radius=1,
        window=2
    )

    clusters = Clusters(found)

    assert clusters == CLUSTERS