Multi-pass SNM

This commit is contained in:
Yomguithereal 2018-07-12 16:33:35 +02:00
parent 28ed0b8a5e
commit 41bc27814e
2 changed files with 26 additions and 14 deletions

View File

@ -50,6 +50,10 @@ with open('./data/universities.csv', 'r') as f:
clusters = list(sorted_neighborhood(universities, key=skeleton_key, distance=levenshtein, radius=2))
print('SNM Skeleton (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(universities, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)
print()
with open('./data/musicians.csv', 'r') as f:
reader = csv.DictReader(f)
@ -78,6 +82,10 @@ with open('./data/musicians.csv', 'r') as f:
clusters = list(sorted_neighborhood(artists, key=skeleton_key, distance=levenshtein, radius=2))
print('SNM Skeleton (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(artists, keys=[omission_key, skeleton_key], distance=levenshtein, radius=2))
print('SNM Omission + Skeleton (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(quickjoin(artists, distance=levenshtein, radius=2, processes=8))
print('QuickJoin (%i):' % len(clusters), timer() - start)

View File

@ -19,10 +19,10 @@
from collections import defaultdict
from fog.clustering.utils import make_similarity_function, clusters_from_pairs
# TODO: multi-pass, adaptive etc.
# TODO: adaptive etc.
def sorted_neighborhood(data, key=None, similarity=None, distance=None,
def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=None,
radius=None, window=10, min_size=2, max_size=float('inf'),
mode='connected_components'):
"""
@ -45,6 +45,8 @@ def sorted_neighborhood(data, key=None, similarity=None, distance=None,
data (iterable): Arbitrary iterable containing data points to gather
into clusters. Will be fully consumed.
key (callable, optional): key on which to sort the data.
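keys (iterable, optional): list of key functions for the multi-pass
sorted neighborhood method: the data is sorted and windowed once per
key. When given, it is used instead of `key`. Since the data is sorted
again for every key, pass a re-iterable collection (e.g. a list) rather
than a one-shot iterator.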
similarity (callable): If radius is specified, a function returning
the similarity between two points. Else, a function returning
whether two points should be deemed similar. Alternatively, one can
@ -72,25 +74,27 @@ def sorted_neighborhood(data, key=None, similarity=None, distance=None,
similarity = make_similarity_function(similarity=similarity, distance=distance, radius=radius)
# Iterating over sorted data
S = sorted(data, key=key)
n = len(S)
graph = defaultdict(list)
def clustering():
for i in range(n):
A = S[i]
multipass_keys = keys if keys is not None else [key]
for j in range(i + 1, min(n, i + window)):
B = S[j]
for k in multipass_keys:
S = sorted(data, key=k)
n = len(S)
if similarity(A, B):
yield (A, B)
for i in range(n):
A = S[i]
for j in range(i + 1, min(n, i + window)):
B = S[j]
if similarity(A, B):
yield (A, B)
# Building clusters
yield from clusters_from_pairs(
clustering(),
min_size=min_size,
max_size=max_size,
mode=mode
mode=mode,
fuzzy=keys is not None
)