Adding SNM

This commit is contained in:
Yomguithereal 2018-07-06 18:05:19 +02:00
parent ed4c1f0834
commit eff4399192
7 changed files with 127 additions and 6 deletions

View File

@ -3,7 +3,7 @@ from functools import partial
from timeit import default_timer as timer
from fog.clustering import *
from fog.tokenizers import ngrams
from fog.key import fingerprint
from fog.key import fingerprint, omission_key
from Levenshtein import distance as levenshtein
with open('./data/universities.csv', 'r') as f:
@ -33,6 +33,10 @@ with open('./data/universities.csv', 'r') as f:
clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
print('Blocking (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2))
print('SNM Omission (%i):' % len(clusters), timer() - start)
print()
with open('./data/musicians.csv', 'r') as f:
reader = csv.DictReader(f)
@ -53,6 +57,10 @@ with open('./data/musicians.csv', 'r') as f:
clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8))
print('Blocking (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2))
print('SNM Omission (%i):' % len(clusters), timer() - start)
start = timer()
clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8))
print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start)

View File

@ -10,4 +10,5 @@ from fog.clustering.pairwise import (
pairwise_fuzzy_clusters,
pairwise_connected_components
)
from fog.clustering.sorted_neighborhood import sorted_neighborhood
from fog.clustering.vp_tree import vp_tree

View File

@ -44,8 +44,8 @@ def blocking_worker(payload):
def blocking(data, block=None, blocks=None, similarity=None, distance=None,
radius=None, min_size=2, max_size=float('inf'), processes=1):
"""
Function returning an iterator over found clusters using the leader
algorithm.
Function returning an iterator over found clusters using the blocking
method.
It works by dispatching given items into one or more buckets before
computing pairwise comparisons on each bucket.

View File

@ -0,0 +1,84 @@
# =============================================================================
# Fog Sorted Neighborhood Clustering
# =============================================================================
#
# Implementation of the Sorted Neighborhood method.
#
from collections import defaultdict
from fog.clustering.utils import make_similarity_function
def sorted_neighborhood(data, key=None, similarity=None, distance=None,
                        radius=None, window=10, min_size=2, max_size=float('inf')):
    """
    Function returning an iterator over found clusters using the sorted
    neighborhood method (SNM).

    The data is first sorted according to the given key so that, if the key
    is cleverly chosen, similar items end up next to one another. Pairwise
    similarity/distance computations are then restricted to a small sliding
    window over the sorted list.

    Omission & skeleton keys by Pollock & Zamora are a good choice of
    sorting key if you try to find misspellings, for instance.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        key (callable, optional): key on which to sort the data.
        similarity (callable): If radius is specified, a function returning
            the similarity between two points. Else, a function returning
            whether two points should be deemed similar. Alternatively, one
            can specify `distance` instead.
        distance (callable): If radius is specified, a function returning
            the distance between two points. Else, a function returning
            whether two points should not be deemed similar. Alternatively,
            one can specify `similarity` instead.
        radius (number, optional): produced clusters' radius.
        window (number, optional): Size of the window in which to look for
            matches. Defaults to 10.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.

    Yields:
        list: A viable cluster.
    """

    # Normalize the similarity/distance/radius trio into a single
    # boolean predicate.
    test = make_similarity_function(similarity=similarity, distance=distance, radius=radius)

    sorted_data = sorted(data, key=key)
    size = len(sorted_data)

    # Adjacency lists of matching item indices.
    matches = defaultdict(list)

    for i, item in enumerate(sorted_data):

        # Only the next `window` items in sorted order are candidates.
        for j in range(i + 1, min(size, i + window + 1)):
            if test(item, sorted_data[j]):
                matches[i].append(j)
                matches[j].append(i)

    # Emitting viable clusters
    consumed = set()

    for i, neighbors in matches.items():
        if i in consumed:
            continue

        cluster_size = len(neighbors) + 1

        # Size constraints are checked before marking anything as consumed,
        # so members of a rejected group may still seed another cluster.
        if cluster_size < min_size or cluster_size > max_size:
            continue

        consumed.update(neighbors)

        yield [sorted_data[i]] + [sorted_data[j] for j in neighbors]

View File

@ -19,8 +19,6 @@ UNDESIRABLES_RE = re.compile(r'[^A-Z]')
CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'
VOWELS = set('AEIOU')
# TODO: omission/skeleton key clustering for distance = 1
def omission_key(string):
"""

View File

@ -5,7 +5,6 @@ import csv
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import blocking
from fog.tokenizers import ngrams
DATA = [
'Abelard',

View File

@ -0,0 +1,31 @@
# =============================================================================
# Fog Sorted Neighborhood Unit Tests
# =============================================================================
import csv
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import sorted_neighborhood
# Fixture: a small list of names containing close misspellings.
DATA = [
    'Abelard',
    'Abelar',
    'Atrium',
    'Atrides',
    'Belgian',
    'Belgia',
    'Telgia'
]

# Expected result: the pairs that are within Levenshtein distance 1
# of an adjacent item once the data is sorted alphabetically.
CLUSTERS = Clusters([
    ('Abelard', 'Abelar'),
    ('Belgian', 'Belgia')
])
class TestSortedNeighborhood(object):
    """Unit tests for the sorted neighborhood clustering routine."""

    def test_basics(self):

        # Default key sorts the data alphabetically
        computed = Clusters(
            sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=1)
        )

        assert computed == CLUSTERS