mirror of https://github.com/Yomguithereal/fog.git
Adding SNM
This commit is contained in:
parent
ed4c1f0834
commit
eff4399192
|
@ -3,7 +3,7 @@ from functools import partial
|
|||
from timeit import default_timer as timer
|
||||
from fog.clustering import *
|
||||
from fog.tokenizers import ngrams
|
||||
from fog.key import fingerprint
|
||||
from fog.key import fingerprint, omission_key
|
||||
from Levenshtein import distance as levenshtein
|
||||
|
||||
with open('./data/universities.csv', 'r') as f:
|
||||
|
@ -33,6 +33,10 @@ with open('./data/universities.csv', 'r') as f:
|
|||
clusters = list(blocking(universities, blocks=partial(ngrams, 6), distance=levenshtein, radius=2))
|
||||
print('Blocking (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(sorted_neighborhood(universities, key=omission_key, distance=levenshtein, radius=2))
|
||||
print('SNM Omission (%i):' % len(clusters), timer() - start)
|
||||
|
||||
print()
|
||||
with open('./data/musicians.csv', 'r') as f:
|
||||
reader = csv.DictReader(f)
|
||||
|
@ -53,6 +57,10 @@ with open('./data/musicians.csv', 'r') as f:
|
|||
clusters = list(blocking(artists, blocks=partial(ngrams, 6), distance=levenshtein, radius=2, processes=8))
|
||||
print('Blocking (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(sorted_neighborhood(artists, key=omission_key, distance=levenshtein, radius=2))
|
||||
print('SNM Omission (%i):' % len(clusters), timer() - start)
|
||||
|
||||
start = timer()
|
||||
clusters = list(pairwise_fuzzy_clusters(artists, distance=levenshtein, radius=2, processes=8))
|
||||
print('Parallel Fuzzy clusters (%i):' % len(clusters), timer() - start)
|
|
@ -10,4 +10,5 @@ from fog.clustering.pairwise import (
|
|||
pairwise_fuzzy_clusters,
|
||||
pairwise_connected_components
|
||||
)
|
||||
from fog.clustering.sorted_neighborhood import sorted_neighborhood
|
||||
from fog.clustering.vp_tree import vp_tree
|
||||
|
|
|
@ -44,8 +44,8 @@ def blocking_worker(payload):
|
|||
def blocking(data, block=None, blocks=None, similarity=None, distance=None,
|
||||
radius=None, min_size=2, max_size=float('inf'), processes=1):
|
||||
"""
|
||||
Function returning an iterator over found clusters using the leader
|
||||
algorithm.
|
||||
Function returning an iterator over found clusters using the blocking
|
||||
method.
|
||||
|
||||
It works by dispatching given items into one or more buckets before
|
||||
computing pairwise comparisons on each bucket.
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
# =============================================================================
|
||||
# Fog Sorted Neighborhood Clustering
|
||||
# =============================================================================
|
||||
#
|
||||
# Implementation of the Sorted Neighborhood method.
|
||||
#
|
||||
from collections import defaultdict
|
||||
from fog.clustering.utils import make_similarity_function
|
||||
|
||||
|
||||
def sorted_neighborhood(data, key=None, similarity=None, distance=None,
                        radius=None, window=10, min_size=2, max_size=float('inf')):
    """
    Yield clusters found through the Sorted Neighborhood Method (SNM).

    The data is first sorted according to the given key so that, with a
    cleverly chosen key, similar items land next to one another in the
    sorted order. Pairwise similarity/distance computations are then
    restricted to a sliding window of constant size over that order.

    Omission & skeleton keys by Pollock & Zamora are a good choice of
    sorting key when trying to find misspellings, for instance.

    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.
        key (callable, optional): key on which to sort the data.
        similarity (callable): If radius is specified, a function returning
            the similarity between two points. Else, a function returning
            whether two points should be deemed similar. Alternatively, one can
            specify `distance` instead.
        distance (callable): If radius is specified, a function returning
            the distance between two points. Else, a function returning
            whether two point should not be deemed similar. Alternatively, one
            can specify `similarity` instead.
        radius (number, optional): produced clusters' radius.
        window (number, optional): Size of the window in which to look for
            matches. Defaults to 10.
        min_size (number, optional): minimum number of items in a cluster for
            it to be considered viable. Defaults to 2.
        max_size (number, optional): maximum number of items in a cluster for
            it to be considered viable. Defaults to infinity.

    Yields:
        list: A viable cluster.

    """

    # Normalize similarity/distance/radius into a single boolean predicate
    match = make_similarity_function(similarity=similarity, distance=distance, radius=radius)

    # Materializing & sorting the data
    items = sorted(data, key=key)
    total = len(items)

    # Adjacency lists: item index -> indices of matching items seen in window
    neighborhoods = defaultdict(list)

    for a, item in enumerate(items):

        # Comparing only against the next `window` items in sorted order
        for b in range(a + 1, min(total, a + window + 1)):

            if match(item, items[b]):
                neighborhoods[a].append(b)
                neighborhoods[b].append(a)

    # Emitting clusters, skipping items already claimed by an earlier one
    claimed = set()

    for a, linked in neighborhoods.items():
        if a in claimed:
            continue

        size = len(linked) + 1

        # Dropping clusters outside the viable size bounds
        if size < min_size or size > max_size:
            continue

        claimed.update(linked)

        yield [items[a]] + [items[b] for b in linked]
|
|
@ -19,8 +19,6 @@ UNDESIRABLES_RE = re.compile(r'[^A-Z]')
|
|||
CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'
|
||||
VOWELS = set('AEIOU')
|
||||
|
||||
# TODO: omission/skeleton key clustering for distance = 1
|
||||
|
||||
|
||||
def omission_key(string):
|
||||
"""
|
||||
|
|
|
@ -5,7 +5,6 @@ import csv
|
|||
from test.clustering.utils import Clusters
|
||||
from Levenshtein import distance as levenshtein
|
||||
from fog.clustering import blocking
|
||||
from fog.tokenizers import ngrams
|
||||
|
||||
DATA = [
|
||||
'Abelard',
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
# =============================================================================
|
||||
# Fog Sorted Neighborhood Unit Tests
|
||||
# =============================================================================
|
||||
import csv
|
||||
from test.clustering.utils import Clusters
|
||||
from Levenshtein import distance as levenshtein
|
||||
from fog.clustering import sorted_neighborhood
|
||||
|
||||
# Fixture: two pairs of near-misspellings plus three decoys
DATA = ['Abelard', 'Abelar', 'Atrium', 'Atrides', 'Belgian', 'Belgia', 'Telgia']

# Ground-truth clusters expected from the fixture above
CLUSTERS = Clusters([('Abelard', 'Abelar'), ('Belgian', 'Belgia')])


class TestSortedNeighborhood(object):
    def test_basics(self):

        # Default key sorts alphabetically; window of 1 compares adjacent items only
        result = Clusters(sorted_neighborhood(DATA, distance=levenshtein, radius=1, window=1))

        assert result == CLUSTERS
|
Loading…
Reference in New Issue