# fog/test/clustering/key_collision_test.py
#
# 60 lines
# 1.5 KiB
# Python

# =============================================================================
# Fog Key Collision Clustering Unit Tests
# =============================================================================
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import key_collision
from fog.tokenizers import ngrams
# Case variants of two words, used as input by test_single_key.
DATA = [
    'Hello', 'hello', 'heLLo',
    'gooDbye', 'Goodbye',
]
# Expected grouping of DATA when clustered with a lowercasing key
# (compared against in test_single_key).
CLUSTERS = Clusters([
    ['Hello', 'hello', 'heLLo'],
    ['gooDbye', 'Goodbye'],
])
# Person names used as input by the multiple-key tests.
NAMES = [
    'John Doe', 'John Doe Jr.', 'Mary S.', 'Mary Silva', 'John D.',
]
# Expected result for NAMES in test_multiple_key (merge=False).
# The repeated rows are intentional and must stay as they are.
# NOTE(review): presumably one cluster is reported per colliding key, which
# would explain why the same group appears more than once -- confirm against
# key_collision.
NAMES_CLUSTERS = Clusters([
    # 'John ...' groups
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.'],
    ['John Doe', 'John Doe Jr.'],

    # 'Mary ...' groups
    ['Mary S.', 'Mary Silva'],
    ['Mary S.', 'Mary Silva'],
])
# Expected result for NAMES in test_multiple_keys_merged, where key_collision
# is called without merge=False.
MERGED_NAMES_CLUSTERS = Clusters([
    ['John D.', 'John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
])
class TestKeyCollisionClustering(object):
    """Checks key_collision output against the module-level fixtures."""

    def test_single_key(self):
        """Clustering DATA on a lowercased key must yield CLUSTERS."""
        def lowercase(value):
            return value.lower()

        found = key_collision(DATA, key=lowercase)

        assert Clusters(found) == CLUSTERS

    def test_multiple_key(self):
        """Clustering NAMES on ngrams(5, ...) keys with merge=False must
        yield NAMES_CLUSTERS."""
        def five_grams(name):
            return ngrams(5, name)

        found = key_collision(NAMES, keys=five_grams, merge=False)

        assert Clusters(found) == NAMES_CLUSTERS

    def test_multiple_keys_merged(self):
        """Clustering NAMES on ngrams(5, ...) keys without passing merge
        must yield MERGED_NAMES_CLUSTERS."""
        def five_grams(name):
            return ngrams(5, name)

        found = key_collision(NAMES, keys=five_grams)

        assert Clusters(found) == MERGED_NAMES_CLUSTERS