# =============================================================================
# Fog Key Collision Clustering Unit Tests
# =============================================================================
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import key_collision
from fog.tokenizers import ngrams
# Case variants of two words; input for the single-key clustering test.
DATA = [
    'Hello',
    'hello',
    'heLLo',
    'gooDbye',
    'Goodbye',
]
# Expected result of clustering DATA with a lowercasing key
# (compared against in `test_single_key`).
CLUSTERS = Clusters([
    ['Hello', 'hello', 'heLLo'],
    ['gooDbye', 'Goodbye'],
])
# Person-name variants; input for the multiple-keys clustering tests.
NAMES = [
    'John Doe',
    'John Doe Jr.',
    'Mary S.',
    'Mary Silva',
    'John D.',
]
# Expected result of clustering NAMES with `merge=False`
# (compared against in `test_multiple_key`).
# The repeated entries are intentional and must be kept as-is; presumably
# each one corresponds to a distinct colliding key -- TODO confirm against
# the `Clusters` helper's comparison semantics.
NAMES_CLUSTERS = Clusters([
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.'],
    ['John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
    ['Mary S.', 'Mary Silva'],
])
# Expected result of clustering NAMES without passing `merge=False`
# (compared against in `test_multiple_keys_merged`).
MERGED_NAMES_CLUSTERS = Clusters([
    ['John D.', 'John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
])
class TestKeyCollisionClustering(object):
    """Unit tests for `fog.clustering.key_collision`.

    Each test wraps the function's output in `Clusters` and compares it
    with the matching module-level expected fixture.
    """

    def test_single_key(self):
        """Cluster DATA using one key per item (the lowercased string)."""
        clusters = Clusters(key_collision(DATA, key=lambda x: x.lower()))

        assert clusters == CLUSTERS

    def test_multiple_key(self):
        """Cluster NAMES using `ngrams(5, x)` as keys, with `merge=False`."""
        clusters = Clusters(
            key_collision(NAMES, keys=lambda x: ngrams(5, x), merge=False)
        )

        assert clusters == NAMES_CLUSTERS

    def test_multiple_keys_merged(self):
        """Cluster NAMES using `ngrams(5, x)` as keys, leaving `merge` unset."""
        clusters = Clusters(key_collision(NAMES, keys=lambda x: ngrams(5, x)))

        assert clusters == MERGED_NAMES_CLUSTERS