2018-06-08 17:24:49 +00:00
|
|
|
# =============================================================================
# Fog Key Collision Clustering Unit Tests
# =============================================================================
|
2018-06-11 15:00:59 +00:00
|
|
|
from test.clustering.utils import Clusters
|
2018-06-08 17:24:49 +00:00
|
|
|
from Levenshtein import distance as levenshtein
|
|
|
|
from fog.clustering import key_collision
|
|
|
|
from fog.tokenizers import ngrams
|
|
|
|
|
|
|
|
# Case variants of two words; used below with a lowercasing key, under which
# the five strings reduce to two distinct keys.
DATA = [
    'Hello',
    'hello',
    'heLLo',
    'gooDbye',
    'Goodbye',
]
|
|
|
|
|
2018-06-11 15:00:59 +00:00
|
|
|
# Expected clusters for DATA when items are keyed by their lowercased form.
CLUSTERS = Clusters([
    ['Hello', 'hello', 'heLLo'],
    ['gooDbye', 'Goodbye'],
])
|
2018-06-08 17:24:49 +00:00
|
|
|
|
|
|
|
# Person names sharing common prefixes; used below with an n-gram based
# multi-key function.
NAMES = [
    'John Doe',
    'John Doe Jr.',
    'Mary S.',
    'Mary Silva',
    'John D.',
]
|
|
|
|
|
2018-06-11 15:00:59 +00:00
|
|
|
# Expected result for NAMES with `merge=False`. Entries are intentionally
# repeated.
# NOTE(review): presumably one cluster is emitted per colliding key, hence the
# duplicates -- confirm against key_collision's unmerged behaviour.
NAMES_CLUSTERS = Clusters([
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.'],
    ['John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
    ['Mary S.', 'Mary Silva'],
])
|
2018-06-08 17:24:49 +00:00
|
|
|
|
2018-06-11 15:00:59 +00:00
|
|
|
# Expected result for NAMES when key_collision is called without `merge=False`
# (i.e. with its default merging behaviour).
MERGED_NAMES_CLUSTERS = Clusters([
    ['John D.', 'John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
])
|
|
|
|
|
|
|
|
|
|
|
|
class TestKeyCollisionClustering(object):
    """Unit tests for `fog.clustering.key_collision`."""

    def test_single_key(self):
        """Cluster DATA with a single lowercasing key and compare to CLUSTERS."""
        clusters = Clusters(key_collision(DATA, key=lambda x: x.lower()))

        assert clusters == CLUSTERS

    def test_multiple_key(self):
        """Cluster NAMES with 5-gram keys and `merge=False`.

        The result is expected to equal NAMES_CLUSTERS, which contains
        repeated clusters.
        """
        clusters = Clusters(
            key_collision(NAMES, keys=lambda x: ngrams(5, x), merge=False)
        )

        assert clusters == NAMES_CLUSTERS

    def test_multiple_keys_merged(self):
        """Cluster NAMES with 5-gram keys, leaving `merge` at its default.

        The result is expected to equal MERGED_NAMES_CLUSTERS.
        """
        clusters = Clusters(
            key_collision(NAMES, keys=lambda x: ngrams(5, x))
        )

        assert clusters == MERGED_NAMES_CLUSTERS
|