# fog/test/clustering/key_collision_test.py

# =============================================================================
# Fog Key Collision Clustering Unit Tests
# =============================================================================
from Levenshtein import distance as levenshtein
from fog.clustering import key_collision
from fog.tokenizers import ngrams

# Case variants of two words: lowercasing every entry leaves only the two
# distinct keys 'hello' and 'goodbye'.
DATA = ['Hello', 'hello', 'heLLo', 'gooDbye', 'Goodbye']

# Grouping of DATA by lowercased form, listed in order of first appearance.
CLUSTERS = [
    ['Hello', 'hello', 'heLLo'],
    ['gooDbye', 'Goodbye'],
]

# Person names where several entries share leading character sequences.
NAMES = ['John Doe', 'John Doe Jr.', 'Mary S.', 'Mary Silva', 'John D.']

# Expected unmerged result when clustering NAMES on 5-gram keys. The same
# cluster appears several times -- presumably once per key its members have
# in common (verify against the key_collision implementation).
NAMES_CLUSTERS = [
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.'],
    ['John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
    ['Mary S.', 'Mary Silva'],
]

# Expected merged result: each cluster is a sorted tuple and the whole
# collection is a set, so the comparison ignores cluster ordering.
MERGED_NAMES_CLUSTERS = {
    ('John D.', 'John Doe', 'John Doe Jr.'),
    ('Mary S.', 'Mary Silva'),
}


class TestKeyCollisionClustering(object):
    """Unit tests for `key_collision` with single and multiple key functions."""

    def test_single_key(self):
        """Cluster DATA with a single key function that lowercases each item."""
        clusters = list(key_collision(DATA, key=lambda x: x.lower()))

        # Compare against the expected fixture: the former self-comparison
        # (`clusters == clusters`) was always true and could never fail.
        # The outer lists are sorted so the check does not depend on the
        # order in which clusters are yielded.
        assert sorted(clusters) == sorted(CLUSTERS)

    def test_multiple_key(self):
        """Cluster NAMES with a function passed as `keys` (5-grams per item)."""
        clusters = list(key_collision(NAMES, keys=lambda x: ngrams(5, x)))

        assert clusters == NAMES_CLUSTERS

    def test_multiple_keys_merged(self):
        """Cluster NAMES on 5-gram keys with `merge=True`."""
        clusters = list(key_collision(NAMES, keys=lambda x: ngrams(5, x), merge=True))

        # Normalize each cluster to a sorted tuple and collect them in a set
        # so the comparison ignores both member order and cluster order.
        clusters = set(tuple(sorted(c)) for c in clusters)

        assert clusters == MERGED_NAMES_CLUSTERS