# =============================================================================
# Fog Key Collision Clustering Unit Tests
# =============================================================================
from test.clustering.utils import Clusters
from Levenshtein import distance as levenshtein
from fog.clustering import key_collision
from fog.tokenizers import ngrams

# Case variants of two words; input for the single-key test, which keys each
# string by its lowercased form.
DATA = [
    'Hello',
    'hello',
    'heLLo',
    'gooDbye',
    'Goodbye'
]

# Expected grouping of DATA when items collide on their lowercased form.
CLUSTERS = Clusters([
    ['Hello', 'hello', 'heLLo'],
    ['gooDbye', 'Goodbye']
])

# Person names in full and abbreviated forms; input for the multi-key tests,
# which key each name by `ngrams(5, name)`.
NAMES = [
    'John Doe',
    'John Doe Jr.',
    'Mary S.',
    'Mary Silva',
    'John D.'
]

# Expected result for NAMES with `merge=False`. Clusters are intentionally
# repeated: presumably one cluster is produced per shared character 5-gram
# (e.g. 'John ' and 'ohn D' are common to all three John variants, 'hn Do'
# and 'n Doe' only to the first two, 'Mary ' and 'ary S' to both Marys).
# NOTE(review): confirm `ngrams(5, x)` yields character 5-grams of the string.
NAMES_CLUSTERS = Clusters([
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.', 'John D.'],
    ['John Doe', 'John Doe Jr.'],
    ['John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva'],
    ['Mary S.', 'Mary Silva']
])

# Expected result for NAMES when `merge` is left at its default: the
# repeated/overlapping clusters above appear as one cluster per person.
MERGED_NAMES_CLUSTERS = Clusters([
    ['John D.', 'John Doe', 'John Doe Jr.'],
    ['Mary S.', 'Mary Silva']
])


class TestKeyCollisionClustering(object):
    """Compare `key_collision` output with the module-level expected clusters.

    Both sides of every comparison are wrapped in `Clusters` before being
    checked for equality.
    """

    def test_single_key(self):
        """A single `key` callable: strings are keyed by their lowercase form."""
        found = key_collision(DATA, key=lambda item: item.lower())

        assert Clusters(found) == CLUSTERS

    def test_multiple_key(self):
        """A `keys` callable returning 5-grams, called with `merge=False`."""
        def five_grams(name):
            return ngrams(5, name)

        found = key_collision(NAMES, keys=five_grams, merge=False)

        assert Clusters(found) == NAMES_CLUSTERS

    def test_multiple_keys_merged(self):
        """Same 5-gram `keys` callable, leaving `merge` at its default."""
        def five_grams(name):
            return ngrams(5, name)

        found = key_collision(NAMES, keys=five_grams)

        assert Clusters(found) == MERGED_NAMES_CLUSTERS