fog/test/clustering/pairwise_test.py

85 lines
2.4 KiB
Python
Raw Normal View History

2018-06-08 17:07:02 +00:00
# =============================================================================
# Fog Pairwise Clustering Unit Tests
# =============================================================================
2018-06-11 15:00:59 +00:00
from test.clustering.utils import Clusters
2018-06-08 17:07:02 +00:00
from Levenshtein import distance as levenshtein
from fog.clustering import (
pairwise_leader,
pairwise_fuzzy_clusters,
pairwise_connected_components
)
# Shared fixture: a chain of 3-letter strings in which each entry overlaps
# the next one by two characters (Levenshtein distance of 2 between
# neighbours, 3 or more otherwise).
DATA = [
    'abc',
    'bcd',
    'cde',
    'def',
    'efg',
    'fgh',
    'ghi',
]
2018-06-11 15:00:59 +00:00
# Clusters expected from `pairwise_leader` over DATA with a radius of 2.
# NOTE(review): 'ghi' belongs to no expected cluster -- presumably singleton
# clusters are not reported; confirm against the implementation.
LEADER_CLUSTERS = Clusters([
    ['abc', 'bcd'],
    ['cde', 'def'],
    ['efg', 'fgh'],
])
2018-06-08 17:07:02 +00:00
2018-06-11 15:00:59 +00:00
# Clusters expected from `pairwise_fuzzy_clusters` over DATA with a radius
# of 2. A string may appear in several clusters (e.g. 'bcd' and 'def').
FUZZY_CLUSTERS = Clusters([
    ['abc', 'bcd'],
    ['cde', 'bcd', 'def'],
    ['efg', 'def', 'fgh'],
    ['ghi', 'fgh'],
])
2018-06-08 17:07:02 +00:00
2018-06-11 15:00:59 +00:00
# Clusters expected from `pairwise_fuzzy_clusters` over DATA with a radius
# of 2 when `min_size=3` is also passed.
MIN_FUZZY_CLUSTERS = Clusters([
    ['bcd', 'abc', 'cde'],
    ['def', 'cde', 'efg'],
    ['fgh', 'efg', 'ghi'],
])
2018-06-08 17:07:02 +00:00
class TestPairwiseClustering(object):
    """Unit tests for the pairwise clustering functions of `fog.clustering`.

    Every test clusters the module-level DATA fixture using the Levenshtein
    distance and a radius of 2, then compares the result, wrapped in
    `Clusters`, against the expected fixtures.
    """

    def test_pairwise_leader(self):
        """`pairwise_leader` over DATA must produce LEADER_CLUSTERS."""
        found = pairwise_leader(DATA, distance=levenshtein, radius=2)
        clusters = Clusters(found)

        assert clusters == LEADER_CLUSTERS

    def test_pairwise_fuzzy_clusters(self):
        """`pairwise_fuzzy_clusters` in its plain, `min_size`, parallel and
        keyed variants."""

        # Keyword arguments shared by every call below
        params = dict(distance=levenshtein, radius=2)

        clusters = Clusters(pairwise_fuzzy_clusters(DATA, **params))
        assert clusters == FUZZY_CLUSTERS

        # With a minimum cluster size
        min_clusters = Clusters(
            pairwise_fuzzy_clusters(DATA, min_size=3, **params)
        )
        assert min_clusters == MIN_FUZZY_CLUSTERS

        # Parallelized
        parallel_clusters = Clusters(
            pairwise_fuzzy_clusters(DATA, processes=2, chunk_size=3, **params)
        )
        assert parallel_clusters == FUZZY_CLUSTERS

        # Using custom keys: items are (weight, string) pairs and the string
        # is extracted again before comparing with the expected clusters
        keyed_data = [(1.0, string) for string in DATA]
        keyed_result = pairwise_fuzzy_clusters(
            keyed_data,
            key=lambda pair: pair[1],
            **params
        )
        clusters = Clusters(
            [pair[1] for pair in cluster] for cluster in keyed_result
        )
        assert clusters == FUZZY_CLUSTERS

    def test_pairwise_connected_components(self):
        """`pairwise_connected_components` must put all of DATA in a single
        component, in its plain, parallel and keyed variants."""

        # Keyword arguments shared by every call below
        params = dict(distance=levenshtein, radius=2)

        clusters = Clusters(pairwise_connected_components(DATA, **params))
        assert clusters == Clusters([DATA])

        # Parallelized
        parallel_clusters = Clusters(
            pairwise_connected_components(
                DATA,
                processes=2,
                chunk_size=3,
                **params
            )
        )
        assert parallel_clusters == Clusters([DATA])

        # Using custom keys: the expected single component holds the keyed
        # items themselves
        keyed_data = [(1.0, string) for string in DATA]
        clusters = Clusters(
            pairwise_connected_components(
                keyed_data,
                key=lambda pair: pair[1],
                **params
            )
        )
        assert clusters == Clusters([keyed_data])