2018-06-08 17:07:02 +00:00
|
|
|
# =============================================================================
# Fog Pairwise Clustering Unit Tests
# =============================================================================
|
2018-06-11 15:00:59 +00:00
|
|
|
from test.clustering.utils import Clusters
|
2018-06-08 17:07:02 +00:00
|
|
|
from Levenshtein import distance as levenshtein
|
|
|
|
from fog.clustering import (
|
|
|
|
pairwise_leader,
|
|
|
|
pairwise_fuzzy_clusters,
|
|
|
|
pairwise_connected_components
|
|
|
|
)
|
|
|
|
|
|
|
|
# Shared fixture for every test in this module: seven three-letter strings
# where each entry is the previous one shifted by one letter, so consecutive
# entries are at Levenshtein distance 2 from each other.
DATA = [
    'abc',
    'bcd',
    'cde',
    'def',
    'efg',
    'fgh',
    'ghi',
]
|
|
|
|
|
2018-06-11 15:00:59 +00:00
|
|
|
# Expected result of `pairwise_leader(DATA, distance=levenshtein, radius=2)`,
# wrapped in the `Clusters` test helper so it can be compared with `==`.
# Note that 'ghi' does not appear in any expected cluster.
LEADER_CLUSTERS = Clusters([
    ['abc', 'bcd'],
    ['cde', 'def'],
    ['efg', 'fgh'],
])
|
2018-06-08 17:07:02 +00:00
|
|
|
|
2018-06-11 15:00:59 +00:00
|
|
|
# Expected result of `pairwise_fuzzy_clusters(DATA, distance=levenshtein,
# radius=2)` with no `min_size` given. Clusters overlap: for instance 'bcd'
# belongs to both the first and the second cluster.
FUZZY_CLUSTERS = Clusters([
    ['abc', 'bcd'],
    ['cde', 'bcd', 'def'],
    ['efg', 'def', 'fgh'],
    ['ghi', 'fgh'],
])
|
2018-06-08 17:07:02 +00:00
|
|
|
|
2018-06-11 15:00:59 +00:00
|
|
|
# Expected result of `pairwise_fuzzy_clusters(DATA, distance=levenshtein,
# radius=2, min_size=3)`: every expected cluster holds exactly three items.
MIN_FUZZY_CLUSTERS = Clusters([
    ['bcd', 'abc', 'cde'],
    ['def', 'cde', 'efg'],
    ['fgh', 'efg', 'ghi'],
])
|
2018-06-08 17:07:02 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TestPairwiseClustering(object):
    """Unit tests for the pairwise clustering functions of `fog.clustering`.

    Every test clusters the module-level `DATA` fixture using the Levenshtein
    distance with `radius=2` and compares the outcome, wrapped in the
    `Clusters` test helper, against an expected set of clusters.
    """

    def test_pairwise_leader(self):
        """`pairwise_leader` over DATA must yield LEADER_CLUSTERS."""
        clusters = Clusters(
            pairwise_leader(DATA, distance=levenshtein, radius=2)
        )

        assert clusters == LEADER_CLUSTERS

    def test_pairwise_fuzzy_clusters(self):
        """Check `pairwise_fuzzy_clusters` in four configurations.

        Covered: the plain call, a call with `min_size=3`, a call with
        `processes=2, chunk_size=3`, and a call over tuples using `key`.
        """
        clusters = Clusters(
            pairwise_fuzzy_clusters(DATA, distance=levenshtein, radius=2)
        )

        assert clusters == FUZZY_CLUSTERS

        # Minimum cluster size
        min_clusters = Clusters(
            pairwise_fuzzy_clusters(
                DATA,
                distance=levenshtein,
                radius=2,
                min_size=3
            )
        )

        assert min_clusters == MIN_FUZZY_CLUSTERS

        # Parallelized: expected to give the same clusters as the plain call
        parallel_clusters = Clusters(
            pairwise_fuzzy_clusters(
                DATA,
                distance=levenshtein,
                radius=2,
                processes=2,
                chunk_size=3
            )
        )

        assert parallel_clusters == FUZZY_CLUSTERS

        # Using custom keys: items are (1.0, string) tuples and `key` selects
        # the string; the strings are extracted again before comparing.
        keyed_data = [(1.0, d) for d in DATA]

        keyed_clusters = Clusters(
            [item[1] for item in cluster]
            for cluster in pairwise_fuzzy_clusters(
                keyed_data,
                distance=levenshtein,
                radius=2,
                key=lambda x: x[1]
            )
        )

        assert keyed_clusters == FUZZY_CLUSTERS

    def test_pairwise_connected_components(self):
        """Check `pairwise_connected_components` in three configurations.

        In each case the expected outcome is a single cluster containing
        every input item.
        """
        clusters = Clusters(
            pairwise_connected_components(
                DATA,
                distance=levenshtein,
                radius=2
            )
        )

        assert clusters == Clusters([DATA])

        # Parallelized
        parallel_clusters = Clusters(
            pairwise_connected_components(
                DATA,
                distance=levenshtein,
                radius=2,
                processes=2,
                chunk_size=3
            )
        )

        assert parallel_clusters == Clusters([DATA])

        # Using custom keys: here the clustered items stay as tuples, so the
        # expectation is built from `keyed_data` itself.
        keyed_data = [(1.0, d) for d in DATA]

        keyed_clusters = Clusters(
            pairwise_connected_components(
                keyed_data,
                distance=levenshtein,
                radius=2,
                key=lambda x: x[1]
            )
        )

        assert keyed_clusters == Clusters([keyed_data])
|