2018-07-31 16:19:14 +00:00
|
|
|
# Little experiments testing the recall of the skeleton and omission keys
|
2018-07-31 16:20:26 +00:00
|
|
|
#
|
|
|
|
# Note that counting the number of clusters may be erroneous but with
|
|
|
|
# a low Levenshtein distance, clusters are rarely very large and this is
|
|
|
|
# good enough.
|
|
|
|
#
|
2018-07-31 16:19:14 +00:00
|
|
|
import csv
|
|
|
|
from Levenshtein import distance as levenshtein
|
|
|
|
from fog.clustering import pairwise_connected_components, sorted_neighborhood
|
|
|
|
from fog.key import skeleton_key, omission_key
|
|
|
|
|
|
|
|
# Reference cluster count used as the recall denominator by every radius-1
# ("Lev1") experiment in this script.
# NOTE(review): presumably measured once with an exhaustive pairwise
# clustering run (cf. the commented-out pairwise_connected_components call
# further down) -- confirm how this number was obtained.
GROUND_TRUTH_LEV1 = 138
|
|
|
|
# Reference cluster count used as the recall denominator by every radius-2
# ("Lev2") experiment in this script.
# NOTE(review): presumably the result of the commented-out
# pairwise_connected_components(..., radius=2) call further down -- confirm.
GROUND_TRUTH_LEV2 = 627
|
|
|
|
|
|
|
|
# Load the distinct values of the `artist` column, sorted, so that every
# experiment below works on the same deterministic input list.
# The file is opened with newline='' as recommended by the csv module so that
# newlines embedded in quoted fields are parsed correctly.
with open('./data/musicians.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)

    # DictReader is lazy: the rows must be consumed while the file is still
    # open, hence the comprehension lives inside the `with` body.
    artists = sorted({line['artist'] for line in reader})
|
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Report the size of the input (number of entries in `artists`).
print('Artists: %i' % len(artists))
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# true_clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8))
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# print(len(true_clusters))
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Print both reference cluster counts, one per line, before the experiments
# so that the "Found" figures below can be compared against them.
print(
    'GroundTruth-Lev1: %i' % GROUND_TRUTH_LEV1,
    'GroundTruth-Lev2: %i' % GROUND_TRUTH_LEV2,
    sep='\n',
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Skeleton key experiments: cluster the artists with sorted_neighborhood
# using `skeleton_key` as the key, then report the number of clusters
# found and its ratio to the reference count (printed as "Recall").

# Levenshtein radius 1.
skeleton_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        key=skeleton_key,
    )
)
skeleton_found = len(skeleton_clusters)
print(
    'Skeleton-Lev1: Found %i clusters (Recall: %f)'
    % (skeleton_found, skeleton_found / GROUND_TRUTH_LEV1)
)

# Levenshtein radius 2.
skeleton_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        key=skeleton_key,
    )
)
skeleton_found = len(skeleton_clusters)
print(
    'Skeleton-Lev2: Found %i clusters (Recall: %f)'
    % (skeleton_found, skeleton_found / GROUND_TRUTH_LEV2)
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Omission key experiments: same protocol as the other runs, but the
# artists are keyed with `omission_key`. The printed "Recall" is the number
# of clusters found divided by the reference count for that radius.

# Levenshtein radius 1.
omission_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        key=omission_key,
    )
)
omission_found = len(omission_clusters)
print(
    'Omission-Lev1: Found %i clusters (Recall: %f)'
    % (omission_found, omission_found / GROUND_TRUTH_LEV1)
)

# Levenshtein radius 2.
omission_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        key=omission_key,
    )
)
omission_found = len(omission_clusters)
print(
    'Omission-Lev2: Found %i clusters (Recall: %f)'
    % (omission_found, omission_found / GROUND_TRUTH_LEV2)
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Compound experiments: both fingerprint keys are handed to
# sorted_neighborhood at once through `keys=`.
# NOTE(review): presumably one sorting pass is made per key and the results
# are merged -- confirm against the fog documentation.

# Levenshtein radius 1.
compound_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        keys=(omission_key, skeleton_key),
    )
)
compound_found = len(compound_clusters)
print(
    'Compound-Lev1: Found %i clusters (Recall: %f)'
    % (compound_found, compound_found / GROUND_TRUTH_LEV1)
)

# Levenshtein radius 2.
compound_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        keys=(omission_key, skeleton_key),
    )
)
compound_found = len(compound_clusters)
print(
    'Compound-Lev2: Found %i clusters (Recall: %f)'
    % (compound_found, compound_found / GROUND_TRUTH_LEV2)
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Lexicographic experiments: no key function is supplied (key=None).
# NOTE(review): as the label suggests, this presumably orders the raw
# strings themselves -- confirm against the fog documentation.

# Levenshtein radius 1.
lexicographic_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        key=None,
    )
)
lexicographic_found = len(lexicographic_clusters)
print(
    'Lexicographic-Lev1: Found %i clusters (Recall: %f)'
    % (lexicographic_found, lexicographic_found / GROUND_TRUTH_LEV1)
)

# Levenshtein radius 2.
lexicographic_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        key=None,
    )
)
lexicographic_found = len(lexicographic_clusters)
print(
    'Lexicographic-Lev2: Found %i clusters (Recall: %f)'
    % (lexicographic_found, lexicographic_found / GROUND_TRUTH_LEV2)
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Reverse-lexicographic experiments: each artist name is keyed by its
# character-reversed form (name[::-1]).

# Levenshtein radius 1.
reverse_lexicographic_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        key=lambda name: name[::-1],
    )
)
reverse_lexicographic_found = len(reverse_lexicographic_clusters)
print(
    'ReverseLexicographic-Lev1: Found %i clusters (Recall: %f)'
    % (
        reverse_lexicographic_found,
        reverse_lexicographic_found / GROUND_TRUTH_LEV1,
    )
)

# Levenshtein radius 2.
reverse_lexicographic_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        key=lambda name: name[::-1],
    )
)
reverse_lexicographic_found = len(reverse_lexicographic_clusters)
print(
    'ReverseLexicographic-Lev2: Found %i clusters (Recall: %f)'
    % (
        reverse_lexicographic_found,
        reverse_lexicographic_found / GROUND_TRUTH_LEV2,
    )
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# Compound lexicographic experiments: the plain ordering (None) and the
# character-reversed ordering are passed together through `keys=`.

# Levenshtein radius 1.
compound_lexicographic_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        keys=(None, lambda name: name[::-1]),
    )
)
compound_lexicographic_found = len(compound_lexicographic_clusters)
print(
    'CompoundLexicographic-Lev1: Found %i clusters (Recall: %f)'
    % (
        compound_lexicographic_found,
        compound_lexicographic_found / GROUND_TRUTH_LEV1,
    )
)

# Levenshtein radius 2.
compound_lexicographic_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        keys=(None, lambda name: name[::-1]),
    )
)
compound_lexicographic_found = len(compound_lexicographic_clusters)
print(
    'CompoundLexicographic-Lev2: Found %i clusters (Recall: %f)'
    % (
        compound_lexicographic_found,
        compound_lexicographic_found / GROUND_TRUTH_LEV2,
    )
)
|
2018-07-31 16:19:14 +00:00
|
|
|
|
2018-07-31 16:21:51 +00:00
|
|
|
# "Mega" experiments: all four orderings used by this script (plain,
# character-reversed, omission key, skeleton key) are passed together
# through `keys=`.
#
# Fix: the radius-1 run used to pass only (None, reversed), which made it an
# exact duplicate of the CompoundLexicographic-Lev1 experiment. It now uses
# the same four keys as the radius-2 run, so both "Mega" figures measure the
# same configuration.

# Levenshtein radius 1.
mega_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=1,
        keys=(None, lambda x: x[::-1], omission_key, skeleton_key),
    )
)
print(
    'Mega-Lev1: Found %i clusters (Recall: %f)'
    % (len(mega_clusters), len(mega_clusters) / GROUND_TRUTH_LEV1)
)

# Levenshtein radius 2.
mega_clusters = list(
    sorted_neighborhood(
        artists,
        distance=levenshtein,
        radius=2,
        keys=(None, lambda x: x[::-1], omission_key, skeleton_key),
    )
)
print(
    'Mega-Lev2: Found %i clusters (Recall: %f)'
    % (len(mega_clusters), len(mega_clusters) / GROUND_TRUTH_LEV2)
)
|