fog/experiments/skeleton_omission.py

84 lines
4.4 KiB
Python
Raw Permalink Normal View History

# Little experiments testing the recall of the skeleton and omission keys
#
# Note that counting the number of clusters may be erroneous, but with
# a low Levenshtein distance clusters are rarely very large, so this is
# good enough.
#
import csv
from Levenshtein import distance as levenshtein
from fog.clustering import pairwise_connected_components, sorted_neighborhood
from fog.key import skeleton_key, omission_key
# Reference cluster counts used as the recall denominator, for Levenshtein
# radius 1 and radius 2 respectively.
# NOTE(review): presumably measured once with the exhaustive
# `pairwise_connected_components` run kept commented-out below -- confirm.
GROUND_TRUTH_LEV1 = 138
GROUND_TRUTH_LEV2 = 627

# Same values, indexed by radius, so the experiment loop can look them up.
GROUND_TRUTH_BY_RADIUS = {1: GROUND_TRUTH_LEV1, 2: GROUND_TRUTH_LEV2}


def reverse_string(string):
    """Return `string` reversed (key of the reverse lexicographic runs)."""
    return string[::-1]


# (label, keyword arguments selecting the key(s) given to sorted_neighborhood).
# Order matters: it is the order in which the results are printed.
EXPERIMENTS = (
    ('Skeleton', {'key': skeleton_key}),
    ('Omission', {'key': omission_key}),
    ('Compound', {'keys': (omission_key, skeleton_key)}),
    ('Lexicographic', {'key': None}),
    ('ReverseLexicographic', {'key': reverse_string}),
    ('CompoundLexicographic', {'keys': (None, reverse_string)}),
    # The radius-1 run used to pass only (None, reverse) and was therefore a
    # duplicate of CompoundLexicographic-Lev1; both radii now use all four keys.
    ('Mega', {'keys': (None, reverse_string, omission_key, skeleton_key)}),
)


def run_experiment(label, radius, key_options, items):
    """Cluster `items` with sorted_neighborhood and print the result.

    Args:
        label: Experiment name used as prefix of the printed line.
        radius: Levenshtein radius given to sorted_neighborhood; must be a
            key of GROUND_TRUTH_BY_RADIUS.
        key_options: Dict holding either a `key` or a `keys` entry, forwarded
            as keyword arguments to sorted_neighborhood.
        items: The strings to cluster.

    The printed "recall" is the number of clusters found divided by the
    reference cluster count for that radius, i.e. a count ratio and not a
    pair-level recall.
    """
    clusters = list(sorted_neighborhood(
        items,
        distance=levenshtein,
        radius=radius,
        **key_options
    ))
    found = len(clusters)

    print('%s-Lev%i: Found %i clusters (Recall: %f)' % (
        label,
        radius,
        found,
        found / GROUND_TRUTH_BY_RADIUS[radius]
    ))


# The rows must be consumed while the file is still open, hence the
# deduplication and sort happen inside the `with` body.
with open('./data/musicians.csv', 'r') as f:
    reader = csv.DictReader(f)
    artists = sorted({row['artist'] for row in reader})

print('Artists: %i' % len(artists))

# Exhaustive pairwise clustering, kept for reference but disabled:
# true_clusters = list(pairwise_connected_components(artists, distance=levenshtein, radius=2, processes=8))
# print(len(true_clusters))

print('GroundTruth-Lev1: %i' % GROUND_TRUTH_LEV1)
print('GroundTruth-Lev2: %i' % GROUND_TRUTH_LEV2)

# Each experiment is reported for radius 1, then radius 2.
for experiment_label, experiment_options in EXPERIMENTS:
    for lev_radius in (1, 2):
        run_experiment(experiment_label, lev_radius, experiment_options, artists)