# Mirror of https://github.com/Yomguithereal/fog.git (54 lines, 1.5 KiB, Python)
import csv
|
|
from experiments.utils import Timer
|
|
from fog.clustering import minhash, jaccard_intersection_index
|
|
from fog.tokenizers import ngrams
|
|
|
|
# Number of reference clusters the benchmark results are compared against.
# It matches the 15 groups of near-duplicate artist names listed below
# (presumably the clusters expected from ./data/musicians.csv -- confirm
# against the dataset if it changes).
GROUND_TRUTH = 15

# ['Luna (singer)', 'Yuna (singer)']
# ['Lobo (musician)', 'Robo (musician)']
# ['Sam Jones (musician)', 'Adam Jones (musician)']
# ['Donnie Brooks', 'Lonnie Brooks']
# ['Dan Wilson (musician)', 'Alan Wilson (musician)', 'Ian Wilson (musician)']
# ['Arsenie Todiraş', 'Arsenie Todiraș']
# ['Tim Ward (musician)', 'Jim Ward (musician)']
# ['KK (singer)', 'K (singer)']
# ['Wando (singer)', 'Mando (singer)']
# ['Phillip Phillips', 'Flip Phillips']
# ['Mina (singer)', 'Dina (singer)']
# ['Nana (singer)', 'Bana (singer)', 'Jana (singer)']
# ['John Paris', 'John Parish']
# ['Ronnie Van Zant', 'Donnie Van Zant']
# ['Steve Nardelli', 'Steve Nardella']
|
|
|
|
# Load the distinct artist names from the "artist" column of the dataset.
# The encoding is pinned because the names include non-ASCII characters
# (e.g. 'Arsenie Todiraş'), so the platform default encoding could mis-decode
# them or raise -- assumes the CSV is stored as UTF-8, TODO confirm.
# newline='' lets the csv module handle line endings itself, as its
# documentation recommends.
with open('./data/musicians.csv', 'r', encoding='utf-8', newline='') as f:
    artists = {row['artist'] for row in csv.DictReader(f)}

print('Artists:', len(artists))
|
|
|
|
def key(x):
    """Return the list of 5-grams of *x*, used as the clustering key."""
    return list(ngrams(5, x))


# Value passed as the `radius` argument of both clustering functions.
radius = 0.8
|
|
|
|
def _run_benchmark(label, cluster_fn, **options):
    """Run one clustering function over the artists and print a report.

    Args:
        label: Heading printed before the run.
        cluster_fn: Clustering function called as
            ``cluster_fn(artists, key=key, radius=radius, **options)``.
        **options: Extra keyword arguments forwarded to ``cluster_fn``.

    Returns:
        The list of clusters produced by ``cluster_fn``.
    """
    print()
    print(label)

    # Only the clustering itself runs inside the Timer block; the result is
    # materialized there so a lazy iterable is fully consumed while timed.
    with Timer():
        found = list(cluster_fn(artists, key=key, radius=radius, **options))

    for cluster in found:
        print(cluster)

    print('Clusters:', len(found))
    # NOTE(review): this is the ratio of found clusters to GROUND_TRUTH, not
    # a precision in the strict sense (it can exceed 1) -- label kept as is.
    print('Precision:', len(found) / GROUND_TRUTH)

    return found


clusters = _run_benchmark('Minhash', minhash, use_numpy=True)
clusters = _run_benchmark('Jaccard Intersection Index', jaccard_intersection_index)
|
|
|