2018-06-21 16:32:30 +00:00
import csv
from fog.lsh import simhash, simhash_similarity
2018-06-21 16:43:15 +00:00
from fog.metrics import cosine_similarity
2018-06-21 16:32:30 +00:00
from fog.tokenizers import ngrams
from collections import defaultdict, Counter
# Load the distinct values of the `artist` column from the musicians CSV.
# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for file objects passed to a reader.
with open('./data/musicians.csv', 'r', newline='') as f:
    artists = {line['artist'] for line in csv.DictReader(f)}
# (band offset, band bits) -> artists whose hash carries those bits at that offset.
buckets = defaultdict(list)

# graph[a][b] counts the number of bands in which a and b share a bucket.
graph = defaultdict(Counter)


def sh(x):
    """Return the simhash of the 5-grams of `x`."""
    return simhash(ngrams(5, x))


# Banding parameters; they do not depend on the artist, so compute them once.
radius = 0.8
# k = floor((1.0 - radius) * 128)
k = 25

# TODO: find equation to optimize r & m
# TODO: what if k does not divide 128
# TODO: try a 64 bits version also with f parameter
r = 16
m = 128 // r

for artist in artists:
    # Zero-pad to the full 128 bits: bin(...)[2:] drops leading zeros, which
    # would shift every band and leave the trailing ones short or empty.
    # NOTE(review): assumes simhash returns a non-negative integer of at
    # most 128 bits, as the 128-bit banding below implies - confirm.
    h = format(sh(artist), '0128b')

    for i in range(0, 128, r):
        key = (i, h[i:i + r])
        bucket = buckets[key]

        # Every artist already in this bucket shares this band with us.
        for neighbor in bucket:
            graph[artist][neighbor] += 1
            graph[neighbor][artist] += 1

        # Register the artist so that later ones sharing this band find it.
        bucket.append(artist)
# Walk the co-occurrence graph, verify candidate pairs with an exact cosine
# similarity on 5-grams, and print every cluster holding more than one name.
candidates = 0
visited = set()

for artist, neighbors in graph.items():
    # Artists already absorbed into an earlier cluster do not seed a new one.
    if artist in visited:
        continue

    visited.add(artist)
    cluster = [artist]

    for neighbor, count in neighbors.items():
        # NOTE: with the values assigned earlier in this script (m = 128 // 16
        # = 8, k = 25) the threshold m - k is negative, so every pair that
        # shares at least one band is treated as a candidate.
        if count > m - k:
            candidates += 1

            # Only pairs that pass the exact similarity check are clustered.
            if cosine_similarity(ngrams(5, artist), ngrams(5, neighbor)) >= radius:
                cluster.append(neighbor)
                visited.add(neighbor)

    if len(cluster) > 1:
        print(cluster)

# Compare the number of verified candidates against all n * (n - 1) / 2 pairs;
# integer division keeps the count exact.
print('Candidates', candidates, '/', len(artists) * (len(artists) - 1) // 2)