MinHash experiments

This commit is contained in:
Yomguithereal 2018-06-19 18:25:52 +02:00
parent 12fde300cf
commit 3112ecdfc3
2 changed files with 34 additions and 8 deletions

View File

@ -55,7 +55,7 @@ with open('./data/universities.csv', 'r') as f:
key = lambda x: list(ngrams(3, x))
clusters = list(minhash(universities, precision=8, radius=0.8, key=key))
clusters = list(minhash(universities, h=256, radius=0.8, key=key))
for cluster in clusters:
print(cluster)

View File

@ -11,7 +11,7 @@ from collections import defaultdict
import math
from fog.clustering.utils import merge_buckets_into_clusters
from fog.lsh.minhash import LSBMinHash
from fog.lsh.minhash import LSBMinHash, MinHash
# TODO: optimize probability iteratively to find number of bands
@ -29,9 +29,7 @@ def similarity_threshold(h, bands):
return (1.0 / bands) ** (1 / (h / bands))
def guess_bands(precision, radius, probability):
h = precision * 64
def guess_bands(h, radius, probability):
bands = 1
while bands <= h:
@ -55,12 +53,40 @@ def guess_bands(precision, radius, probability):
# TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm
def minhash(data, precision=4, key=None, radius=0.8, probability=0.9):
def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
    """
    Cluster the given items using MinHash signatures and LSH banding.

    Each item's signature is cut into bands of `rows` consecutive values.
    Items sharing an identical band (at the same offset) land in the same
    bucket, and the buckets are finally merged by
    `merge_buckets_into_clusters` in 'connected_components' mode.

    Args:
        data (iterable): Items to cluster.
        h (int): Size handed to `MinHash`; also assumed to be the length of
            the signatures it produces when slicing bands. Defaults to 256.
        key (callable, optional): Function applied to each item to obtain
            the value that is actually hashed. The original item, not the
            key, is what gets stored in the buckets. Defaults to None.
        radius (float): Forwarded to `guess_bands`; presumably the target
            similarity -- confirm against `guess_bands`. Defaults to 0.8.
        probability (float): Forwarded to `guess_bands`; presumably the
            wanted match probability at `radius` -- confirm against
            `guess_bands`. Defaults to 0.9.

    Yields:
        Whatever `merge_buckets_into_clusters` yields for the buckets.

    """
    import logging

    logger = logging.getLogger(__name__)

    bands = guess_bands(h, radius, probability)

    # A zero step would make the range() below raise; this can only happen
    # if `guess_bands` hands back more bands than `h`.
    rows = max(1, h // bands)

    # Diagnostics are kept, but sent to the logger instead of stdout so that
    # consuming this generator does not pollute the caller's output.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('bands: %s', bands)
        logger.debug(
            'match probability: %s',
            match_probability(h, bands, radius)
        )
        logger.debug(
            'similarity threshold: %s',
            similarity_threshold(h, bands)
        )

    mh = MinHash(h)

    buckets = defaultdict(list)

    for item in data:
        k = item

        if key is not None:
            k = key(item)

        signature = mh.create_signature(k)

        # Only full bands are used: when `rows` does not divide `h`, the
        # leftover values would form a shorter band that matches far too
        # easily and glues unrelated items together.
        for band in range(0, h - rows + 1, rows):
            band_key = (
                band,
                '%'.join(str(n) for n in signature[band:band + rows])
            )
            buckets[band_key].append(item)

    yield from merge_buckets_into_clusters(
        buckets.values(),
        mode='connected_components'
    )
def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):
h = precision * 64
# NOTE: it seems we need to divide the bands by 2 because of LSB
bands = max(1, guess_bands(precision, radius, probability) // 2)
bands = max(1, guess_bands(h, radius, probability) // 2)
rows = h // bands
mh = LSBMinHash(precision=precision)
@ -73,7 +99,7 @@ def minhash(data, precision=4, key=None, radius=0.8, probability=0.9):
if key is not None:
k = key(item)
signature = mh.hash(k)
signature = mh.create_signature(k)
binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])