mirror of https://github.com/Yomguithereal/fog.git

MinHash experiments

parent 12fde300cf
commit 3112ecdfc3
@@ -55,7 +55,7 @@ with open('./data/universities.csv', 'r') as f:
 key = lambda x: list(ngrams(3, x))

-clusters = list(minhash(universities, precision=8, radius=0.8, key=key))
+clusters = list(minhash(universities, h=256, radius=0.8, key=key))

 for cluster in clusters:
     print(cluster)
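Note: the call above keys each university name by its 3-grams before hashing. Assuming fog's ngrams(3, x) yields the character trigrams of the string (the tokenizer itself is not part of this diff), the key amounts to the following sketch, where char_ngrams is a hypothetical stand-in:

def char_ngrams(n, string):
    # Hypothetical stand-in for fog's ngrams helper: a sliding window of
    # n consecutive characters over the string.
    return [string[i:i + n] for i in range(len(string) - n + 1)]

key = lambda x: char_ngrams(3, x)

print(key('MIT'))          # ['MIT']
print(key('Harvard')[:3])  # ['Har', 'arv', 'rva']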
@@ -11,7 +11,7 @@ from collections import defaultdict
 import math

 from fog.clustering.utils import merge_buckets_into_clusters
-from fog.lsh.minhash import LSBMinHash
+from fog.lsh.minhash import LSBMinHash, MinHash


 # TODO: optimize probability iteratively to find number of bands
@@ -29,9 +29,7 @@ def similarity_threshold(h, bands):
     return (1.0 / bands) ** (1 / (h / bands))


-def guess_bands(precision, radius, probability):
-    h = precision * 64
-
+def guess_bands(h, radius, probability):
     bands = 1

     while bands <= h:
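Note: the pieces visible here and in the next hunk (bands = 1, while bands <= h, the similarity_threshold formula, and the match_probability(h, bands, radius) call) suggest guess_bands simply grows the band count until pairs at the target radius collide with the requested probability. A minimal sketch under that assumption, using the textbook LSH estimate 1 - (1 - s^r)^b for match_probability rather than fog's actual code:

def match_probability(h, bands, similarity):
    # Textbook LSH estimate: with b bands of r = h / b rows each, two items of
    # Jaccard similarity s share at least one band with probability 1 - (1 - s^r)^b.
    rows = h / bands
    return 1.0 - (1.0 - similarity ** rows) ** bands


def guess_bands(h, radius, probability):
    # Grow the number of bands until items at the target similarity (radius)
    # become candidates with at least the requested probability.
    bands = 1

    while bands <= h:
        if match_probability(h, bands, radius) >= probability:
            break
        bands += 1

    return bands


# With this reconstruction, guess_bands(256, 0.8, 0.9) returns 24: 24 bands of
# ~10 rows put the similarity threshold near 0.74, below the 0.8 radius.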
@@ -55,12 +53,40 @@ def guess_bands(precision, radius, probability):
 # TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm


-def minhash(data, precision=4, key=None, radius=0.8, probability=0.9):
+def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
+
+    bands = guess_bands(h, radius, probability)
+    rows = h // bands
+
+    print(bands)
+    print(match_probability(h, bands, radius))
+    print(similarity_threshold(h, bands))
+
+    mh = MinHash(h)
+
+    buckets = defaultdict(list)
+
+    for item in data:
+        k = item
+
+        if key is not None:
+            k = key(item)
+
+        signature = mh.create_signature(k)
+
+        for band in range(0, h, rows):
+            band_key = (band, '%'.join(str(n) for n in signature[band:band + rows]))
+            buckets[band_key].append(item)
+
+    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+
+
+def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):

     h = precision * 64

     # NOTE: it seems we need to divide the bands by 2 because of LSB
-    bands = max(1, guess_bands(precision, radius, probability) // 2)
+    bands = max(1, guess_bands(h, radius, probability) // 2)
     rows = h // bands

     mh = LSBMinHash(precision=precision)
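Note: the new minhash() above delegates the signatures to fog's MinHash class, whose internals are not shown in this diff. As a rough, self-contained illustration of what the banding loop is doing, here is a classic MinHash-plus-LSH-banding sketch over string trigrams; the random affine permutations and crc32 token hashing are textbook stand-ins, not fog's implementation:

import random
import zlib
from collections import defaultdict

_PRIME = (1 << 61) - 1  # large Mersenne prime for the affine permutations


def make_signature(tokens, h=256, seed=42):
    # Classic MinHash: for each of h random affine permutations, keep the
    # minimum permuted value over the item's token set.
    rng = random.Random(seed)
    coeffs = [(rng.randrange(1, _PRIME), rng.randrange(0, _PRIME)) for _ in range(h)]
    hashed = [zlib.crc32(t.encode()) for t in set(tokens)]
    return [min((a * x + b) % _PRIME for x in hashed) for a, b in coeffs]


def band_keys(signature, bands=24):
    # LSH banding: split the signature into chunks of `rows` values; items that
    # agree on any whole chunk end up under the same bucket key.
    rows = len(signature) // bands
    for band in range(0, len(signature), rows):
        yield (band, tuple(signature[band:band + rows]))


buckets = defaultdict(list)

for name in ('university of lyon', 'university of lyons', 'mit'):
    trigrams = [name[i:i + 3] for i in range(len(name) - 2)]
    for key in band_keys(make_signature(trigrams)):
        buckets[key].append(name)

# Buckets holding more than one name are candidate pairs; the real function
# then merges such buckets into clusters via connected components.
for items in buckets.values():
    if len(items) > 1:
        print(items)
        break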
@@ -73,7 +99,7 @@ def minhash(data, precision=4, key=None, radius=0.8, probability=0.9):
         if key is not None:
             k = key(item)

-        signature = mh.hash(k)
+        signature = mh.create_signature(k)

         binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])

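Note: in the LSB variant the signature appears to be a list of 64-bit integers (hence h = precision * 64), and the comprehension above flattens it into a single aligned bit string before banding. A small worked example of that conversion (the integers are arbitrary placeholders, not real LSBMinHash output):

signature = [5, 2 ** 63]  # two arbitrary 64-bit words

# bin(5) == '0b101'; stripping the '0b' prefix and left-padding to 64 characters
# keeps every word the same width, so the concatenation stays bit-aligned.
binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])

print(len(binary))  # 128
print(binary[:64])  # 61 zeros followed by '101'
print(binary[64:])  # '1' followed by 63 zeros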