Better minhash clustering

Author: Yomguithereal
Date:   2018-06-19 20:55:59 +02:00
Parent: 3112ecdfc3
Commit: bbc65cfc06

3 changed files with 47 additions and 59 deletions


@@ -53,12 +53,21 @@ with open('./data/universities.csv', 'r') as f:
     print('%i universities' % len(universities))
 
-    key = lambda x: list(ngrams(3, x))
+    key = lambda x: list(ngrams(5, x))
 
-    clusters = list(minhash(universities, h=256, radius=0.8, key=key))
+    RADIUS = 0.80
+    clusters = list(minhash(universities, h=240, threshold=RADIUS, key=key))
 
+    c = 0
     for cluster in clusters:
-        print(cluster)
+        j = jaccard_similarity(key(cluster[0]), key(cluster[1]))
+
+        if j >= RADIUS:
+            c += 1
+            print(cluster, j)
+
+    print('Count', c)
+
+    # for cluster in clusters:
+    #     print(cluster)
@@ -66,9 +75,11 @@ with open('./data/universities.csv', 'r') as f:
     # TODO: Compare found items, use ngrams also
     print(distinct_values(clusters))
 
-    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=0.8, key=key))
+    clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=RADIUS, key=key))
 
     print(distinct_values(clusters))
 
     for cluster in clusters:
         print(cluster)
+
+    print('Count', len(clusters))
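
For context, a minimal self-contained sketch of what the experiment's key function and validation step compute: character 5-grams and their Jaccard similarity. This is plain Python, independent of fog's ngrams helper, and the sample strings are invented.

def char_ngrams(n, string):
    # Distinct character n-grams of `string`.
    return {string[i:i + n] for i in range(len(string) - n + 1)}

def jaccard(a, b):
    # |A & B| / |A | B| over two n-gram sets.
    return len(a & b) / len(a | b)

A = char_ngrams(5, 'University of Oxford')      # 16 distinct 5-grams
B = char_ngrams(5, 'The University of Oxford')  # 20 distinct 5-grams

j = jaccard(A, B)   # 16 shared grams / 20 in the union = 0.8
print(j >= 0.80)    # True: this pair would pass the RADIUS check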

fog/clustering/minhash.py

@@ -12,14 +12,18 @@ import math
 from fog.clustering.utils import merge_buckets_into_clusters
 from fog.lsh.minhash import LSBMinHash, MinHash
+from fog.metrics.jaccard import jaccard_similarity
 
-# TODO: optimize probability iteratively to find the number of bands
-# NOTE: ideally, the number of rows should divide 64 evenly
-# TODO: else, also try to find precision
-# TODO: 1-band step option + sane iteration + experiments with threshold to see what it gives
-# TODO: parallelize
+# TODO:
+#   * Parallelize
+#   * use threshold to find bands (works better); similarity just brings more
+#   * possibility to hash the band key
+#   * note that we allow uneven bands for fine-grained results
+#   * double_check with minhash, jaccard, or even a sub-similarity
+#   * superminhash to generate signatures faster
+#   * cheap_hashes
 
 def match_probability(h, bands, similarity):
     return 1.0 - (1.0 - similarity ** (h / bands)) ** bands
@@ -29,38 +33,27 @@ def similarity_threshold(h, bands):
     return (1.0 / bands) ** (1 / (h / bands))
 
-def guess_bands(h, radius, probability):
+def guess_bands(h, threshold):
     bands = 1
 
     while bands <= h:
-        p = match_probability(h, bands, radius)
+        t = similarity_threshold(h, bands)
 
-        if p >= probability:
+        if t <= threshold:
             break
 
         bands += 1
 
-    while h % bands != 0:
-        bands += 1
-
     return bands
 
-# TODO: double_check with jaccard or minhash, sub-similarity or true radius
-# TODO: compute on 64 * precision to avoid modulo issues and filtering out
-# TODO: need to think in bands ^ not bands = precision
-# TODO: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4431368/
-# TODO: superminhash https://arxiv.org/pdf/1706.05698.pdf
-# TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm
 
-def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
-    bands = guess_bands(h, radius, probability)
+def minhash(data, h=256, key=None, threshold=0.8, bands=None):
+    if bands is None:
+        bands = guess_bands(h, threshold)
+
     rows = h // bands
-
-    print(bands)
-    print(match_probability(h, bands, radius))
-    print(similarity_threshold(h, bands))
+    h_upper_bound = bands * rows
 
     mh = MinHash(h)
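
As a sanity check on the new heuristic, here is a self-contained re-implementation of the two helpers (mirroring the diff above, not imported from fog) with the values the experiment's h=240, threshold=0.8 call would produce:

def similarity_threshold(h, bands):
    # Approximate similarity at which the LSH S-curve crosses 50%:
    # t = (1/b) ** (1/r), with r = h/b rows per band.
    return (1.0 / bands) ** (1 / (h / bands))

def guess_bands(h, threshold):
    # Smallest number of bands whose implied threshold drops below
    # the requested one.
    bands = 1
    while bands <= h:
        if similarity_threshold(h, bands) <= threshold:
            break
        bands += 1
    return bands

h = 240
bands = guess_bands(h, 0.8)   # -> 19 (18 bands still implies ~0.805)
rows = h // bands             # -> 12
h_upper_bound = bands * rows  # -> 228: with uneven bands, the last 12 hashes go unused
print(bands, rows, h_upper_bound)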
@@ -74,37 +67,18 @@ def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
         signature = mh.create_signature(k)
 
-        for band in range(0, h, rows):
+        for band in range(0, h_upper_bound, rows):
             band_key = (band, '%'.join(str(n) for n in signature[band:band + rows]))
             buckets[band_key].append(item)
 
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    def double_check(A, B):
+        if key is not None:
+            return jaccard_similarity(key(A), key(B)) >= threshold
+
+        return jaccard_similarity(A, B) >= threshold
+
+    yield from merge_buckets_into_clusters(
+        buckets.values(),
+        mode='connected_components',
+        similarity=double_check
+    )
 
-def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):
-    h = precision * 64
-
-    # NOTE: it seems we need to divide the bands by 2 because of LSB
-    bands = max(1, guess_bands(h, radius, probability) // 2)
-    rows = h // bands
-
-    mh = LSBMinHash(precision=precision)
-
-    buckets = defaultdict(list)
-
-    for item in data:
-        k = item
-
-        if key is not None:
-            k = key(item)
-
-        signature = mh.create_signature(k)
-        binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])
-
-        for band in range(0, h, rows):
-            band_key = (band, binary[band:band + rows])
-            buckets[band_key].append(item)
-
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
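
To see the candidate/validation split in action, here is a toy sketch of the scheme the new code implements: band keys propose candidate pairs cheaply, and a true-Jaccard double check discards false positives before clustering. The data and the salted-hash signature are invented stand-ins for fog's MinHash.

from collections import defaultdict
from itertools import combinations

def toy_signature(tokens, h):
    # One cheap salted hash per slot; min over the token set.
    return [min(hash((i, t)) for t in tokens) for i in range(h)]

def jaccard(a, b):
    return len(a & b) / len(a | b)

key = lambda s: {s[i:i + 2] for i in range(len(s) - 1)}  # character 2-grams

data = ['abcdef', 'abcdeg', 'uvwxyz']
h, bands, threshold = 32, 16, 0.5
rows = h // bands

buckets = defaultdict(list)
for item in data:
    sig = toy_signature(key(item), h)
    for band in range(0, bands * rows, rows):
        buckets[(band, tuple(sig[band:band + rows]))].append(item)

# Keep only bucketed pairs whose true Jaccard clears the threshold,
# as the new double_check does.
pairs = set()
for bucket in buckets.values():
    for A, B in combinations(bucket, 2):
        if jaccard(key(A), key(B)) >= threshold:
            pairs.add((A, B))

print(pairs)  # almost surely {('abcdef', 'abcdeg')}: 4 shared 2-grams out of 6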

fog/clustering/utils.py

@@ -42,7 +42,7 @@ def make_similarity_function(similarity=None, distance=None, radius=None):
 def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
-                                mode='fuzzy_clusters'):
+                                mode='fuzzy_clusters', similarity=None):
     """
     Function merging buckets into fuzzy clusters. Each bucket will create
     relations in an undirected graph that is later solved to compose clusters.
@@ -54,6 +54,8 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
             infinity.
         mode (string, optional): 'fuzzy_clusters' or 'connected_components'.
             Defaults to 'fuzzy_clusters'.
+        similarity (callable, optional): similarity function used to validate
+            matches from buckets.
 
     Yields:
         list: A viable cluster.
@@ -70,8 +72,9 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
         for j in range(i + 1, n):
             B = bucket[j]
 
-            graph[A].add(B)
-            graph[B].add(A)
+            if similarity is None or similarity(A, B):
+                graph[A].add(B)
+                graph[B].add(A)
 
     # TODO: leader mode
     if mode == 'fuzzy_clusters':
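
Finally, a usage sketch of the new similarity parameter. The buckets and the 0.3 cutoff are invented for illustration; the call itself matches the signature introduced in this diff.

from fog.clustering.utils import merge_buckets_into_clusters

buckets = [
    ['abc', 'abd', 'xyz'],  # 'xyz' is a false positive in this bucket
    ['abd', 'abe'],
]

def validate(A, B):
    # True Jaccard on character 2-grams, with an illustrative 0.3 cutoff.
    a = {A[i:i + 2] for i in range(len(A) - 1)}
    b = {B[i:i + 2] for i in range(len(B) - 1)}
    return len(a & b) / len(a | b) >= 0.3

clusters = list(merge_buckets_into_clusters(
    buckets,
    mode='connected_components',
    similarity=validate
))

# 'xyz' shares no 2-gram with its bucket mates, so no edge reaches it and
# it drops out; 'abc', 'abd' and 'abe' form one connected component.
print(clusters)  # [['abc', 'abd', 'abe']] (member order may vary)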