mirror of https://github.com/Yomguithereal/fog.git

Better minhash clustering

This commit is contained in:
parent 3112ecdfc3
commit bbc65cfc06
@@ -53,12 +53,21 @@ with open('./data/universities.csv', 'r') as f:
 print('%i universities' % len(universities))
 
-key = lambda x: list(ngrams(3, x))
+key = lambda x: list(ngrams(5, x))
 
-clusters = list(minhash(universities, h=256, radius=0.8, key=key))
+RADIUS = 0.80
+
+clusters = list(minhash(universities, h=240, threshold=RADIUS, key=key))
+
+c = 0
 for cluster in clusters:
-    print(cluster)
+    j = jaccard_similarity(key(cluster[0]), key(cluster[1]))
+
+    if j >= RADIUS:
+        c += 1
+        print(cluster, j)
+
+print('Count', c)
+
+# for cluster in clusters:
+#     print(cluster)
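The loop above double-checks each cluster by recomputing the exact Jaccard similarity of its first two members against RADIUS. A self-contained sketch of that check, with toy stand-ins for fog's ngrams and jaccard_similarity (illustration only, not the library code):

# Toy stand-ins for fog's helpers, for illustration only.
def ngrams(n, string):
    for i in range(len(string) - n + 1):
        yield string[i:i + n]

def jaccard_similarity(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b)

key = lambda x: list(ngrams(5, x))

# 14 of 15 5-grams are shared, so this prints ~0.93, above RADIUS.
print(jaccard_similarity(key('University of Lyon'), key('University of Lyons')))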
@@ -66,9 +75,11 @@ with open('./data/universities.csv', 'r') as f:
 # TODO: Compare found items, use ngrams also
 print(distinct_values(clusters))
 
-clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=0.8, key=key))
+clusters = list(pairwise(universities, mode='connected_components', similarity=jaccard_similarity, radius=RADIUS, key=key))
 
 print(distinct_values(clusters))
 
+for cluster in clusters:
+    print(cluster)
 
 print('Count', len(clusters))
@@ -12,14 +12,18 @@ import math
 from fog.clustering.utils import merge_buckets_into_clusters
 from fog.lsh.minhash import LSBMinHash, MinHash
+from fog.metrics.jaccard import jaccard_similarity
 
-# TODO: optimize probability iteratively to find number of bands
-# Note than ideally, number of rows should divide 64 evenly
-# TODO: else try also to find precision
-# TODO: step 1 bands option + sane iteration + experiments with threshold to see what gives
-# TODO: parallelize
+# TODO:
+#   * Parallelize
+#   * use threshold to find bands (works better), similarity just brings more
+#   * possibility to hash the band key
+#   * note that we allow uneven bands for fine-grained results
+#   * double_check with minhash or jaccard or sub similarity even
+#   * superminhash to generate signature faster
+#   * cheap_hashes
 
 
 def match_probability(h, bands, similarity):
     return 1.0 - (1.0 - similarity ** (h / bands)) ** bands
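match_probability is the classic LSH banding estimate: with h hashes split into b bands of h/b rows, two items of true Jaccard similarity s collide in at least one band with probability 1 - (1 - s^(h/b))^b, and similarity_threshold (defined just below) approximates the similarity at which that S-curve turns steep. A standalone worked example (toy reimplementation of the two helpers, not the fog API):

# Standalone sketch of the banding estimates, for intuition only.
def match_probability(h, bands, similarity):
    return 1.0 - (1.0 - similarity ** (h / bands)) ** bands

def similarity_threshold(h, bands):
    # Approximate similarity where collision probability crosses ~50%.
    return (1.0 / bands) ** (1 / (h / bands))

h, bands = 256, 32  # 8 rows per band
for s in (0.5, 0.7, 0.8, 0.9):
    print(s, round(match_probability(h, bands, s), 3))
# Probabilities jump sharply around similarity_threshold(256, 32) ~= 0.65.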
@@ -29,38 +33,27 @@ def similarity_threshold(h, bands):
     return (1.0 / bands) ** (1 / (h / bands))
 
 
-def guess_bands(h, radius, probability):
+def guess_bands(h, threshold):
     bands = 1
 
     while bands <= h:
-        p = match_probability(h, bands, radius)
+        t = similarity_threshold(h, bands)
 
-        if p >= probability:
+        if t <= threshold:
             break
 
         bands += 1
 
-    while h % bands != 0:
-        bands += 1
-
     return bands
 
 
-# TODO: double_check with jaccard or minhash, sub similarity or true radius
-# TODO: compute on 64 * precision to avoid modulo issues and filtering out
-# TODO: need to think in bands ^ not bands = precision
-# TODO: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4431368/
-# TODO: superminhash https://arxiv.org/pdf/1706.05698.pdf
-# TODO: faster hashing https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm
 
-def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
-    bands = guess_bands(h, radius, probability)
+def minhash(data, h=256, key=None, threshold=0.8, bands=None):
+    if bands is None:
+        bands = guess_bands(h, threshold)
+
     rows = h // bands
-
-    print(bands)
-    print(match_probability(h, bands, radius))
-    print(similarity_threshold(h, bands))
+    h_upper_bound = bands * rows
 
     mh = MinHash(h)
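Because guess_bands no longer forces bands to divide h evenly, rows = h // bands can leave a remainder; h_upper_bound = bands * rows trims the signature so the banding loop reads exactly `bands` full bands. Hypothetical numbers (not from the commit) to illustrate the arithmetic:

# Hypothetical figures: h hashes split into bands that do not divide h.
h, bands = 240, 36
rows = h // bands             # 6 rows per band
h_upper_bound = bands * rows  # 216: the last 24 hash values go unused

assert h_upper_bound <= h
assert len(range(0, h_upper_bound, rows)) == bands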
@@ -74,37 +67,18 @@ def minhash(data, h=256, key=None, radius=0.8, probability=0.9):
         signature = mh.create_signature(k)
 
-        for band in range(0, h, rows):
+        for band in range(0, h_upper_bound, rows):
             band_key = (band, '%'.join(str(n) for n in signature[band:band + rows]))
             buckets[band_key].append(item)
 
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
-
-
-def minhash_lsb(data, precision=4, key=None, radius=0.8, probability=0.9):
-
-    h = precision * 64
-
-    # NOTE: it seems we need to divide the bands by 2 because of LSB
-    bands = max(1, guess_bands(h, radius, probability) // 2)
-    rows = h // bands
-
-    mh = LSBMinHash(precision=precision)
-
-    buckets = defaultdict(list)
-
-    for item in data:
-        k = item
-
-        if key is not None:
-            k = key(item)
-
-        signature = mh.create_signature(k)
-
-        binary = ''.join([bin(i)[2:].rjust(64, '0') for i in signature])
-
-        for band in range(0, h, rows):
-            band_key = (band, binary[band:band + rows])
-            buckets[band_key].append(item)
-
-    yield from merge_buckets_into_clusters(buckets.values(), mode='connected_components')
+    def double_check(A, B):
+        if key is not None:
+            return jaccard_similarity(key(A), key(B)) >= threshold
+
+        return jaccard_similarity(A, B) >= threshold
+
+    yield from merge_buckets_into_clusters(
+        buckets.values(),
+        mode='connected_components',
+        similarity=double_check
+    )
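Taken together, the reworked entry point is driven by a single threshold that both picks the number of bands and feeds the Jaccard double check. A hedged usage sketch (import path and toy data assumed from this diff, not taken from the commit; actual clusters depend on the hashing):

# Assumed import path, inferred from the diff above.
from fog.clustering.minhash import minhash

universities = ['University of Lyon', 'University of Lyons', 'MIT']

# Character trigrams as the comparison key, as in the test script above.
key = lambda x: [x[i:i + 3] for i in range(len(x) - 2)]

# `threshold` now drives band selection and the final double check.
for cluster in minhash(universities, h=256, threshold=0.8, key=key):
    print(cluster)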
@@ -42,7 +42,7 @@ def make_similarity_function(similarity=None, distance=None, radius=None):
 
 def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
-                                mode='fuzzy_clusters'):
+                                mode='fuzzy_clusters', similarity=None):
     """
     Function merging buckets into fuzzy clusters. Each bucket will create
     relations in an undirected graph that is later solved to compose clusters.
@@ -54,6 +54,8 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
             infinity.
         mode (string, optional): 'fuzzy_clusters' or 'connected_components'.
             Defaults to 'fuzzy_clusters'.
+        similarity (callable, optional): similarity function used to validate
+            matches from buckets.
 
     Yields:
         list: A viable cluster.
@@ -70,8 +72,9 @@ def merge_buckets_into_clusters(buckets, min_size=2, max_size=float('inf'),
             for j in range(i + 1, n):
                 B = bucket[j]
 
-                graph[A].add(B)
-                graph[B].add(A)
+                if similarity is None or similarity(A, B):
+                    graph[A].add(B)
+                    graph[B].add(A)
 
     # TODO: leader mode
     if mode == 'fuzzy_clusters':
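The new similarity parameter turns edge creation into a guarded step: a bucket pair only joins the graph if it passes the optional double check. A minimal standalone sketch of the idea (toy reimplementation, not fog's internals):

from collections import defaultdict

def merge_buckets(buckets, similarity=None):
    # Build an undirected graph, adding an edge only if the pair
    # passes the optional similarity double check.
    graph = defaultdict(set)
    for bucket in buckets:
        for i, A in enumerate(bucket):
            for B in bucket[i + 1:]:
                if similarity is None or similarity(A, B):
                    graph[A].add(B)
                    graph[B].add(A)

    # Connected components become the clusters.
    seen = set()
    for node in graph:
        if node in seen:
            continue
        stack, component = [node], set()
        while stack:
            n = stack.pop()
            if n in component:
                continue
            component.add(n)
            stack.extend(graph[n])
        seen |= component
        yield sorted(component)

buckets = [['abc', 'abd'], ['abd', 'xyz']]
print(list(merge_buckets(buckets, similarity=lambda a, b: a[0] == b[0])))
# [['abc', 'abd']] -- the 'abd'/'xyz' edge is rejected by the check.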