fog/experiments/simhash_clustering.py

import csv
import math
import sys
import itertools
import numpy as np
from fog.lsh import simhash, simhash_similarity
from fog.metrics import cosine_similarity
from fog.tokenizers import ngrams
from fog.clustering.utils import merge_buckets_into_clusters
from collections import defaultdict, Counter
from progressbar import ProgressBar

# Presumably the expected number of clusters in the musicians dataset; it is
# only referenced by the commented-out evaluation code at the end of this
# script.
GROUND_TRUTH = 132

# The csv module expects its file objects to be opened with newline='' so that
# newlines embedded in quoted fields are parsed correctly. The handle gets its
# own name so it is not confused with the signature size `f` defined below.
with open('./data/musicians.csv', 'r', newline='') as csv_file:
    artists = set(line['artist'] for line in csv.DictReader(csv_file))

buckets = defaultdict(list)

# Signature size passed to simhash as `f`, and similarity radius.
f = 64
radius = 0.8


def key(x):
    """Return the list of 5-grams produced by `ngrams` for item `x`."""
    return list(ngrams(5, x))


def sh(x):
    """Return the simhash of `x`'s 5-grams, using the module-level `f`."""
    return simhash(key(x), f=f)


# Number of signature positions (out of f) allowed to differ for the radius.
k = math.floor((1.0 - radius) * f)

print('f', f)
print('k', k)

# NOTE: does not work -> need to rotate the bits
# https://github.com/leonsim/simhash/blob/master/simhash/__init__.py#L116-L208
# https://github.com/scrapinghub/python-simhash
# http://www.wwwconference.org/www2007/papers/paper215.pdf
# https://github.com/seomoz/simhash-cpp/tree/e7aacb1642f406ff0815cf402e909d2002473812
# https://ir.library.dc-uoit.ca/bitstream/10155/475/1/Rodriguez%20Reina_Ernesto.pdf
# Guessing b, should be smallest power of 2 greater than k
b = 2

while b < k:
    b *= 2

# NOTE(review): the guess computed above is deliberately discarded here in
# favour of a hand-picked value; the loop is kept for reference.
b = 6

r = f // b

t = b - k

print('b', b)
print('r', r)
print('t', t)

# =============================================================================
# Fog MinHash Clustering
# =============================================================================
#
# Clustering algorithm leveraging MinHash LSH to produce suitable clusters.
#
# [Url]:
# http://infolab.stanford.edu/~ullman/mmds/ch3.pdf
#
from collections import defaultdict
import math

from fog.clustering.utils import merge_buckets_into_clusters
from fog.lsh.minhash import MinHash
from fog.metrics.jaccard import jaccard_similarity


# TODO:
#   * Parallelize
#   * possibility to hash the band key
#   * note that we allow uneven bands for fine grained results
#   * double_check with minhash or jaccard or sub similarity even
#   * superminhash to generate signature faster (better for large docs)
#   * cheap_hashes
#   * possibility to use one dict per band + sum the integers

# TODO: compute similarities online + edge list -> connected components


def match_probability(h, bands, similarity):
    """
    Compute the probability that a pair of items with the given similarity
    ends up sharing at least one identical band, when a signature of h
    integers is split evenly into the given number of bands.

    Args:
        h (int): Number of integers in the signature.
        bands (int): Number of bands the signature is divided into.
        similarity (float): Similarity of the pair.

    Returns:
        float: The match probability.

    """
    # Number of rows per band (kept as a float, bands need not divide h).
    rows = h / bands

    # Probability that one given band differs between the two items.
    band_miss = 1.0 - similarity ** rows

    # The pair matches unless every single band differs.
    return 1.0 - band_miss ** bands


def similarity_threshold(h, bands):
    """
    Compute the approximate similarity threshold obtained when a signature
    of h integers is divided into the given number of bands, i.e.
    (1 / bands) ** (1 / rows) with rows = h / bands.

    Args:
        h (int): Number of integers in the signature.
        bands (int): Number of bands the signature is divided into.

    Returns:
        float: The similarity threshold.

    """
    rows = h / bands
    exponent = 1 / rows

    return (1.0 / bands) ** exponent


def guess_bands(h, threshold):
    """
    Function used to iteratively guess the number of bands needed to divide
    a signature of h integers in order to find pairs having a similarity over
    the given threshold.

    Band counts are tried in increasing order and the first one whose
    `similarity_threshold` is less than or equal to the target is returned.
    The result never exceeds h: a band needs at least one row, so when even
    h bands do not bring the threshold low enough, h is returned rather than
    h + 1 (which would leave callers with zero rows per band).

    Args:
        h (int): Number of integers in the signature.
        threshold (float): Similarity threshold.

    Returns:
        int: The guessed number of bands, between 1 and max(h, 1).

    """

    bands = 1

    # Stop at h: it is the finest subdivision possible, whether or not it
    # reaches the requested threshold.
    while bands < h:
        if similarity_threshold(h, bands) <= threshold:
            break

        bands += 1

    return bands


def minhash(data, h=128, key=None, radius=0.8, bands=None, use_numpy=False):
    """
    Function returning an iterator over clusters found by banding item
    signatures.

    Despite its name, this experimental variant hashes every item with
    `simhash`, renders the signature as a string of h bits and splits that
    string into bands of consecutive bits. Items sharing the exact same bits
    in a band land in the same bucket, and buckets are then merged into
    clusters after double-checking candidate pairs with cosine similarity.

    Args:
        data (iterable): Items to cluster.
        h (int, optional): Size of the signature. Defaults to 128.
        key (callable, optional): Function returning an item's key, i.e. the
            value that is hashed and compared instead of the item itself.
        radius (float, optional): Cosine similarity at or over which a pair
            of items is deemed similar. Defaults to 0.8.
        bands (int, optional): By default, the function will attempt to guess
            the number of bands to use using the given radius. Set this
            argument if you want to set the number of bands yourself. Values
            outside of [1, h] are clamped to that range.
        use_numpy (bool, optional): Currently unused by this function.
            Defaults to False.

    Yields:
        The clusters produced by `merge_buckets_into_clusters`.

    """

    if bands is None:
        bands = guess_bands(h, radius)

    # A band needs at least one bit: with more bands than bits, `rows` below
    # would be 0 and the banding range would be invalid.
    bands = max(1, min(bands, h))

    # Experiment output: the number of bands actually used.
    print(bands)

    rows = h // bands

    # Bits past this bound do not fill a whole band and are ignored.
    h_upper_bound = bands * rows

    buckets = defaultdict(list)

    bar = ProgressBar()
    for item in bar(data):
        k = item if key is None else key(item)

        # Use the requested signature size rather than a hard-coded one.
        # NOTE(review): assumes simhash's second positional argument is the
        # signature size `f`, as in the `sh` helper of this script -- confirm.
        signature = simhash(k, h)

        # Left-pad to the full signature width so that bit positions, and
        # therefore bands, line up across items with leading zero bits.
        binary = bin(signature)[2:].rjust(h, '0')

        for band in range(0, h_upper_bound, rows):
            # The band offset is part of the key so that identical bit
            # patterns found in different bands do not collide.
            band_key = (band, binary[band:band + rows])
            buckets[band_key].append(item)

    def double_check(A, B):
        # Confirm a candidate pair with the actual cosine similarity.
        if key is not None:
            return cosine_similarity(key(A), key(B)) >= radius

        return cosine_similarity(A, B) >= radius

    yield from merge_buckets_into_clusters(
        buckets.values(),
        mode='connected_components',
        similarity=double_check
    )

# Run the clustering over the artist names; `minhash` is a generator, so it is
# materialised into a list in order to count the clusters.
clusters = list(minhash(artists, key=key, radius=radius))

# Report the number of clusters found.
print(len(clusters))

# print('Buckets...')
# bar = ProgressBar()
# for artist in bar(sorted(artists)):
#     binary = bin(sh(artist))[2:].rjust(f, '0')

#     for i, band in enumerate(range(0, 64, 16)):
#         kk = (i, binary[band:band + 16])
#         buckets[kk].append(artist)

# print(len(buckets))
# print(np.median(np.fromiter((len(b) for b in buckets.values()), int)))
# print(min(len(b) for b in buckets.values()), max(len(b) for b in buckets.values()))
# print(sum(1 for b in buckets.values() if len(b) > 1))

# graph = defaultdict(set)

# for bucket in buckets.values():
#     if len(bucket) < 2:
#         continue

#     for i, item1 in enumerate(bucket):
#         for j, item2 in enumerate(bucket):
#             if (item2 in graph and item1 in graph[item2]) or (item1 in graph and item2 in graph[item1]):
#                 continue

#             if cosine_similarity(key(item1), key(item2)) >= radius:
#                 graph[item1].add(item2)
#                 graph[item2].add(item1)

# visited = set()
# stack = []

# for item, neighbors in graph.items():
#     if item in visited:
#         continue

#     visited.add(item)

#     cluster = [item]

#     stack.extend(neighbors)

#     while len(stack) != 0:
#         neighbor = stack.pop()

#         if neighbor in visited:
#             continue

#         cluster.append(neighbor)
#         visited.add(neighbor)

#         stack.extend(graph[neighbor])

#     if len(cluster) >= 2:
#         print(cluster)

# for b in buckets.values():
#     if len(b) > 1:
#         print(b)
#         print()
# clusters = list(merge_buckets_into_clusters(buckets.values(), similarity=lambda x, y: cosine_similarity(key(x), key(y)) >= 0.8))

# print('Clusters', clusters_count, '/', GROUND_TRUTH)
# print('Precision', clusters_count / GROUND_TRUTH)
# print('Candidates', candidates, '/', int(len(artists) * (len(artists) - 1) / 2))
# print('Ratio', candidates / int(len(artists) * (len(artists) - 1) / 2))
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00			`import csv`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`import math`
Advances 2018-07-05 15:20:28 +00:00			`import sys`
			`import itertools`
			`import numpy as np`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00			`from fog.lsh import simhash, simhash_similarity`
Adding cosine_similarity 2018-06-21 16:43:15 +00:00			`from fog.metrics import cosine_similarity`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00			`from fog.tokenizers import ngrams`
Advances 2018-07-05 15:20:28 +00:00			`from fog.clustering.utils import merge_buckets_into_clusters`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00			`from collections import defaultdict, Counter`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`from progressbar import ProgressBar`

			`GROUND_TRUTH = 132`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
			`with open('./data/musicians.csv', 'r') as f:`
			`artists = set(line['artist'] for line in csv.DictReader(f))`

			`buckets = defaultdict(list)`

Advances 2018-07-05 15:20:28 +00:00			`f = 64`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`radius = 0.8`
Advances 2018-07-05 15:20:28 +00:00			`key = lambda x: list(ngrams(5, x))`
			`sh = lambda x: simhash(key(x), f=f)`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`k = math.floor((1.0 - radius) * f)`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`print('f', f)`
			`print('k', k)`
Experimenting 2018-06-22 13:39:12 +00:00
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`# NOTE: does not work -> need to rotate the bits`
			`# https://github.com/leonsim/simhash/blob/master/simhash/__init__.py#L116-L208`
Advances 2018-07-05 15:20:28 +00:00			`# https://github.com/scrapinghub/python-simhash`
			`# http://www.wwwconference.org/www2007/papers/paper215.pdf`
			`# https://github.com/seomoz/simhash-cpp/tree/e7aacb1642f406ff0815cf402e909d2002473812`
			`# https://ir.library.dc-uoit.ca/bitstream/10155/475/1/Rodriguez%20Reina_Ernesto.pdf`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`# Guessing b, should be smallest power of 2 greater than k`
			`b = 2`

			`while b < k:`
			`b *= 2`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`b = 6`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`r = f // b`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`t = b - k`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
CLI experimental endpoint 2018-06-22 15:57:25 +00:00			`print('b', b)`
			`print('r', r)`
			`print('t', t)`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# =============================================================================`
			`# Fog MinHash Clustering`
			`# =============================================================================`
			`#`
			`# Clustering algorithm leveraging MinHash LSH to produce suitable clusters.`
			`#`
			`# [Url]:`
			`# http://infolab.stanford.edu/~ullman/mmds/ch3.pdf`
			`#`
			`from collections import defaultdict`
			`import math`

			`from fog.clustering.utils import merge_buckets_into_clusters`
			`from fog.lsh.minhash import MinHash`
			`from fog.metrics.jaccard import jaccard_similarity`


			`# TODO:`
			`# * Parallelize`
			`# * possibility to hash the band key`
			`# * note that we allow uneven bands for fine grained results`
			`# * double_check with minhash or jaccard or sub similarity even`
			`# * superminhash to generate signature faster (better for large docs)`
			`# * cheap_hashes`
			`# * possibility to use one dict per band + sum the integers`

			`# TODO: compute similarities online + edge list -> connected components`


			`def match_probability(h, bands, similarity):`
			`"""`
			`Function returning the probability two pairs will match given a number`
			`of a signature's integers, the number of bands dividing the signature`
			`matrix and the desired similarity.`

			`Args:`
			`h (int): Number of integers in the minhash signature.`
			`bands (int): Number of bands dividing the signature matrix.`
			`similarity (float): Desired Jaccard similarity.`

			`Returns:`
			`float: The match probability.`

			`"""`
			`return 1.0 - (1.0 - similarity ** (h / bands)) ** bands`


			`def similarity_threshold(h, bands):`
			`"""`
			`Function returning the Jaccard similarity threshold for minhash signature`
			`composed of h integers and a signature matrix divided in n bands.`

			`Args:`
			`h (int): Number of integers in the minhash signature.`
			`bands (int): Number of bands dividing the signature matrix.`

			`Returns:`
			`float: The Jaccard similarity threshold.`

			`"""`
			`return (1.0 / bands) ** (1 / (h / bands))`


			`def guess_bands(h, threshold):`
			`"""`
			`Function used to iteratively guess the optimal number of bands needed to`
			`divide a minhash signature matrix in order to find pairs having a`
			`Jaccard similarity over the given threshold.`

			`Args:`
			`h (int): Number of integers in the minhash signature.`
			`threshold (float): Jaccard similarity threshold.`

			`Returns:`
			`int: The optimal number of bands.`

			`"""`

			`bands = 1`

			`while bands <= h:`
			`t = similarity_threshold(h, bands)`

			`if t <= threshold:`
			`break`

			`bands += 1`

			`return bands`


			`def minhash(data, h=128, key=None, radius=0.8, bands=None, use_numpy=False):`
			`"""`
			`Function returning an iterator over clusters found using the minhash`
			`clustering method.`

			`The idea is to compute minhash signatures for every item and divide the`
			`resulting signature matrix in bands of n rows so that if two items share`
			`the exact same rows in a band, they are likely to be similar.`

			`It runs in O(nh), n being the number of items, h the number of integers to`
			`use as minhash signature. Note that since usually h << n, it practically`
			`runs in O(n).`

			`Args:`
			`data (iterable): Items to cluster.`
			`h (int, optional): Number of integers to use as the minhash signature.`
			`Defaults to 256.`
			`key (callable, optional): Function returning an item's key.`
			`radius (float, optional): Radius over which a pair of items is deemed`
			`similar. Defaults to 0.8.`
			`bands (int, optional): By defaults, the function will attempt to guess`
			`the optimal number of bands to use to divide the signature matrix`
			`using given radius. Set this argument if you want to set the`
			`number of bands by yourself.`
			`use_numpy (bool, optional): whether to use numpy to speed up minhash`
			`signatures computations. Defaults to False.`

			`"""`

			`if bands is None:`
			`bands = guess_bands(h, radius)`
			`print(bands)`
			`rows = h // bands`
			`h_upper_bound = bands * rows`

			`buckets = defaultdict(list)`

			`bar = ProgressBar()`
			`for item in bar(data):`
			`k = item`

			`if key is not None:`
			`k = key(item)`

			`signature = simhash(k, 128)`
			`binary = bin(signature)[2:].rjust(f, '0')`

			`for band in range(0, h_upper_bound, rows):`
			`band_key = (band, binary[band:band + rows])`
			`buckets[band_key].append(item)`

			`def double_check(A, B):`
			`if key is not None:`
			`return cosine_similarity(key(A), key(B)) >= radius`

			`return cosine_similarity(A, B) >= radius`

			`yield from merge_buckets_into_clusters(`
			`buckets.values(),`
			`mode='connected_components',`
			`similarity=double_check`
			`)`

			`clusters = list(minhash(artists, key=key, radius=radius))`

			`print(len(clusters))`

			`# print('Buckets...')`
			`# bar = ProgressBar()`
			`# for artist in bar(sorted(artists)):`
			`# binary = bin(sh(artist))[2:].rjust(f, '0')`

			`# for i, band in enumerate(range(0, 64, 16)):`
			`# kk = (i, binary[band:band + 16])`
			`# buckets[kk].append(artist)`

			`# print(len(buckets))`
			`# print(np.median(np.fromiter((len(b) for b in buckets.values()), int)))`
			`# print(min(len(b) for b in buckets.values()), max(len(b) for b in buckets.values()))`
			`# print(sum(1 for b in buckets.values() if len(b) > 1))`

			`# graph = defaultdict(set)`

			`# for bucket in buckets.values():`
			`# if len(bucket) < 2:`
			`# continue`

			`# for i, item1 in enumerate(bucket):`
			`# for j, item2 in enumerate(bucket):`
			`# if (item2 in graph and item1 in graph[item2]) or (item1 in graph and item2 in graph[item1]):`
			`# continue`

			`# if cosine_similarity(key(item1), key(item2)) >= radius:`
			`# graph[item1].add(item2)`
			`# graph[item2].add(item1)`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# visited = set()`
			`# stack = []`
CLI experimental endpoint 2018-06-22 15:57:25 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# for item, neighbors in graph.items():`
			`# if item in visited:`
			`# continue`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# visited.add(item)`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# cluster = [item]`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# stack.extend(neighbors)`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# while len(stack) != 0:`
			`# neighbor = stack.pop()`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# if neighbor in visited:`
			`# continue`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# cluster.append(neighbor)`
			`# visited.add(neighbor)`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# stack.extend(graph[neighbor])`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# if len(cluster) >= 2:`
			`# print(cluster)`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# for b in buckets.values():`
			`# if len(b) > 1:`
			`# print(b)`
			`# print()`
			`# clusters = list(merge_buckets_into_clusters(buckets.values(), similarity=lambda x, y: cosine_similarity(key(x), key(y)) >= 0.8))`
Experimenting with simhash clustering 2018-06-21 16:32:30 +00:00
Advances 2018-07-05 15:20:28 +00:00			`# print('Clusters', clusters_count, '/', GROUND_TRUTH)`
			`# print('Precision', clusters_count / GROUND_TRUTH)`
			`# print('Candidates', candidates, '/', int(len(artists) * (len(artists) - 1) / 2))`
			`# print('Ratio', candidates / int(len(artists) * (len(artists) - 1) / 2))`