From e7d178d7eb30bb0f8ef6dacae349ffcaf9ea3b1e Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Thu, 21 Jun 2018 14:36:09 +0200 Subject: [PATCH] Better docs --- fog/clustering/minhash.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fog/clustering/minhash.py b/fog/clustering/minhash.py index 2d57062..18158b4 100644 --- a/fog/clustering/minhash.py +++ b/fog/clustering/minhash.py @@ -92,6 +92,14 @@ def minhash(data, h=256, key=None, radius=0.8, bands=None, use_numpy=False): Function returning an iterator over clusters found using the minhash clustering method. + The idea is to compute minhash signatures for every item and divide the + resulting signature matrix into bands of a fixed number of rows so that if two items share + the exact same rows in a band, they are likely to be similar. + + It runs in O(nh), n being the number of items, h the number of integers to + use as the minhash signature. Note that since usually h << n, it practically + runs in O(n). + Args: data (iterable): Items to cluster. h (int, optional): Number of integers to use as the minhash signature.