Notes

2018-07-31 16:30:24 +02:00 · 2018-07-31 16:30:24 +02:00 · 6c4be04ead
parent 5627707c45
commit 6c4be04ead
1 changed files with 10 additions and 6 deletions
--- a/fog/clustering/sorted_neighborhood.py
+++ b/fog/clustering/sorted_neighborhood.py
@@ -31,7 +31,7 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
                        mode='connected_components'):
    """
    Function returning an iterator over found clusters using the sorted
-    neighborhood method.
+    neighborhood method (SNM).

    It works by first sorting the data according to a key which could, if
    cleverly chosen, put similar items next to one another in the result.
@@ -42,6 +42,9 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
    Omission key & skeleton keys by Pollock & Zamora are a good choice of
    sorting key if you try to find misspellings, for instance.

+    Another good choice of keys is a combination of sorting strings
+    lexicographically and sorting them reversed ([None, lambda x: x[::-1]]).
+
    Note that the sorted neighborhood method usually runs faster than blocking
    but also misses many more true positives.

@@ -150,16 +153,17 @@ def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
                                 min_size=2, max_size=float('inf'),
                                 mode='connected_components'):
    """
-    Function returning an iterator over found clusters using the sorted
-    neighborhood method.
-
-    It works using a improved variant of the Sorted Neighborhood Method (SNM)
-    called Adaptive Sorted Neighoborhood.
+    Function returning an iterator over found clusters using the adaptive
+    sorted neighborhood method (ASNM).

    More specifically, this method implements the "Full-Accumulatively-Adaptive
    SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
    Record Linkage" paper.

+    Note that this method is more precise than basic SNM only for cases where
+    duplicate components are sufficiently large and when the variance
+    of their size is high. Otherwise it's basically useless.
+
    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.