Notes

2018-07-31 16:30:24 +02:00 · 2018-07-31 16:30:24 +02:00 · 6c4be04ead
parent 5627707c45
commit 6c4be04ead
1 changed files with 10 additions and 6 deletions
--- a/fog/clustering/sorted_neighborhood.py
+++ b/fog/clustering/sorted_neighborhood.py
@@ -31,7 +31,7 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
                        mode='connected_components'):
    """
    Function returning an iterator over found clusters using the sorted
-    neighborhood method.
+    neighborhood method (SNM).
    It works by first sorting the data according to a key which could, if
    cleverly chosen, put similar items next to one another in the result.
@@ -42,6 +42,9 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
    Omission key & skeleton keys by Pollock & Zamora are a good choice of
    sorting key if you try to find misspellings, for instance.
    Another good choice of keys is a combination of sorting strings
    lexicographically and sorting them reversed ([None, lambda x: x[::-1]]).
    Note that the sorted neighborhood method usually runs faster than blocking
    but also misses many more true positives.
@@ -150,16 +153,17 @@ def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
                                 min_size=2, max_size=float('inf'),
                                 mode='connected_components'):
    """
-    Function returning an iterator over found clusters using the sorted
+    Function returning an iterator over found clusters using the adaptive
-    neighborhood method.
+    sorted neighborhood method (ASNM).
    It works using an improved variant of the Sorted Neighborhood Method (SNM)
    called Adaptive Sorted Neighborhood.
    More specifically, this method implements the "Full-Accumulatively-Adaptive
    SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
    Record Linkage" paper.
    Note that this method is more precise than basic SNM only for cases where
    duplicate components are sufficiently large and when the variance
    of their size is important. Otherwise, it's basically useless.
    Args:
        data (iterable): Arbitrary iterable containing data points to gather
            into clusters. Will be fully consumed.