mirror of https://github.com/Yomguithereal/fog.git
Notes
This commit is contained in:
parent
5627707c45
commit
6c4be04ead
|
@ -31,7 +31,7 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
|
||||||
mode='connected_components'):
|
mode='connected_components'):
|
||||||
"""
|
"""
|
||||||
Function returning an iterator over found clusters using the sorted
|
Function returning an iterator over found clusters using the sorted
|
||||||
neighborhood method.
|
neighborhood method (SNM).
|
||||||
|
|
||||||
It works by first sorting the data according to a key which could, if
|
It works by first sorting the data according to a key which could, if
|
||||||
cleverly chosen, put similar items next to one another in the result.
|
cleverly chosen, put similar items next to one another in the result.
|
||||||
|
@ -42,6 +42,9 @@ def sorted_neighborhood(data, key=None, keys=None, similarity=None, distance=Non
|
||||||
Omission key & skeleton keys by Pollock & Zamora are a good choice of
|
Omission key & skeleton keys by Pollock & Zamora are a good choice of
|
||||||
sorting key if you try to find misspellings, for instance.
|
sorting key if you try to find misspellings, for instance.
|
||||||
|
|
||||||
|
Another good choice of keys is a combination of sorting strings
|
||||||
|
lexicographically and sorting them reversed ([None, lambda x: x[::-1]]).
|
||||||
|
|
||||||
Note that the sorted neighborhood method usually runs faster than blocking
|
Note that the sorted neighborhood method usually runs faster than blocking
|
||||||
but also misses many more true positives.
|
but also misses many more true positives.
|
||||||
|
|
||||||
|
@ -150,16 +153,17 @@ def adaptive_sorted_neighborhood(data, key=None, keys=None, similarity=None,
|
||||||
min_size=2, max_size=float('inf'),
|
min_size=2, max_size=float('inf'),
|
||||||
mode='connected_components'):
|
mode='connected_components'):
|
||||||
"""
|
"""
|
||||||
Function returning an iterator over found clusters using the sorted
|
Function returning an iterator over found clusters using the adaptive
|
||||||
neighborhood method.
|
sorted neighborhood method (ASNM).
|
||||||
|
|
||||||
It works using an improved variant of the Sorted Neighborhood Method (SNM)
|
|
||||||
called Adaptive Sorted Neighborhood.
|
|
||||||
|
|
||||||
More specifically, this method implements the "Full-Accumulatively-Adaptative
|
More specifically, this method implements the "Full-Accumulatively-Adaptative
|
||||||
SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
|
SNM" from the "Adaptive Sorted Neighborhood Methods for Efficient
|
||||||
Record Linkage" paper.
|
Record Linkage" paper.
|
||||||
|
|
||||||
|
Note that this method is more precise than basic SNM only for cases where
|
||||||
|
duplicate components are sufficiently large and when the variance
|
||||||
|
of their size is important. Otherwise it's basically useless.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data (iterable): Arbitrary iterable containing data points to gather
|
data (iterable): Arbitrary iterable containing data points to gather
|
||||||
into clusters. Will be fully consumed.
|
into clusters. Will be fully consumed.
|
||||||
|
|
Loading…
Reference in New Issue