multiple changes in testing, input validation and implementations
This commit is contained in:
parent
95a724ebc0
commit
21f4014acc
|
@ -25,8 +25,9 @@ from fuzzysearch.common import Match, get_best_match_in_group, group_matches, \
|
|||
search_exact
|
||||
from fuzzysearch.levenshtein import find_near_matches_levenshtein
|
||||
from fuzzysearch.substitutions_only import find_near_matches_substitutions
|
||||
from fuzzysearch.generic_search import \
|
||||
find_near_matches_generic_linear_programming
|
||||
from fuzzysearch.generic_search import find_near_matches_generic, \
|
||||
find_near_matches_generic_linear_programming, _check_arguments, \
|
||||
_get_max_l_dist
|
||||
|
||||
|
||||
def find_near_matches(subsequence, sequence,
|
||||
|
@ -45,31 +46,15 @@ def find_near_matches(subsequence, sequence,
|
|||
* the total number of substitutions, insertions and deletions
|
||||
(a.k.a. the Levenshtein distance)
|
||||
"""
|
||||
if max_l_dist is None:
|
||||
if (
|
||||
max_substitutions is None and
|
||||
max_insertions is None and
|
||||
max_deletions is None
|
||||
):
|
||||
raise ValueError('No limitations given!')
|
||||
_check_arguments(subsequence, sequence, max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
if max_substitutions is None:
|
||||
raise ValueError('# substitutions must be limited!')
|
||||
if max_insertions is None:
|
||||
raise ValueError('# insertions must be limited!')
|
||||
if max_deletions is None:
|
||||
raise ValueError('# deletions must be limited!')
|
||||
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
# if the limitations are so strict that only exact matches are allowed,
|
||||
# use search_exact()
|
||||
if (
|
||||
max_l_dist == 0 or
|
||||
(
|
||||
max_substitutions == 0 and
|
||||
max_insertions == 0 and
|
||||
max_deletions == 0
|
||||
)
|
||||
):
|
||||
if max_l_dist == 0:
|
||||
return [
|
||||
Match(start_index, start_index + len(subsequence), 0)
|
||||
for start_index in search_exact(subsequence, sequence)
|
||||
|
@ -83,7 +68,7 @@ def find_near_matches(subsequence, sequence,
|
|||
|
||||
# if it is enough to just take into account the maximum Levenshtein
|
||||
# distance, use find_near_matches_levenshtein()
|
||||
elif max_l_dist is not None and max_l_dist <= min([max_l_dist] + [
|
||||
elif max_l_dist <= min([
|
||||
param for param in [
|
||||
max_substitutions, max_insertions, max_deletions
|
||||
]
|
||||
|
@ -91,10 +76,8 @@ def find_near_matches(subsequence, sequence,
|
|||
]):
|
||||
return find_near_matches_levenshtein(subsequence, sequence, max_l_dist)
|
||||
|
||||
# if none of the special cases above are met, use the most generic version:
|
||||
# find_near_matches_generic_linear_programming()
|
||||
# if none of the special cases above are met, use the most generic version
|
||||
else:
|
||||
return list(find_near_matches_generic_linear_programming(
|
||||
subsequence, sequence,
|
||||
max_substitutions, max_insertions, max_deletions, max_l_dist,
|
||||
))
|
||||
return find_near_matches_generic(subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,8 @@
|
|||
from sys import maxint
|
||||
import six
|
||||
from fuzzysearch.common import Match
|
||||
from libc.stdlib cimport malloc, free, realloc
|
||||
from libc.string cimport strstr, strncpy
|
||||
|
||||
|
||||
__all__ = ['c_find_near_matches_generic_linear_programming']
|
||||
|
@ -43,29 +45,27 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
raise ValueError('Given subsequence is empty!')
|
||||
|
||||
# optimization: prepare some often used things in advance
|
||||
cdef int _subseq_len = len(subsequence)
|
||||
cdef int _subseq_len_minus_one = _subseq_len - 1
|
||||
cdef size_t _subseq_len = len(subsequence)
|
||||
cdef size_t _subseq_len_minus_one = _subseq_len - 1
|
||||
|
||||
maxes_sum = sum(
|
||||
(x if x is not None else 0)
|
||||
for x in [max_substitutions, max_insertions, max_deletions]
|
||||
cdef unsigned int c_max_substitutions = max_substitutions if max_substitutions is not None else (1<<29)
|
||||
cdef unsigned int c_max_insertions = max_insertions if max_insertions is not None else (1<<29)
|
||||
cdef unsigned int c_max_deletions = max_deletions if max_deletions is not None else (1<<29)
|
||||
|
||||
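# The effective total-distance limit is the smaller of the explicit
# max_l_dist and the sum of the three per-operation limits. A limit given
# as None is replaced by 1<<29: large enough to act as "unlimited", yet
# small enough that three of them still sum within a 32-bit unsigned int.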
|
||||
cdef unsigned int c_max_l_dist = min(
|
||||
max_l_dist if max_l_dist is not None else (1<<29),
|
||||
c_max_substitutions + c_max_insertions + c_max_deletions,
|
||||
)
|
||||
if max_l_dist is None or max_l_dist >= maxes_sum:
|
||||
max_l_dist = maxes_sum
|
||||
|
||||
cdef c_max_l_dist = max_l_dist
|
||||
cdef c_max_substitutions = max_substitutions
|
||||
cdef c_max_insertions = max_insertions
|
||||
cdef c_max_deletions = max_deletions
|
||||
|
||||
cdef alloc_size
|
||||
cdef size_t alloc_size
|
||||
cdef GenericSearchCandidate* candidates
|
||||
cdef GenericSearchCandidate* new_candidates
|
||||
cdef GenericSearchCandidate* _tmp
|
||||
cdef GenericSearchCandidate cand
|
||||
cdef int n_candidates = 0
|
||||
cdef int n_new_candidates = 0
|
||||
cdef int n_cand
|
||||
cdef size_t n_candidates = 0
|
||||
cdef size_t n_new_candidates = 0
|
||||
cdef size_t n_cand
|
||||
|
||||
cdef char* c_sequence = sequence
|
||||
cdef char* c_subsequence = subsequence
|
||||
|
@ -80,7 +80,7 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
free(candidates)
|
||||
raise MemoryError()
|
||||
|
||||
cdef unsigned int index
|
||||
cdef size_t index
|
||||
try:
|
||||
index = 0
|
||||
have_realloced = False
|
||||
|
@ -92,7 +92,7 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
cand = candidates[n_cand]
|
||||
|
||||
if n_new_candidates + 4 > alloc_size:
|
||||
alloc_size += alloc_size // 2
|
||||
alloc_size *= 2
|
||||
_tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
|
||||
if _tmp is NULL:
|
||||
raise MemoryError()
|
||||
|
@ -218,3 +218,80 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
finally:
|
||||
free(candidates)
|
||||
free(new_candidates)
|
||||
|
||||
|
||||
def c_find_near_matches_generic_ngrams(subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist=None):
|
||||
"""search for near-matches of subsequence in sequence
|
||||
|
||||
This searches for near-matches, where the nearly-matching parts of the
|
||||
sequence must meet the following limitations (relative to the subsequence):
|
||||
|
||||
* the maximum allowed number of character substitutions
|
||||
* the maximum allowed number of new characters inserted
|
||||
* and the maximum allowed number of character deletions
|
||||
* the total number of substitutions, insertions and deletions
|
||||
"""
|
||||
if not isinstance(sequence, ALLOWED_TYPES):
|
||||
raise TypeError('sequence is of invalid type %s' % type(subsequence))
|
||||
if not isinstance(subsequence, ALLOWED_TYPES):
|
||||
raise TypeError('subsequence is of invalid type %s' % type(subsequence))
|
||||
|
||||
if not subsequence:
|
||||
raise ValueError('Given subsequence is empty!')
|
||||
|
||||
# optimization: prepare some often used things in advance
|
||||
cdef size_t _subseq_len = len(subsequence)
|
||||
cdef size_t _subseq_len_minus_one = _subseq_len - 1
|
||||
cdef size_t _seq_len = len(sequence)
|
||||
|
||||
cdef unsigned int c_max_substitutions = max_substitutions if max_substitutions is not None else (1<<29)
|
||||
cdef unsigned int c_max_insertions = max_insertions if max_insertions is not None else (1<<29)
|
||||
cdef unsigned int c_max_deletions = max_deletions if max_deletions is not None else (1<<29)
|
||||
|
||||
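# The effective total-distance limit is the smaller of the explicit
# max_l_dist and the sum of the three per-operation limits. A limit given
# as None is replaced by 1<<29: large enough to act as "unlimited", yet
# small enough that three of them still sum within a 32-bit unsigned int.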
|
||||
cdef unsigned int c_max_l_dist = min(
|
||||
max_l_dist if max_l_dist is not None else (1<<29),
|
||||
c_max_substitutions + c_max_insertions + c_max_deletions,
|
||||
)
|
||||
|
||||
cdef char* c_sequence = sequence
|
||||
cdef char* c_subsequence = subsequence
|
||||
cdef char* ngram_str
|
||||
|
||||
cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1)
|
||||
if ngram_len == 0:
|
||||
raise ValueError('the subsequence length must be greater than max_l_dist')
|
||||
|
||||
ngram_str = <char *> malloc((ngram_len + 1) * sizeof(char))
|
||||
if ngram_str is NULL:
|
||||
raise MemoryError()
|
||||
|
||||
cdef int index
|
||||
cdef size_t ngram_start, small_search_start_index
|
||||
cdef char *match_ptr
|
||||
|
||||
try:
|
||||
ngram_str[ngram_len] = 0
|
||||
|
||||
for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len):
|
||||
strncpy(ngram_str, c_subsequence + ngram_start, ngram_len)
|
||||
|
||||
match_ptr = strstr(c_sequence, ngram_str)
|
||||
while match_ptr != NULL:
|
||||
index = (match_ptr - c_sequence)
|
||||
small_search_start_index = max(0, index - <int>(ngram_start + c_max_l_dist))
|
||||
# try to expand left and/or right according to n_ngram
|
||||
for match in c_find_near_matches_generic_linear_programming(
|
||||
subsequence, sequence[small_search_start_index:index - ngram_start + _subseq_len + c_max_l_dist],
|
||||
max_substitutions, max_insertions, max_deletions, c_max_l_dist,
|
||||
):
|
||||
yield match._replace(
|
||||
start=match.start + small_search_start_index,
|
||||
end=match.end + small_search_start_index,
|
||||
)
|
||||
match_ptr = strstr(match_ptr + 1, ngram_str)
|
||||
|
||||
finally:
|
||||
free(ngram_str)
|
||||
|
|
|
@ -18,11 +18,50 @@ GenericSearchCandidate = namedtuple(
|
|||
)
|
||||
|
||||
|
||||
def _find_near_matches_generic_linear_programming(subsequence, sequence,
|
||||
max_substitutions,
|
||||
max_insertions,
|
||||
max_deletions,
|
||||
max_l_dist=None):
|
||||
def _check_arguments(subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist=None):
|
||||
if not subsequence:
|
||||
raise ValueError('Given subsequence is empty!')
|
||||
|
||||
if max_l_dist is None:
|
||||
if (
|
||||
max_substitutions is None or
|
||||
max_insertions is None or
|
||||
max_deletions is None
|
||||
):
|
||||
if (
|
||||
max_substitutions is None and
|
||||
max_insertions is None and
|
||||
max_deletions is None
|
||||
):
|
||||
raise ValueError('No limitations given!')
|
||||
|
||||
if max_substitutions is None:
|
||||
raise ValueError('# substitutions must be limited!')
|
||||
if max_insertions is None:
|
||||
raise ValueError('# insertions must be limited!')
|
||||
if max_deletions is None:
|
||||
raise ValueError('# deletions must be limited!')
|
||||
|
||||
|
||||
def _get_max_l_dist(max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist):
|
||||
maxes_sum = (
|
||||
(max_substitutions if max_substitutions is not None else (1 << 29)) +
|
||||
(max_insertions if max_insertions is not None else (1 << 29)) +
|
||||
(max_deletions if max_deletions is not None else (1 << 29))
|
||||
)
|
||||
return (
|
||||
max_l_dist
|
||||
if max_l_dist is not None and max_l_dist <= maxes_sum
|
||||
else maxes_sum
|
||||
)
|
||||
|
||||
|
||||
def find_near_matches_generic(subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist=None):
|
||||
"""search for near-matches of subsequence in sequence
|
||||
|
||||
This searches for near-matches, where the nearly-matching parts of the
|
||||
|
@ -33,19 +72,64 @@ def _find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
* and the maximum allowed number of character deletions
|
||||
* the total number of substitutions, insertions and deletions
|
||||
"""
|
||||
if not subsequence:
|
||||
raise ValueError('Given subsequence is empty!')
|
||||
_check_arguments(subsequence, sequence, max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
# if the limitations are so strict that only exact matches are allowed,
|
||||
# use search_exact()
|
||||
if max_l_dist == 0:
|
||||
return [
|
||||
Match(start_index, start_index + len(subsequence), 0)
|
||||
for start_index in search_exact(subsequence, sequence)
|
||||
]
|
||||
|
||||
# if the n-gram length would be at least 3, use the n-gram search method
|
||||
elif len(subsequence) // (max_l_dist + 1) >= 3:
|
||||
return find_near_matches_generic_ngrams(subsequence, sequence,
|
||||
max_substitutions,
|
||||
max_insertions,
|
||||
max_deletions,
|
||||
max_l_dist)
|
||||
|
||||
# use the linear programming search method
|
||||
else:
|
||||
matches = find_near_matches_generic_linear_programming(
|
||||
subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
match_groups = group_matches(matches)
|
||||
best_matches = [get_best_match_in_group(group) for group in match_groups]
|
||||
return sorted(best_matches)
|
||||
|
||||
|
||||
def _find_near_matches_generic_linear_programming(subsequence, sequence,
|
||||
max_substitutions,
|
||||
max_insertions,
|
||||
max_deletions,
|
||||
max_l_dist=None):
|
||||
"""search for near-matches of subsequence in sequence
|
||||
|
||||
This searches for near-matches, where the nearly-matching parts of the
|
||||
sequence must meet the following limitations (relative to the subsequence):
|
||||
|
||||
* the maximum allowed number of character substitutions
|
||||
* the maximum allowed number of new characters inserted
|
||||
* and the maximum allowed number of character deletions
|
||||
* the total number of substitutions, insertions and deletions
|
||||
"""
|
||||
_check_arguments(subsequence, sequence, max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
# optimization: prepare some often used things in advance
|
||||
_subseq_len = len(subsequence)
|
||||
|
||||
maxes_sum = sum(
|
||||
(x if x is not None else 0)
|
||||
for x in [max_substitutions, max_insertions, max_deletions]
|
||||
)
|
||||
if max_l_dist is None or max_l_dist >= maxes_sum:
|
||||
max_l_dist = maxes_sum
|
||||
|
||||
candidates = []
|
||||
for index, char in enumerate(sequence):
|
||||
candidates.append(GenericSearchCandidate(index, 0, 0, 0, 0, 0))
|
||||
|
@ -179,13 +263,12 @@ def find_near_matches_generic_ngrams(subsequence, sequence,
|
|||
* and the maximum allowed number of character deletions
|
||||
* the total number of substitutions, insertions and deletions
|
||||
"""
|
||||
maxes_sum = (
|
||||
(max_substitutions if max_substitutions is not None else 0) +
|
||||
(max_insertions if max_insertions is not None else 0) +
|
||||
(max_deletions if max_deletions is not None else 0)
|
||||
)
|
||||
if max_l_dist is None or max_l_dist >= maxes_sum:
|
||||
max_l_dist = maxes_sum
|
||||
_check_arguments(subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
matches = list(_find_near_matches_generic_ngrams(subsequence, sequence,
|
||||
max_substitutions,
|
||||
|
@ -244,13 +327,12 @@ def has_near_match_generic_ngrams(subsequence, sequence,
|
|||
* and the maximum allowed number of character deletions
|
||||
* the total number of substitutions, insertions and deletions
|
||||
"""
|
||||
maxes_sum = (
|
||||
(max_substitutions if max_substitutions is not None else 0) +
|
||||
(max_insertions if max_insertions is not None else 0) +
|
||||
(max_deletions if max_deletions is not None else 0)
|
||||
)
|
||||
if max_l_dist is None or max_l_dist >= maxes_sum:
|
||||
max_l_dist = maxes_sum
|
||||
_check_arguments(subsequence, sequence,
|
||||
max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
|
||||
max_deletions, max_l_dist)
|
||||
|
||||
for match in _find_near_matches_generic_ngrams(subsequence, sequence,
|
||||
max_substitutions,
|
||||
|
|
|
@ -28,7 +28,7 @@ class TestFindNearMatches(unittest.TestCase):
|
|||
MockFunctionFailsUnlessDefined()
|
||||
self.mock_find_near_matches_substitutions = \
|
||||
MockFunctionFailsUnlessDefined()
|
||||
self.mock_find_near_matches_generic_linear_programming = \
|
||||
self.mock_find_near_matches_generic = \
|
||||
MockFunctionFailsUnlessDefined()
|
||||
|
||||
patcher = mock.patch.multiple(
|
||||
|
@ -38,8 +38,8 @@ class TestFindNearMatches(unittest.TestCase):
|
|||
self.mock_find_near_matches_levenshtein,
|
||||
find_near_matches_substitutions=
|
||||
self.mock_find_near_matches_substitutions,
|
||||
find_near_matches_generic_linear_programming=
|
||||
self.mock_find_near_matches_generic_linear_programming,
|
||||
find_near_matches_generic=
|
||||
self.mock_find_near_matches_generic,
|
||||
)
|
||||
self.addCleanup(patcher.stop)
|
||||
patcher.start()
|
||||
|
@ -154,14 +154,14 @@ class TestFindNearMatches(unittest.TestCase):
|
|||
)
|
||||
|
||||
def test_generic(self):
|
||||
self.mock_find_near_matches_generic_linear_programming.return_value = [42]
|
||||
self.mock_find_near_matches_generic.return_value = [42]
|
||||
|
||||
self.assertEqual(
|
||||
find_near_matches('a', 'a', 1, 1, 1),
|
||||
[42],
|
||||
)
|
||||
self.assertEqual(
|
||||
self.mock_find_near_matches_generic_linear_programming.call_count,
|
||||
self.mock_find_near_matches_generic.call_count,
|
||||
1,
|
||||
)
|
||||
|
||||
|
@ -170,6 +170,6 @@ class TestFindNearMatches(unittest.TestCase):
|
|||
[42],
|
||||
)
|
||||
self.assertEqual(
|
||||
self.mock_find_near_matches_generic_linear_programming.call_count,
|
||||
self.mock_find_near_matches_generic.call_count,
|
||||
2,
|
||||
)
|
||||
|
|
|
@ -9,7 +9,7 @@ from fuzzysearch.generic_search import \
|
|||
has_near_match_generic_ngrams as hnm_generic_ngrams
|
||||
|
||||
|
||||
class TestGenericSearchLPAsLevenshtein(TestFindNearMatchesLevenshteinBase,
|
||||
class TestGenericSearchLpAsLevenshtein(TestFindNearMatchesLevenshteinBase,
|
||||
unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_l_dist):
|
||||
return [
|
||||
|
@ -28,7 +28,7 @@ class TestGenericSearchNgramsAsLevenshtein(TestFindNearMatchesLevenshteinBase,
|
|||
max_l_dist, max_l_dist, max_l_dist)
|
||||
|
||||
|
||||
class TestGenericSearchLPAsSubstitutionsOnly(TestSubstitionsOnlyBase,
|
||||
class TestGenericSearchLpAsSubstitutionsOnly(TestSubstitionsOnlyBase,
|
||||
unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
return list(
|
||||
|
@ -90,11 +90,6 @@ class TestGenericSearchBase(object):
|
|||
[Match(start=4, end=7, dist=0)],
|
||||
)
|
||||
|
||||
self.assertListEqual(
|
||||
self.search('def', 'abcddefg', 0, 1, 0),
|
||||
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
|
||||
)
|
||||
|
||||
self.assertIn(
|
||||
Match(start=4, end=7, dist=0),
|
||||
self.search('def', 'abcddefg', 0, 0, 1),
|
||||
|
@ -129,6 +124,70 @@ class TestGenericSearchBase(object):
|
|||
[Match(start=3, end=5, dist=1)],
|
||||
)
|
||||
|
||||
def test_valid_none_arguments(self):
|
||||
# check that no exception is raised when some values are None
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, None, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, 0, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, 0, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, 0, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, None, 0, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, 0, 0, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, 0, 0, None),
|
||||
[],
|
||||
)
|
||||
|
||||
def test_invalid_none_arguments(self):
|
||||
# check that an exception is raised when max_l_dist is None and at least
|
||||
# one of the other limits is None as well
|
||||
with self.assertRaises(ValueError):
|
||||
self.search('a', 'b', None, None, None, None)
|
||||
|
||||
|
||||
class TestGenericSearchLp(TestGenericSearchBase, unittest.TestCase):
|
||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||
max_l_dist=None):
|
||||
return list(fnm_generic_lp(pattern, sequence,
|
||||
max_subs, max_ins, max_dels, max_l_dist))
|
||||
|
||||
def test_double_first_item_two_results(self):
|
||||
# sequence = 'abcddefg'
|
||||
# pattern = 'def'
|
||||
self.assertListEqual(
|
||||
self.search('def', 'abcddefg', 0, 1, 0),
|
||||
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
|
||||
)
|
||||
|
||||
def test_missing_second_item_complex(self):
|
||||
self.assertListEqual(
|
||||
self.search('bde', 'abcdefg', 1, 1, 1, 1),
|
||||
[Match(start=1, end=5, dist=1),
|
||||
|
@ -136,7 +195,6 @@ class TestGenericSearchBase(object):
|
|||
Match(start=3, end=5, dist=1)],
|
||||
)
|
||||
|
||||
def test_missing_second_item_complex(self):
|
||||
self.assertTrue(
|
||||
set([
|
||||
Match(start=1, end=5, dist=1),
|
||||
|
@ -148,36 +206,6 @@ class TestGenericSearchBase(object):
|
|||
))
|
||||
)
|
||||
|
||||
def test_argument_handling(self):
|
||||
# check that no exception is raised when some values are None
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, None, None, None),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, 0, None, None),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, 0, None),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
|
||||
class TestGenericSearchLP(TestGenericSearchBase, unittest.TestCase):
|
||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||
max_l_dist=None):
|
||||
return list(fnm_generic_lp(pattern, sequence,
|
||||
max_subs, max_ins, max_dels, max_l_dist))
|
||||
|
||||
|
||||
class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
|
||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||
max_l_dist=None):
|
||||
|
@ -185,15 +213,13 @@ class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
|
|||
max_subs, max_ins, max_dels, max_l_dist)
|
||||
|
||||
def test_missing_second_item_complex(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Ngrams search doesn't return overlapping matches")
|
||||
def test_double_first_item(self):
|
||||
return super(TestGenericSearchNgrams, self).test_double_first_item()
|
||||
|
||||
@unittest.skip("Ngrams search doesn't return overlapping matches")
|
||||
def test_missing_second_item(self):
|
||||
return super(TestGenericSearchNgrams, self).test_double_first_item()
|
||||
self.assertTrue(
|
||||
set(self.search('bde', 'abcdefg', 1, 1, 1, 1)).issubset([
|
||||
Match(start=1, end=5, dist=1),
|
||||
Match(start=2, end=5, dist=1),
|
||||
Match(start=3, end=5, dist=1),
|
||||
])
|
||||
)
|
||||
|
||||
|
||||
class TestHasNearMatchGenericNgramsAsSubstitutionsOnly(
|
||||
|
|
|
@ -5,11 +5,12 @@ from tests.test_substitutions_only import TestSubstitionsOnlyBase
|
|||
|
||||
try:
|
||||
from fuzzysearch._generic_search import \
|
||||
c_find_near_matches_generic_linear_programming as c_fnm_generic_lp
|
||||
c_find_near_matches_generic_linear_programming as c_fnm_generic_lp, \
|
||||
c_find_near_matches_generic_ngrams as c_fnm_generic_ngrams
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
|
||||
class TestGenericSearchLpAsLevenshtein(TestFindNearMatchesLevenshteinBase,
|
||||
unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_l_dist):
|
||||
return [
|
||||
|
@ -17,13 +18,27 @@ else:
|
|||
for group in group_matches(
|
||||
c_fnm_generic_lp(subsequence.encode('ascii'),
|
||||
sequence.encode('ascii'),
|
||||
max_l_dist, max_l_dist, max_l_dist, max_l_dist)
|
||||
max_l_dist, max_l_dist,
|
||||
max_l_dist, max_l_dist)
|
||||
)
|
||||
]
|
||||
|
||||
class TestGenericSearchNgramsAsLevenshtein(
|
||||
TestFindNearMatchesLevenshteinBase, unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_l_dist):
|
||||
return [
|
||||
get_best_match_in_group(group)
|
||||
for group in group_matches(
|
||||
c_fnm_generic_ngrams(subsequence.encode('ascii'),
|
||||
sequence.encode('ascii'),
|
||||
max_l_dist, max_l_dist,
|
||||
max_l_dist, max_l_dist)
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
class TestGenericSearchAsSubstitutionsOnly(TestSubstitionsOnlyBase,
|
||||
unittest.TestCase):
|
||||
class TestGenericSearchLpAsSubstitutionsOnly(TestSubstitionsOnlyBase,
|
||||
unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
return list(
|
||||
c_fnm_generic_lp(subsequence.encode('ascii'),
|
||||
|
@ -32,15 +47,25 @@ else:
|
|||
)
|
||||
|
||||
|
||||
class TestGenericSearch(unittest.TestCase):
|
||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||
max_l_dist=None):
|
||||
return list(
|
||||
c_fnm_generic_lp(pattern.encode('ascii'),
|
||||
sequence.encode('ascii'),
|
||||
max_subs, max_ins, max_dels, max_l_dist)
|
||||
)
|
||||
class TestGenericSearchNgramsAsSubstitutionsOnly(TestSubstitionsOnlyBase,
|
||||
unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
return [
|
||||
get_best_match_in_group(group)
|
||||
for group in group_matches(
|
||||
c_fnm_generic_ngrams(subsequence.encode('ascii'),
|
||||
sequence.encode('ascii'),
|
||||
max_subs, 0, 0, max_subs)
|
||||
)
|
||||
]
|
||||
|
||||
@unittest.skip("Ngrams search doesn't return overlapping matches")
|
||||
def test_double_first_item(self):
|
||||
return super(TestGenericSearchNgramsAsSubstitutionsOnly,
|
||||
self).test_double_first_item()
|
||||
|
||||
|
||||
class TestGenericSearchBase(object):
|
||||
def test_empty_sequence(self):
|
||||
self.assertEqual([], self.search('PATTERN', '', 0, 0, 0))
|
||||
|
||||
|
@ -78,11 +103,6 @@ else:
|
|||
self.search('def', 'abcddefg', 1, 0, 0),
|
||||
)
|
||||
|
||||
self.assertListEqual(
|
||||
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
|
||||
self.search('def', 'abcddefg', 0, 1, 0),
|
||||
)
|
||||
|
||||
self.assertIn(
|
||||
Match(start=4, end=7, dist=0),
|
||||
self.search('def', 'abcddefg', 0, 0, 1),
|
||||
|
@ -117,6 +137,65 @@ else:
|
|||
[Match(start=3, end=5, dist=1)],
|
||||
)
|
||||
|
||||
def test_valid_none_arguments(self):
|
||||
# check that no exception is raised when some values are None
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, None, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, 0, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, 0, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, 0, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, None, 0, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, 0, 0, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, None, 0),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, 0, 0, None),
|
||||
[],
|
||||
)
|
||||
|
||||
class TestGenericSearchLp(TestGenericSearchBase, unittest.TestCase):
|
||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||
max_l_dist=None):
|
||||
return list(c_fnm_generic_lp(pattern.encode('ascii'),
|
||||
sequence.encode('ascii'),
|
||||
max_subs, max_ins,
|
||||
max_dels, max_l_dist))
|
||||
|
||||
def test_double_first_item_two_results(self):
|
||||
# sequence = 'abcddefg'
|
||||
# pattern = 'def'
|
||||
self.assertListEqual(
|
||||
self.search('def', 'abcddefg', 0, 1, 0),
|
||||
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
|
||||
)
|
||||
|
||||
def test_missing_second_item_complex(self):
|
||||
self.assertListEqual(
|
||||
self.search('bde', 'abcdefg', 1, 1, 1, 1),
|
||||
[Match(start=1, end=5, dist=1),
|
||||
|
@ -135,24 +214,24 @@ else:
|
|||
))
|
||||
)
|
||||
|
||||
def test_argument_handling(self):
|
||||
# check that no exception is raised when some values are None
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', 0, None, None, None),
|
||||
[],
|
||||
)
|
||||
class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
|
||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||
max_l_dist=None):
|
||||
return [
|
||||
get_best_match_in_group(group)
|
||||
for group in group_matches(
|
||||
c_fnm_generic_ngrams(pattern.encode('ascii'),
|
||||
sequence.encode('ascii'),
|
||||
max_subs, max_ins,
|
||||
max_dels, max_l_dist)
|
||||
)
|
||||
]
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, 0, None, None),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, 0, None),
|
||||
[],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.search('a', 'b', None, None, None, 0),
|
||||
[],
|
||||
def test_missing_second_item_complex(self):
|
||||
self.assertTrue(
|
||||
set(self.search('bde', 'abcdefg', 1, 1, 1, 1)).issubset([
|
||||
Match(start=1, end=5, dist=1),
|
||||
Match(start=2, end=5, dist=1),
|
||||
Match(start=3, end=5, dist=1),
|
||||
])
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue