multiple changes in testing, input validation and implementations

This commit is contained in:
Tal Einat 2014-04-24 19:37:59 +03:00
parent 95a724ebc0
commit 21f4014acc
7 changed files with 2719 additions and 1479 deletions

View File

@ -25,8 +25,9 @@ from fuzzysearch.common import Match, get_best_match_in_group, group_matches, \
search_exact
from fuzzysearch.levenshtein import find_near_matches_levenshtein
from fuzzysearch.substitutions_only import find_near_matches_substitutions
from fuzzysearch.generic_search import \
find_near_matches_generic_linear_programming
from fuzzysearch.generic_search import find_near_matches_generic, \
find_near_matches_generic_linear_programming, _check_arguments, \
_get_max_l_dist
def find_near_matches(subsequence, sequence,
@ -45,31 +46,15 @@ def find_near_matches(subsequence, sequence,
* the total number of substitutions, insertions and deletions
(a.k.a. the Levenshtein distance)
"""
if max_l_dist is None:
if (
max_substitutions is None and
max_insertions is None and
max_deletions is None
):
raise ValueError('No limitations given!')
_check_arguments(subsequence, sequence, max_substitutions, max_insertions,
max_deletions, max_l_dist)
if max_substitutions is None:
raise ValueError('# substitutions must be limited!')
if max_insertions is None:
raise ValueError('# insertions must be limited!')
if max_deletions is None:
raise ValueError('# deletions must be limited!')
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
max_deletions, max_l_dist)
# if the limitations are so strict that only exact matches are allowed,
# use search_exact()
if (
max_l_dist == 0 or
(
max_substitutions == 0 and
max_insertions == 0 and
max_deletions == 0
)
):
if max_l_dist == 0:
return [
Match(start_index, start_index + len(subsequence), 0)
for start_index in search_exact(subsequence, sequence)
@ -83,7 +68,7 @@ def find_near_matches(subsequence, sequence,
# if it is enough to just take into account the maximum Levenshtein
# distance, use find_near_matches_levenshtein()
elif max_l_dist is not None and max_l_dist <= min([max_l_dist] + [
elif max_l_dist <= min([
param for param in [
max_substitutions, max_insertions, max_deletions
]
@ -91,10 +76,8 @@ def find_near_matches(subsequence, sequence,
]):
return find_near_matches_levenshtein(subsequence, sequence, max_l_dist)
# if none of the special cases above are met, use the most generic version:
# find_near_matches_generic_linear_programming()
# if none of the special cases above are met, use the most generic version
else:
return list(find_near_matches_generic_linear_programming(
subsequence, sequence,
max_substitutions, max_insertions, max_deletions, max_l_dist,
))
return find_near_matches_generic(subsequence, sequence,
max_substitutions, max_insertions,
max_deletions, max_l_dist)

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,8 @@
from sys import maxint
import six
from fuzzysearch.common import Match
from libc.stdlib cimport malloc, free, realloc
from libc.string cimport strstr, strncpy
__all__ = ['c_find_near_matches_generic_linear_programming']
@ -43,29 +45,27 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
raise ValueError('Given subsequence is empty!')
# optimization: prepare some often used things in advance
cdef int _subseq_len = len(subsequence)
cdef int _subseq_len_minus_one = _subseq_len - 1
cdef size_t _subseq_len = len(subsequence)
cdef size_t _subseq_len_minus_one = _subseq_len - 1
maxes_sum = sum(
(x if x is not None else 0)
for x in [max_substitutions, max_insertions, max_deletions]
cdef unsigned int c_max_substitutions = max_substitutions if max_substitutions is not None else (1<<29)
cdef unsigned int c_max_insertions = max_insertions if max_insertions is not None else (1<<29)
cdef unsigned int c_max_deletions = max_deletions if max_deletions is not None else (1<<29)
# TODO: write a good comment
cdef unsigned int c_max_l_dist = min(
max_l_dist if max_l_dist is not None else (1<<29),
c_max_substitutions + c_max_insertions + c_max_deletions,
)
if max_l_dist is None or max_l_dist >= maxes_sum:
max_l_dist = maxes_sum
cdef c_max_l_dist = max_l_dist
cdef c_max_substitutions = max_substitutions
cdef c_max_insertions = max_insertions
cdef c_max_deletions = max_deletions
cdef alloc_size
cdef size_t alloc_size
cdef GenericSearchCandidate* candidates
cdef GenericSearchCandidate* new_candidates
cdef GenericSearchCandidate* _tmp
cdef GenericSearchCandidate cand
cdef int n_candidates = 0
cdef int n_new_candidates = 0
cdef int n_cand
cdef size_t n_candidates = 0
cdef size_t n_new_candidates = 0
cdef size_t n_cand
cdef char* c_sequence = sequence
cdef char* c_subsequence = subsequence
@ -80,7 +80,7 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
free(candidates)
raise MemoryError()
cdef unsigned int index
cdef size_t index
try:
index = 0
have_realloced = False
@ -92,7 +92,7 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
cand = candidates[n_cand]
if n_new_candidates + 4 > alloc_size:
alloc_size += alloc_size // 2
alloc_size *= 2
_tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
if _tmp is NULL:
raise MemoryError()
@ -218,3 +218,80 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
finally:
free(candidates)
free(new_candidates)
def c_find_near_matches_generic_ngrams(subsequence, sequence,
                                       max_substitutions, max_insertions,
                                       max_deletions, max_l_dist=None):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    This is a generator. The subsequence is cut into consecutive,
    non-overlapping n-grams of length len(subsequence) // (max_l_dist + 1);
    every exact occurrence of an n-gram in the sequence is then expanded by
    running c_find_near_matches_generic_linear_programming() on a window
    around that occurrence, and each match found is yielded with its indexes
    shifted back into the coordinates of the full sequence.

    Raises TypeError if sequence or subsequence is not one of ALLOWED_TYPES,
    ValueError if the subsequence is empty or not longer than the effective
    max_l_dist, and MemoryError if the n-gram buffer cannot be allocated.
    """
    if not isinstance(sequence, ALLOWED_TYPES):
        # report the type of the offending argument, i.e. the sequence
        raise TypeError('sequence is of invalid type %s' % type(sequence))
    if not isinstance(subsequence, ALLOWED_TYPES):
        raise TypeError('subsequence is of invalid type %s' % type(subsequence))

    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    # optimization: prepare some often used things in advance
    cdef size_t _subseq_len = len(subsequence)

    # A limit given as None means "unlimited"; it is represented by 1<<29,
    # which is large while still letting the sum of all three limits fit in
    # an unsigned int.
    cdef unsigned int c_max_substitutions = max_substitutions if max_substitutions is not None else (1<<29)
    cdef unsigned int c_max_insertions = max_insertions if max_insertions is not None else (1<<29)
    cdef unsigned int c_max_deletions = max_deletions if max_deletions is not None else (1<<29)

    # The total distance can never exceed the sum of the three individual
    # limits, so the effective max_l_dist is the smaller of the given value
    # (if any) and that sum.
    cdef unsigned int c_max_l_dist = min(
        max_l_dist if max_l_dist is not None else (1<<29),
        c_max_substitutions + c_max_insertions + c_max_deletions,
    )

    cdef char* c_sequence = sequence
    cdef char* c_subsequence = subsequence
    cdef char* ngram_str

    cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError('the subsequence length must be greater than max_l_dist')

    # one extra byte for the terminating NUL required by strstr()
    ngram_str = <char *> malloc((ngram_len + 1) * sizeof(char))
    if ngram_str is NULL:
        raise MemoryError()

    cdef int index
    cdef size_t ngram_start, small_search_start_index
    cdef char *match_ptr

    try:
        ngram_str[ngram_len] = 0
        for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len):
            strncpy(ngram_str, c_subsequence + ngram_start, ngram_len)

            # NOTE(review): strstr() works on NUL-terminated C strings, so
            # the search presumably stops at the first NUL byte of either
            # buffer -- confirm whether such input must be supported.
            match_ptr = strstr(c_sequence, ngram_str)
            while match_ptr != NULL:
                index = (match_ptr - c_sequence)
                # start of the window in which a full match containing this
                # n-gram occurrence could begin, clamped to the sequence start
                small_search_start_index = max(0, index - <int>(ngram_start + c_max_l_dist))
                # try to expand left and/or right according to n_ngram
                for match in c_find_near_matches_generic_linear_programming(
                    subsequence, sequence[small_search_start_index:index - ngram_start + _subseq_len + c_max_l_dist],
                    max_substitutions, max_insertions, max_deletions, c_max_l_dist,
                ):
                    # translate window-relative indexes to sequence indexes
                    yield match._replace(
                        start=match.start + small_search_start_index,
                        end=match.end + small_search_start_index,
                    )
                # continue looking just past the start of this occurrence
                match_ptr = strstr(match_ptr + 1, ngram_str)
    finally:
        free(ngram_str)

View File

@ -18,11 +18,50 @@ GenericSearchCandidate = namedtuple(
)
def _find_near_matches_generic_linear_programming(subsequence, sequence,
max_substitutions,
max_insertions,
max_deletions,
max_l_dist=None):
def _check_arguments(subsequence, sequence,
                     max_substitutions, max_insertions,
                     max_deletions, max_l_dist=None):
    """Validate the arguments shared by the near-match search functions.

    Raises ValueError if the subsequence is empty, or if max_l_dist is None
    while any of max_substitutions, max_insertions and max_deletions is None
    as well. Returns None when the arguments are acceptable.

    The sequence argument is accepted but not inspected here.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    # an overall distance limit bounds the search on its own, so the
    # individual limits may then be left unset
    if max_l_dist is not None:
        return

    missing = (
        max_substitutions is None,
        max_insertions is None,
        max_deletions is None,
    )
    if all(missing):
        raise ValueError('No limitations given!')

    messages = (
        '# substitutions must be limited!',
        '# insertions must be limited!',
        '# deletions must be limited!',
    )
    # report the first unset limit, in the order subs / ins / dels
    for is_missing, message in zip(missing, messages):
        if is_missing:
            raise ValueError(message)
def _get_max_l_dist(max_substitutions, max_insertions,
                    max_deletions, max_l_dist):
    """Return the effective maximum total distance for a search.

    The result is the sum of the three individual limits, unless max_l_dist
    is given and does not exceed that sum, in which case max_l_dist itself
    is returned. An individual limit of None is counted as 1 << 29.
    """
    unset_limit = 1 << 29  # stand-in value for a limit that was not given
    maxes_sum = sum(
        unset_limit if limit is None else limit
        for limit in (max_substitutions, max_insertions, max_deletions)
    )

    if max_l_dist is not None and max_l_dist <= maxes_sum:
        return max_l_dist
    return maxes_sum
def find_near_matches_generic(subsequence, sequence,
max_substitutions, max_insertions,
max_deletions, max_l_dist=None):
"""search for near-matches of subsequence in sequence
This searches for near-matches, where the nearly-matching parts of the
@ -33,19 +72,64 @@ def _find_near_matches_generic_linear_programming(subsequence, sequence,
* and the maximum allowed number of character deletions
* the total number of substitutions, insertions and deletions
"""
if not subsequence:
raise ValueError('Given subsequence is empty!')
_check_arguments(subsequence, sequence, max_substitutions, max_insertions,
max_deletions, max_l_dist)
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
max_deletions, max_l_dist)
# if the limitations are so strict that only exact matches are allowed,
# use search_exact()
if max_l_dist == 0:
return [
Match(start_index, start_index + len(subsequence), 0)
for start_index in search_exact(subsequence, sequence)
]
# if the n-gram length would be at least 3, use the n-gram search method
elif len(subsequence) // (max_l_dist + 1) >= 3:
return find_near_matches_generic_ngrams(subsequence, sequence,
max_substitutions,
max_insertions,
max_deletions,
max_l_dist)
# use the linear programming search method
else:
matches = find_near_matches_generic_linear_programming(
subsequence, sequence,
max_substitutions, max_insertions,
max_deletions, max_l_dist)
match_groups = group_matches(matches)
best_matches = [get_best_match_in_group(group) for group in match_groups]
return sorted(best_matches)
def _find_near_matches_generic_linear_programming(subsequence, sequence,
max_substitutions,
max_insertions,
max_deletions,
max_l_dist=None):
"""search for near-matches of subsequence in sequence
This searches for near-matches, where the nearly-matching parts of the
sequence must meet the following limitations (relative to the subsequence):
* the maximum allowed number of character substitutions
* the maximum allowed number of new characters inserted
* and the maximum allowed number of character deletions
* the total number of substitutions, insertions and deletions
"""
_check_arguments(subsequence, sequence, max_substitutions, max_insertions,
max_deletions, max_l_dist)
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
max_deletions, max_l_dist)
# optimization: prepare some often used things in advance
_subseq_len = len(subsequence)
maxes_sum = sum(
(x if x is not None else 0)
for x in [max_substitutions, max_insertions, max_deletions]
)
if max_l_dist is None or max_l_dist >= maxes_sum:
max_l_dist = maxes_sum
candidates = []
for index, char in enumerate(sequence):
candidates.append(GenericSearchCandidate(index, 0, 0, 0, 0, 0))
@ -179,13 +263,12 @@ def find_near_matches_generic_ngrams(subsequence, sequence,
* and the maximum allowed number of character deletions
* the total number of substitutions, insertions and deletions
"""
maxes_sum = (
(max_substitutions if max_substitutions is not None else 0) +
(max_insertions if max_insertions is not None else 0) +
(max_deletions if max_deletions is not None else 0)
)
if max_l_dist is None or max_l_dist >= maxes_sum:
max_l_dist = maxes_sum
_check_arguments(subsequence, sequence,
max_substitutions, max_insertions,
max_deletions, max_l_dist)
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
max_deletions, max_l_dist)
matches = list(_find_near_matches_generic_ngrams(subsequence, sequence,
max_substitutions,
@ -244,13 +327,12 @@ def has_near_match_generic_ngrams(subsequence, sequence,
* and the maximum allowed number of character deletions
* the total number of substitutions, insertions and deletions
"""
maxes_sum = (
(max_substitutions if max_substitutions is not None else 0) +
(max_insertions if max_insertions is not None else 0) +
(max_deletions if max_deletions is not None else 0)
)
if max_l_dist is None or max_l_dist >= maxes_sum:
max_l_dist = maxes_sum
_check_arguments(subsequence, sequence,
max_substitutions, max_insertions,
max_deletions, max_l_dist)
max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
max_deletions, max_l_dist)
for match in _find_near_matches_generic_ngrams(subsequence, sequence,
max_substitutions,

View File

@ -28,7 +28,7 @@ class TestFindNearMatches(unittest.TestCase):
MockFunctionFailsUnlessDefined()
self.mock_find_near_matches_substitutions = \
MockFunctionFailsUnlessDefined()
self.mock_find_near_matches_generic_linear_programming = \
self.mock_find_near_matches_generic = \
MockFunctionFailsUnlessDefined()
patcher = mock.patch.multiple(
@ -38,8 +38,8 @@ class TestFindNearMatches(unittest.TestCase):
self.mock_find_near_matches_levenshtein,
find_near_matches_substitutions=
self.mock_find_near_matches_substitutions,
find_near_matches_generic_linear_programming=
self.mock_find_near_matches_generic_linear_programming,
find_near_matches_generic=
self.mock_find_near_matches_generic,
)
self.addCleanup(patcher.stop)
patcher.start()
@ -154,14 +154,14 @@ class TestFindNearMatches(unittest.TestCase):
)
def test_generic(self):
self.mock_find_near_matches_generic_linear_programming.return_value = [42]
self.mock_find_near_matches_generic.return_value = [42]
self.assertEqual(
find_near_matches('a', 'a', 1, 1, 1),
[42],
)
self.assertEqual(
self.mock_find_near_matches_generic_linear_programming.call_count,
self.mock_find_near_matches_generic.call_count,
1,
)
@ -170,6 +170,6 @@ class TestFindNearMatches(unittest.TestCase):
[42],
)
self.assertEqual(
self.mock_find_near_matches_generic_linear_programming.call_count,
self.mock_find_near_matches_generic.call_count,
2,
)

View File

@ -9,7 +9,7 @@ from fuzzysearch.generic_search import \
has_near_match_generic_ngrams as hnm_generic_ngrams
class TestGenericSearchLPAsLevenshtein(TestFindNearMatchesLevenshteinBase,
class TestGenericSearchLpAsLevenshtein(TestFindNearMatchesLevenshteinBase,
unittest.TestCase):
def search(self, subsequence, sequence, max_l_dist):
return [
@ -28,7 +28,7 @@ class TestGenericSearchNgramsAsLevenshtein(TestFindNearMatchesLevenshteinBase,
max_l_dist, max_l_dist, max_l_dist)
class TestGenericSearchLPAsSubstitutionsOnly(TestSubstitionsOnlyBase,
class TestGenericSearchLpAsSubstitutionsOnly(TestSubstitionsOnlyBase,
unittest.TestCase):
def search(self, subsequence, sequence, max_subs):
return list(
@ -90,11 +90,6 @@ class TestGenericSearchBase(object):
[Match(start=4, end=7, dist=0)],
)
self.assertListEqual(
self.search('def', 'abcddefg', 0, 1, 0),
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
)
self.assertIn(
Match(start=4, end=7, dist=0),
self.search('def', 'abcddefg', 0, 0, 1),
@ -129,6 +124,70 @@ class TestGenericSearchBase(object):
[Match(start=3, end=5, dist=1)],
)
def test_valid_none_arguments(self):
# check that no exception is raised when some values are None
self.assertEqual(
self.search('a', 'b', 0, None, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, 0, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, 0, 0),
[],
)
self.assertEqual(
self.search('a', 'b', 0, 0, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', 0, None, 0, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, 0, 0, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', 0, 0, 0, None),
[],
)
def test_invalid_none_arguments(self):
# An exception is expected when max_l_dist is None and an individual
# limitation is None as well. Only the case where all four limitations
# are None is exercised here.
with self.assertRaises(ValueError):
self.search('a', 'b', None, None, None, None)
class TestGenericSearchLp(TestGenericSearchBase, unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
max_l_dist=None):
return list(fnm_generic_lp(pattern, sequence,
max_subs, max_ins, max_dels, max_l_dist))
def test_double_first_item_two_results(self):
# sequence = 'abcddefg'
# pattern = 'def'
# With one insertion allowed, two matches are expected: the one at 3-7,
# which includes the doubled 'd' (dist=1), and the exact one at 4-7.
self.assertListEqual(
self.search('def', 'abcddefg', 0, 1, 0),
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
)
def test_missing_second_item_complex(self):
self.assertListEqual(
self.search('bde', 'abcdefg', 1, 1, 1, 1),
[Match(start=1, end=5, dist=1),
@ -136,7 +195,6 @@ class TestGenericSearchBase(object):
Match(start=3, end=5, dist=1)],
)
def test_missing_second_item_complex(self):
self.assertTrue(
set([
Match(start=1, end=5, dist=1),
@ -148,36 +206,6 @@ class TestGenericSearchBase(object):
))
)
def test_argument_handling(self):
# check that no exception is raised when some values are None
self.assertEqual(
self.search('a', 'b', 0, None, None, None),
[],
)
self.assertEqual(
self.search('a', 'b', None, 0, None, None),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, 0, None),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, None, 0),
[],
)
class TestGenericSearchLP(TestGenericSearchBase, unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
max_l_dist=None):
return list(fnm_generic_lp(pattern, sequence,
max_subs, max_ins, max_dels, max_l_dist))
class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
max_l_dist=None):
@ -185,15 +213,13 @@ class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
max_subs, max_ins, max_dels, max_l_dist)
def test_missing_second_item_complex(self):
pass
@unittest.skip("Ngrams search doesn't return overlapping matches")
def test_double_first_item(self):
return super(TestGenericSearchNgrams, self).test_double_first_item()
@unittest.skip("Ngrams search doesn't return overlapping matches")
def test_missing_second_item(self):
return super(TestGenericSearchNgrams, self).test_double_first_item()
self.assertTrue(
set(self.search('bde', 'abcdefg', 1, 1, 1, 1)).issubset([
Match(start=1, end=5, dist=1),
Match(start=2, end=5, dist=1),
Match(start=3, end=5, dist=1),
])
)
class TestHasNearMatchGenericNgramsAsSubstitutionsOnly(

View File

@ -5,11 +5,12 @@ from tests.test_substitutions_only import TestSubstitionsOnlyBase
try:
from fuzzysearch._generic_search import \
c_find_near_matches_generic_linear_programming as c_fnm_generic_lp
c_find_near_matches_generic_linear_programming as c_fnm_generic_lp, \
c_find_near_matches_generic_ngrams as c_fnm_generic_ngrams
except ImportError:
pass
else:
class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
class TestGenericSearchLpAsLevenshtein(TestFindNearMatchesLevenshteinBase,
unittest.TestCase):
def search(self, subsequence, sequence, max_l_dist):
return [
@ -17,13 +18,27 @@ else:
for group in group_matches(
c_fnm_generic_lp(subsequence.encode('ascii'),
sequence.encode('ascii'),
max_l_dist, max_l_dist, max_l_dist, max_l_dist)
max_l_dist, max_l_dist,
max_l_dist, max_l_dist)
)
]
class TestGenericSearchNgramsAsLevenshtein(
TestFindNearMatchesLevenshteinBase, unittest.TestCase):
def search(self, subsequence, sequence, max_l_dist):
return [
get_best_match_in_group(group)
for group in group_matches(
c_fnm_generic_ngrams(subsequence.encode('ascii'),
sequence.encode('ascii'),
max_l_dist, max_l_dist,
max_l_dist, max_l_dist)
)
]
class TestGenericSearchAsSubstitutionsOnly(TestSubstitionsOnlyBase,
unittest.TestCase):
class TestGenericSearchLpAsSubstitutionsOnly(TestSubstitionsOnlyBase,
unittest.TestCase):
def search(self, subsequence, sequence, max_subs):
return list(
c_fnm_generic_lp(subsequence.encode('ascii'),
@ -32,15 +47,25 @@ else:
)
class TestGenericSearch(unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
max_l_dist=None):
return list(
c_fnm_generic_lp(pattern.encode('ascii'),
sequence.encode('ascii'),
max_subs, max_ins, max_dels, max_l_dist)
)
class TestGenericSearchNgramsAsSubstitutionsOnly(TestSubstitionsOnlyBase,
unittest.TestCase):
def search(self, subsequence, sequence, max_subs):
return [
get_best_match_in_group(group)
for group in group_matches(
c_fnm_generic_ngrams(subsequence.encode('ascii'),
sequence.encode('ascii'),
max_subs, 0, 0, max_subs)
)
]
@unittest.skip("Ngrams search doesn't return overlapping matches")
def test_double_first_item(self):
return super(TestGenericSearchNgramsAsSubstitutionsOnly,
self).test_double_first_item()
class TestGenericSearchBase(object):
def test_empty_sequence(self):
self.assertEqual([], self.search('PATTERN', '', 0, 0, 0))
@ -78,11 +103,6 @@ else:
self.search('def', 'abcddefg', 1, 0, 0),
)
self.assertListEqual(
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
self.search('def', 'abcddefg', 0, 1, 0),
)
self.assertIn(
Match(start=4, end=7, dist=0),
self.search('def', 'abcddefg', 0, 0, 1),
@ -117,6 +137,65 @@ else:
[Match(start=3, end=5, dist=1)],
)
def test_valid_none_arguments(self):
# check that no exception is raised when some values are None
self.assertEqual(
self.search('a', 'b', 0, None, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, 0, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, 0, 0),
[],
)
self.assertEqual(
self.search('a', 'b', 0, 0, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', 0, None, 0, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, 0, 0, 0),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, None, 0),
[],
)
self.assertEqual(
self.search('a', 'b', 0, 0, 0, None),
[],
)
class TestGenericSearchLp(TestGenericSearchBase, unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
max_l_dist=None):
return list(c_fnm_generic_lp(pattern.encode('ascii'),
sequence.encode('ascii'),
max_subs, max_ins,
max_dels, max_l_dist))
def test_double_first_item_two_results(self):
# sequence = 'abcddefg'
# pattern = 'def'
# With one insertion allowed, two matches are expected: the one at 3-7,
# which includes the doubled 'd' (dist=1), and the exact one at 4-7.
self.assertListEqual(
self.search('def', 'abcddefg', 0, 1, 0),
[Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
)
def test_missing_second_item_complex(self):
self.assertListEqual(
self.search('bde', 'abcdefg', 1, 1, 1, 1),
[Match(start=1, end=5, dist=1),
@ -135,24 +214,24 @@ else:
))
)
def test_argument_handling(self):
# check that no exception is raised when some values are None
self.assertEqual(
self.search('a', 'b', 0, None, None, None),
[],
)
class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
max_l_dist=None):
return [
get_best_match_in_group(group)
for group in group_matches(
c_fnm_generic_ngrams(pattern.encode('ascii'),
sequence.encode('ascii'),
max_subs, max_ins,
max_dels, max_l_dist)
)
]
self.assertEqual(
self.search('a', 'b', None, 0, None, None),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, 0, None),
[],
)
self.assertEqual(
self.search('a', 'b', None, None, None, 0),
[],
def test_missing_second_item_complex(self):
self.assertTrue(
set(self.search('bde', 'abcdefg', 1, 1, 1, 1)).issubset([
Match(start=1, end=5, dist=1),
Match(start=2, end=5, dist=1),
Match(start=3, end=5, dist=1),
])
)