multiple changes in testing, input validation and implementations

2014-04-24 19:37:59 +03:00 · 2014-04-24 19:37:59 +03:00 · 21f4014acc
parent 95a724ebc0
commit 21f4014acc
7 changed files with 2719 additions and 1479 deletions
--- a/fuzzysearch/__init__.py
+++ b/fuzzysearch/__init__.py
@@ -25,8 +25,9 @@ from fuzzysearch.common import Match, get_best_match_in_group, group_matches, \
    search_exact
 from fuzzysearch.levenshtein import find_near_matches_levenshtein
 from fuzzysearch.substitutions_only import find_near_matches_substitutions
-from fuzzysearch.generic_search import \
-    find_near_matches_generic_linear_programming
+from fuzzysearch.generic_search import find_near_matches_generic, \
+    find_near_matches_generic_linear_programming, _check_arguments, \
+    _get_max_l_dist


 def find_near_matches(subsequence, sequence,
@@ -45,31 +46,15 @@ def find_near_matches(subsequence, sequence,
    * the total number of substitutions, insertions and deletions
      (a.k.a. the Levenshtein distance)
    """
-    if max_l_dist is None:
-        if (
-                max_substitutions is None and
-                max_insertions is None and
-                max_deletions is None
-        ):
-            raise ValueError('No limitations given!')
+    _check_arguments(subsequence, sequence, max_substitutions, max_insertions,
+                     max_deletions, max_l_dist)

-        if max_substitutions is None:
-            raise ValueError('# substitutions must be limited!')
-        if max_insertions is None:
-            raise ValueError('# insertions must be limited!')
-        if max_deletions is None:
-            raise ValueError('# deletions must be limited!')
+    max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
+                                 max_deletions, max_l_dist)

    # if the limitations are so strict that only exact matches are allowed,
    # use search_exact()
-    if (
-            max_l_dist == 0 or
-            (
-                max_substitutions == 0 and
-                max_insertions == 0 and
-                max_deletions == 0
-            )
-    ):
+    if max_l_dist == 0:
        return [
            Match(start_index, start_index + len(subsequence), 0)
            for start_index in search_exact(subsequence, sequence)
@@ -83,7 +68,7 @@ def find_near_matches(subsequence, sequence,

    # if it is enough to just take into account the maximum Levenshtein
    # distance, use find_near_matches_levenshtein()
-    elif max_l_dist is not None and max_l_dist <= min([max_l_dist] + [
+    elif max_l_dist <= min([
            param for param in [
                max_substitutions, max_insertions, max_deletions
            ]
@@ -91,10 +76,8 @@ def find_near_matches(subsequence, sequence,
    ]):
        return find_near_matches_levenshtein(subsequence, sequence, max_l_dist)

-    # if none of the special cases above are met, use the most generic version:
-    # find_near_matches_generic_linear_programming()
+    # if none of the special cases above are met, use the most generic version
    else:
-        return list(find_near_matches_generic_linear_programming(
-            subsequence, sequence,
-            max_substitutions, max_insertions, max_deletions, max_l_dist,
-        ))
+        return find_near_matches_generic(subsequence, sequence,
+                                         max_substitutions, max_insertions,
+                                         max_deletions, max_l_dist)
--- a/fuzzysearch/_generic_search.c
+++ b/fuzzysearch/_generic_search.c
--- a/fuzzysearch/_generic_search.pyx
+++ b/fuzzysearch/_generic_search.pyx
@@ -1,6 +1,8 @@
+from sys import maxint
 import six
 from fuzzysearch.common import Match
 from libc.stdlib cimport malloc, free, realloc
+from libc.string cimport strstr, strncpy


 __all__ = ['c_find_near_matches_generic_linear_programming']
@@ -43,29 +45,27 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
        raise ValueError('Given subsequence is empty!')

    # optimization: prepare some often used things in advance
-    cdef int _subseq_len = len(subsequence)
-    cdef int _subseq_len_minus_one = _subseq_len - 1
+    cdef size_t _subseq_len = len(subsequence)
+    cdef size_t _subseq_len_minus_one = _subseq_len - 1

-    maxes_sum = sum(
-        (x if x is not None else 0)
-        for x in [max_substitutions, max_insertions, max_deletions]
+    cdef unsigned int c_max_substitutions = max_substitutions if max_substitutions is not None else (1<<29)
+    cdef unsigned int c_max_insertions = max_insertions if max_insertions is not None else (1<<29)
+    cdef unsigned int c_max_deletions = max_deletions if max_deletions is not None else (1<<29)
+
+    # TODO: write a good comment
+    cdef unsigned int c_max_l_dist = min(
+        max_l_dist if max_l_dist is not None else (1<<29),
+        c_max_substitutions + c_max_insertions + c_max_deletions,
    )
-    if max_l_dist is None or max_l_dist >= maxes_sum:
-        max_l_dist = maxes_sum

-    cdef c_max_l_dist = max_l_dist
-    cdef c_max_substitutions = max_substitutions
-    cdef c_max_insertions = max_insertions
-    cdef c_max_deletions = max_deletions
-
-    cdef alloc_size
+    cdef size_t alloc_size
    cdef GenericSearchCandidate* candidates
    cdef GenericSearchCandidate* new_candidates
    cdef GenericSearchCandidate* _tmp
    cdef GenericSearchCandidate cand
-    cdef int n_candidates = 0
-    cdef int n_new_candidates = 0
-    cdef int n_cand
+    cdef size_t n_candidates = 0
+    cdef size_t n_new_candidates = 0
+    cdef size_t n_cand

    cdef char* c_sequence = sequence
    cdef char* c_subsequence = subsequence
@@ -80,7 +80,7 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
        free(candidates)
        raise MemoryError()

-    cdef unsigned int index
+    cdef size_t index
    try:
        index = 0
        have_realloced = False
@@ -92,7 +92,7 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
                cand = candidates[n_cand]

                if n_new_candidates + 4 > alloc_size:
-                    alloc_size += alloc_size // 2
+                    alloc_size *= 2
                    _tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
                    if _tmp is NULL:
                        raise MemoryError()
@@ -218,3 +218,80 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
    finally:
        free(candidates)
        free(new_candidates)
+
+
+def c_find_near_matches_generic_ngrams(subsequence, sequence,
+                                       max_substitutions, max_insertions,
+                                       max_deletions, max_l_dist=None):
+    """search for near-matches of subsequence in sequence
+
+    This searches for near-matches, where the nearly-matching parts of the
+    sequence must meet the following limitations (relative to the subsequence):
+
+    * the maximum allowed number of character substitutions
+    * the maximum allowed number of new characters inserted
+    * and the maximum allowed number of character deletions
+    * the total number of substitutions, insertions and deletions
+    """
+    if not isinstance(sequence, ALLOWED_TYPES):
+        raise TypeError('sequence is of invalid type %s' % type(subsequence))
+    if not isinstance(subsequence, ALLOWED_TYPES):
+        raise TypeError('subsequence is of invalid type %s' % type(subsequence))
+
+    if not subsequence:
+        raise ValueError('Given subsequence is empty!')
+
+    # optimization: prepare some often used things in advance
+    cdef size_t _subseq_len = len(subsequence)
+    cdef size_t _subseq_len_minus_one = _subseq_len - 1
+    cdef size_t _seq_len = len(sequence)
+
+    cdef unsigned int c_max_substitutions = max_substitutions if max_substitutions is not None else (1<<29)
+    cdef unsigned int c_max_insertions = max_insertions if max_insertions is not None else (1<<29)
+    cdef unsigned int c_max_deletions = max_deletions if max_deletions is not None else (1<<29)
+
+    # TODO: write a good comment
+    cdef unsigned int c_max_l_dist = min(
+        max_l_dist if max_l_dist is not None else (1<<29),
+        c_max_substitutions + c_max_insertions + c_max_deletions,
+    )
+
+    cdef char* c_sequence = sequence
+    cdef char* c_subsequence = subsequence
+    cdef char* ngram_str
+
+    cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1)
+    if ngram_len == 0:
+        raise ValueError('the subsequence length must be greater than max_l_dist')
+
+    ngram_str = <char *> malloc((ngram_len + 1) * sizeof(char))
+    if ngram_str is NULL:
+        raise MemoryError()
+
+    cdef int index
+    cdef size_t ngram_start, small_search_start_index
+    cdef char *match_ptr
+
+    try:
+        ngram_str[ngram_len] = 0
+
+        for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len):
+            strncpy(ngram_str, c_subsequence + ngram_start, ngram_len)
+
+            match_ptr = strstr(c_sequence, ngram_str)
+            while match_ptr != NULL:
+                index = (match_ptr - c_sequence)
+                small_search_start_index = max(0, index - <int>(ngram_start + c_max_l_dist))
+                # try to expand left and/or right according to n_ngram
+                for match in c_find_near_matches_generic_linear_programming(
+                    subsequence, sequence[small_search_start_index:index - ngram_start + _subseq_len + c_max_l_dist],
+                    max_substitutions, max_insertions, max_deletions, c_max_l_dist,
+                ):
+                    yield match._replace(
+                        start=match.start + small_search_start_index,
+                        end=match.end + small_search_start_index,
+                    )
+                match_ptr = strstr(match_ptr + 1, ngram_str)
+
+    finally:
+        free(ngram_str)
--- a/fuzzysearch/generic_search.py
+++ b/fuzzysearch/generic_search.py
@@ -18,11 +18,50 @@ GenericSearchCandidate = namedtuple(
 )


-def _find_near_matches_generic_linear_programming(subsequence, sequence,
-                                                 max_substitutions,
-                                                 max_insertions,
-                                                 max_deletions,
-                                                 max_l_dist=None):
+def _check_arguments(subsequence, sequence,
+                     max_substitutions, max_insertions,
+                     max_deletions, max_l_dist=None):
+    if not subsequence:
+        raise ValueError('Given subsequence is empty!')
+
+    if max_l_dist is None:
+        if (
+                max_substitutions is None or
+                max_insertions is None or
+                max_deletions is None
+        ):
+            if (
+                    max_substitutions is None and
+                    max_insertions is None and
+                    max_deletions is None
+            ):
+                raise ValueError('No limitations given!')
+
+            if max_substitutions is None:
+                raise ValueError('# substitutions must be limited!')
+            if max_insertions is None:
+                raise ValueError('# insertions must be limited!')
+            if max_deletions is None:
+                raise ValueError('# deletions must be limited!')
+
+
+def _get_max_l_dist(max_substitutions, max_insertions,
+                    max_deletions, max_l_dist):
+    maxes_sum = (
+        (max_substitutions if max_substitutions is not None else (1 << 29)) +
+        (max_insertions if max_insertions is not None else (1 << 29)) +
+        (max_deletions if max_deletions is not None else (1 << 29))
+    )
+    return (
+        max_l_dist
+        if max_l_dist is not None and max_l_dist <= maxes_sum
+        else maxes_sum
+    )
+
+
+def find_near_matches_generic(subsequence, sequence,
+                              max_substitutions, max_insertions,
+                              max_deletions, max_l_dist=None):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
@@ -33,19 +72,64 @@ def _find_near_matches_generic_linear_programming(subsequence, sequence,
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions
    """
-    if not subsequence:
-        raise ValueError('Given subsequence is empty!')
+    _check_arguments(subsequence, sequence, max_substitutions, max_insertions,
+                     max_deletions, max_l_dist)
+
+    max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
+                                 max_deletions, max_l_dist)
+
+    # if the limitations are so strict that only exact matches are allowed,
+    # use search_exact()
+    if max_l_dist == 0:
+        return [
+            Match(start_index, start_index + len(subsequence), 0)
+            for start_index in search_exact(subsequence, sequence)
+        ]
+
+    # if the n-gram length would be at least 3, use the n-gram search method
+    elif len(subsequence) // (max_l_dist + 1) >= 3:
+        return find_near_matches_generic_ngrams(subsequence, sequence,
+                                                max_substitutions,
+                                                max_insertions,
+                                                max_deletions,
+                                                max_l_dist)
+
+    # use the linear programming search method
+    else:
+        matches = find_near_matches_generic_linear_programming(
+            subsequence, sequence,
+            max_substitutions, max_insertions,
+            max_deletions, max_l_dist)
+
+        match_groups = group_matches(matches)
+        best_matches = [get_best_match_in_group(group) for group in match_groups]
+        return sorted(best_matches)
+
+
+def _find_near_matches_generic_linear_programming(subsequence, sequence,
+                                                  max_substitutions,
+                                                  max_insertions,
+                                                  max_deletions,
+                                                  max_l_dist=None):
+    """search for near-matches of subsequence in sequence
+
+    This searches for near-matches, where the nearly-matching parts of the
+    sequence must meet the following limitations (relative to the subsequence):
+
+    * the maximum allowed number of character substitutions
+    * the maximum allowed number of new characters inserted
+    * and the maximum allowed number of character deletions
+    * the total number of substitutions, insertions and deletions
+    """
+    _check_arguments(subsequence, sequence, max_substitutions, max_insertions,
+                     max_deletions, max_l_dist)
+
+    max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
+                                 max_deletions, max_l_dist)

    # optimization: prepare some often used things in advance
    _subseq_len = len(subsequence)

-    maxes_sum = sum(
-        (x if x is not None else 0)
-        for x in [max_substitutions, max_insertions, max_deletions]
-    )
-    if max_l_dist is None or max_l_dist >= maxes_sum:
-        max_l_dist = maxes_sum
-
    candidates = []
    for index, char in enumerate(sequence):
        candidates.append(GenericSearchCandidate(index, 0, 0, 0, 0, 0))
@@ -179,13 +263,12 @@ def find_near_matches_generic_ngrams(subsequence, sequence,
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions
    """
-    maxes_sum = (
-        (max_substitutions if max_substitutions is not None else 0) +
-        (max_insertions if max_insertions is not None else 0) +
-        (max_deletions if max_deletions is not None else 0)
-    )
-    if max_l_dist is None or max_l_dist >= maxes_sum:
-        max_l_dist = maxes_sum
+    _check_arguments(subsequence, sequence,
+                     max_substitutions, max_insertions,
+                     max_deletions, max_l_dist)
+
+    max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
+                                 max_deletions, max_l_dist)

    matches = list(_find_near_matches_generic_ngrams(subsequence, sequence,
                                                     max_substitutions,
@@ -244,13 +327,12 @@ def has_near_match_generic_ngrams(subsequence, sequence,
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions
    """
-    maxes_sum = (
-        (max_substitutions if max_substitutions is not None else 0) +
-        (max_insertions if max_insertions is not None else 0) +
-        (max_deletions if max_deletions is not None else 0)
-    )
-    if max_l_dist is None or max_l_dist >= maxes_sum:
-        max_l_dist = maxes_sum
+    _check_arguments(subsequence, sequence,
+                     max_substitutions, max_insertions,
+                     max_deletions, max_l_dist)
+
+    max_l_dist = _get_max_l_dist(max_substitutions, max_insertions,
+                                 max_deletions, max_l_dist)

    for match in _find_near_matches_generic_ngrams(subsequence, sequence,
                                                   max_substitutions,
--- a/tests/test_find_near_matches.py
+++ b/tests/test_find_near_matches.py
@@ -28,7 +28,7 @@ class TestFindNearMatches(unittest.TestCase):
            MockFunctionFailsUnlessDefined()
        self.mock_find_near_matches_substitutions = \
            MockFunctionFailsUnlessDefined()
-        self.mock_find_near_matches_generic_linear_programming = \
+        self.mock_find_near_matches_generic = \
            MockFunctionFailsUnlessDefined()

        patcher = mock.patch.multiple(
@@ -38,8 +38,8 @@ class TestFindNearMatches(unittest.TestCase):
                self.mock_find_near_matches_levenshtein,
            find_near_matches_substitutions=
                self.mock_find_near_matches_substitutions,
-            find_near_matches_generic_linear_programming=
-                self.mock_find_near_matches_generic_linear_programming,
+            find_near_matches_generic=
+                self.mock_find_near_matches_generic,
        )
        self.addCleanup(patcher.stop)
        patcher.start()
@@ -154,14 +154,14 @@ class TestFindNearMatches(unittest.TestCase):
        )

    def test_generic(self):
-        self.mock_find_near_matches_generic_linear_programming.return_value = [42]
+        self.mock_find_near_matches_generic.return_value = [42]

        self.assertEqual(
            find_near_matches('a', 'a', 1, 1, 1),
            [42],
        )
        self.assertEqual(
-            self.mock_find_near_matches_generic_linear_programming.call_count,
+            self.mock_find_near_matches_generic.call_count,
            1,
        )

@@ -170,6 +170,6 @@ class TestFindNearMatches(unittest.TestCase):
            [42],
        )
        self.assertEqual(
-            self.mock_find_near_matches_generic_linear_programming.call_count,
+            self.mock_find_near_matches_generic.call_count,
            2,
        )
--- a/tests/test_generic_search.py
+++ b/tests/test_generic_search.py
@@ -9,7 +9,7 @@ from fuzzysearch.generic_search import \
    has_near_match_generic_ngrams as hnm_generic_ngrams


-class TestGenericSearchLPAsLevenshtein(TestFindNearMatchesLevenshteinBase,
+class TestGenericSearchLpAsLevenshtein(TestFindNearMatchesLevenshteinBase,
                                       unittest.TestCase):
    def search(self, subsequence, sequence, max_l_dist):
        return [
@@ -28,7 +28,7 @@ class TestGenericSearchNgramsAsLevenshtein(TestFindNearMatchesLevenshteinBase,
                                  max_l_dist, max_l_dist, max_l_dist)


-class TestGenericSearchLPAsSubstitutionsOnly(TestSubstitionsOnlyBase,
+class TestGenericSearchLpAsSubstitutionsOnly(TestSubstitionsOnlyBase,
                                             unittest.TestCase):
    def search(self, subsequence, sequence, max_subs):
        return list(
@@ -90,11 +90,6 @@ class TestGenericSearchBase(object):
            [Match(start=4, end=7, dist=0)],
        )

-        self.assertListEqual(
-            self.search('def', 'abcddefg', 0, 1, 0),
-            [Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
-        )
-
        self.assertIn(
            Match(start=4, end=7, dist=0),
            self.search('def', 'abcddefg', 0, 0, 1),
@@ -129,6 +124,70 @@ class TestGenericSearchBase(object):
            [Match(start=3, end=5, dist=1)],
        )

+    def test_valid_none_arguments(self):
+        # check that no exception is raised when some values are None
+        self.assertEqual(
+            self.search('a', 'b', 0, None, None, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, 0, None, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, None, 0, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', 0, 0, None, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', 0, None, 0, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, 0, 0, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, None, None, 0),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', 0, 0, 0, None),
+            [],
+        )
+
+    def test_invalid_none_arguments(self):
+        # check that an exception is raised when max_l_dist is None as well as
+        # at least one other limitation
+        with self.assertRaises(ValueError):
+            self.search('a', 'b', None, None, None, None)
+
+
+class TestGenericSearchLp(TestGenericSearchBase, unittest.TestCase):
+    def search(self, pattern, sequence, max_subs, max_ins, max_dels,
+               max_l_dist=None):
+        return list(fnm_generic_lp(pattern, sequence,
+                                   max_subs, max_ins, max_dels, max_l_dist))
+
+    def test_double_first_item_two_results(self):
+        # sequence = 'abcdefg'
+        # pattern = 'bde'
+        self.assertListEqual(
+            self.search('def', 'abcddefg', 0, 1, 0),
+            [Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
+        )
+
+    def test_missing_second_item_complex(self):
        self.assertListEqual(
            self.search('bde', 'abcdefg', 1, 1, 1, 1),
            [Match(start=1, end=5, dist=1),
@@ -136,7 +195,6 @@ class TestGenericSearchBase(object):
             Match(start=3, end=5, dist=1)],
        )

-    def test_missing_second_item_complex(self):
        self.assertTrue(
            set([
                Match(start=1, end=5, dist=1),
@@ -148,36 +206,6 @@ class TestGenericSearchBase(object):
            ))
        )

-    def test_argument_handling(self):
-        # check that no exception is raised when some values are None
-        self.assertEqual(
-            self.search('a', 'b', 0, None, None, None),
-            [],
-        )
-
-        self.assertEqual(
-            self.search('a', 'b', None, 0, None, None),
-            [],
-        )
-
-        self.assertEqual(
-            self.search('a', 'b', None, None, 0, None),
-            [],
-        )
-
-        self.assertEqual(
-            self.search('a', 'b', None, None, None, 0),
-            [],
-        )
-
-
-class TestGenericSearchLP(TestGenericSearchBase, unittest.TestCase):
-    def search(self, pattern, sequence, max_subs, max_ins, max_dels,
-               max_l_dist=None):
-        return list(fnm_generic_lp(pattern, sequence,
-                                   max_subs, max_ins, max_dels, max_l_dist))
-
-
 class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
    def search(self, pattern, sequence, max_subs, max_ins, max_dels,
               max_l_dist=None):
@@ -185,15 +213,13 @@ class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
                                  max_subs, max_ins, max_dels, max_l_dist)

    def test_missing_second_item_complex(self):
-        pass
-
-    @unittest.skip("Ngrams search doesn't return overlapping matches")
-    def test_double_first_item(self):
-        return super(TestGenericSearchNgrams, self).test_double_first_item()
-
-    @unittest.skip("Ngrams search doesn't return overlapping matches")
-    def test_missing_second_item(self):
-        return super(TestGenericSearchNgrams, self).test_double_first_item()
+        self.assertTrue(
+            set(self.search('bde', 'abcdefg', 1, 1, 1, 1)).issubset([
+                Match(start=1, end=5, dist=1),
+                Match(start=2, end=5, dist=1),
+                Match(start=3, end=5, dist=1),
+            ])
+        )


 class TestHasNearMatchGenericNgramsAsSubstitutionsOnly(
--- a/tests/test_generic_search_cython.py
+++ b/tests/test_generic_search_cython.py
@@ -5,11 +5,12 @@ from tests.test_substitutions_only import TestSubstitionsOnlyBase

 try:
    from fuzzysearch._generic_search import \
-        c_find_near_matches_generic_linear_programming as c_fnm_generic_lp
+        c_find_near_matches_generic_linear_programming as c_fnm_generic_lp, \
+        c_find_near_matches_generic_ngrams as c_fnm_generic_ngrams
 except ImportError:
    pass
 else:
-    class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
+    class TestGenericSearchLpAsLevenshtein(TestFindNearMatchesLevenshteinBase,
                                         unittest.TestCase):
        def search(self, subsequence, sequence, max_l_dist):
            return [
@@ -17,13 +18,27 @@ else:
                for group in group_matches(
                    c_fnm_generic_lp(subsequence.encode('ascii'),
                                     sequence.encode('ascii'),
-                                     max_l_dist, max_l_dist, max_l_dist, max_l_dist)
+                                     max_l_dist, max_l_dist,
+                                     max_l_dist, max_l_dist)
+                )
+            ]
+
+    class TestGenericSearchNgramsAsLevenshtein(
+        TestFindNearMatchesLevenshteinBase, unittest.TestCase):
+        def search(self, subsequence, sequence, max_l_dist):
+            return [
+                get_best_match_in_group(group)
+                for group in group_matches(
+                    c_fnm_generic_ngrams(subsequence.encode('ascii'),
+                                         sequence.encode('ascii'),
+                                         max_l_dist, max_l_dist,
+                                         max_l_dist, max_l_dist)
                )
            ]


-    class TestGenericSearchAsSubstitutionsOnly(TestSubstitionsOnlyBase,
-                                               unittest.TestCase):
+    class TestGenericSearchLpAsSubstitutionsOnly(TestSubstitionsOnlyBase,
+                                                 unittest.TestCase):
        def search(self, subsequence, sequence, max_subs):
            return list(
                c_fnm_generic_lp(subsequence.encode('ascii'),
@@ -32,15 +47,25 @@ else:
            )


-    class TestGenericSearch(unittest.TestCase):
-        def search(self, pattern, sequence, max_subs, max_ins, max_dels,
-                   max_l_dist=None):
-            return list(
-                c_fnm_generic_lp(pattern.encode('ascii'),
-                                 sequence.encode('ascii'),
-                                 max_subs, max_ins, max_dels, max_l_dist)
-            )
+    class TestGenericSearchNgramsAsSubstitutionsOnly(TestSubstitionsOnlyBase,
+                                                     unittest.TestCase):
+        def search(self, subsequence, sequence, max_subs):
+            return [
+                get_best_match_in_group(group)
+                for group in group_matches(
+                    c_fnm_generic_ngrams(subsequence.encode('ascii'),
+                                         sequence.encode('ascii'),
+                                         max_subs, 0, 0, max_subs)
+                )
+        ]

+        @unittest.skip("Ngrams search doesn't return overlapping matches")
+        def test_double_first_item(self):
+            return super(TestGenericSearchNgramsAsSubstitutionsOnly,
+                         self).test_double_first_item()
+
+
+    class TestGenericSearchBase(object):
        def test_empty_sequence(self):
            self.assertEqual([], self.search('PATTERN', '', 0, 0, 0))

@@ -78,11 +103,6 @@ else:
                self.search('def', 'abcddefg', 1, 0, 0),
            )

-            self.assertListEqual(
-                [Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
-                self.search('def', 'abcddefg', 0, 1, 0),
-            )
-
            self.assertIn(
                Match(start=4, end=7, dist=0),
                self.search('def', 'abcddefg', 0, 0, 1),
@@ -117,6 +137,65 @@ else:
                [Match(start=3, end=5, dist=1)],
            )

+        def test_valid_none_arguments(self):
+            # check that no exception is raised when some values are None
+            self.assertEqual(
+                self.search('a', 'b', 0, None, None, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', None, 0, None, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', None, None, 0, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', 0, 0, None, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', 0, None, 0, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', None, 0, 0, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', None, None, None, 0),
+                [],
+            )
+
+            self.assertEqual(
+                self.search('a', 'b', 0, 0, 0, None),
+                [],
+            )
+
+    class TestGenericSearchLp(TestGenericSearchBase, unittest.TestCase):
+        def search(self, pattern, sequence, max_subs, max_ins, max_dels,
+                   max_l_dist=None):
+            return list(c_fnm_generic_lp(pattern.encode('ascii'),
+                                         sequence.encode('ascii'),
+                                         max_subs, max_ins,
+                                         max_dels, max_l_dist))
+
+        def test_double_first_item_two_results(self):
+            # sequence = 'abcdefg'
+            # pattern = 'bde'
+            self.assertListEqual(
+                self.search('def', 'abcddefg', 0, 1, 0),
+                [Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
+            )
+
+        def test_missing_second_item_complex(self):
            self.assertListEqual(
                self.search('bde', 'abcdefg', 1, 1, 1, 1),
                [Match(start=1, end=5, dist=1),
@@ -135,24 +214,24 @@ else:
                ))
            )

-        def test_argument_handling(self):
-            # check that no exception is raised when some values are None
-            self.assertEqual(
-                self.search('a', 'b', 0, None, None, None),
-                [],
-            )
+    class TestGenericSearchNgrams(TestGenericSearchBase, unittest.TestCase):
+        def search(self, pattern, sequence, max_subs, max_ins, max_dels,
+                   max_l_dist=None):
+            return [
+                get_best_match_in_group(group)
+                for group in group_matches(
+                    c_fnm_generic_ngrams(pattern.encode('ascii'),
+                                         sequence.encode('ascii'),
+                                         max_subs, max_ins,
+                                         max_dels, max_l_dist)
+                )
+            ]

-            self.assertEqual(
-                self.search('a', 'b', None, 0, None, None),
-                [],
-            )
-
-            self.assertEqual(
-                self.search('a', 'b', None, None, 0, None),
-                [],
-            )
-
-            self.assertEqual(
-                self.search('a', 'b', None, None, None, 0),
-                [],
+        def test_missing_second_item_complex(self):
+            self.assertTrue(
+                set(self.search('bde', 'abcdefg', 1, 1, 1, 1)).issubset([
+                    Match(start=1, end=5, dist=1),
+                    Match(start=2, end=5, dist=1),
+                    Match(start=3, end=5, dist=1),
+                ])
            )