first working Cython version of generic search

2014-03-28 10:49:21 +03:00 · 2014-03-28 10:49:21 +03:00 · 2863b57236
parent 8ba0cc8dc3
commit 2863b57236
4 changed files with 313 additions and 4 deletions
--- a/fuzzysearch/_generic_search.pyx
+++ b/fuzzysearch/_generic_search.pyx
@ -0,0 +1,156 @@
+from fuzzysearch.common import Match
+
+# C-level record describing one partial match tracked by the search.
+# Fields, in the positional order used when constructing the struct:
+#   start        -- sequence index at which this candidate was created
+#   subseq_index -- index of the next subsequence item the candidate expects
+#   l_dist       -- total edit count accumulated so far
+#   n_subs       -- number of substitutions used so far
+#   n_ins        -- number of insertions (skipped sequence items) used so far
+#   n_dels       -- number of deletions (skipped subsequence items) used so far
+cdef struct GenericSearchCandidate:
+    int start, subseq_index, l_dist, n_subs, n_ins, n_dels
+
+
+def find_near_matches_generic_linear_programming(subsequence, sequence,
+                                                 max_substitutions,
+                                                 max_insertions,
+                                                 max_deletions,
+                                                 max_l_dist=None):
+    """Search for near-matches of subsequence in sequence.
+
+    This is a generator: it walks the sequence once, keeping a set of
+    candidates (partial matches), and yields a ``Match(start, end, dist)``
+    whenever a candidate reaches the end of the subsequence.
+
+    The nearly-matching parts of the sequence must meet the following
+    limitations (relative to the subsequence):
+
+    * max_substitutions: the maximum allowed number of substituted items
+    * max_insertions: the maximum allowed number of extra sequence items
+    * max_deletions: the maximum allowed number of skipped subsequence items
+    * max_l_dist: the maximum allowed total number of substitutions,
+      insertions and deletions; if None, or greater than the sum of the three
+      limits above, the sum is used instead
+
+    Raises ValueError if the subsequence is empty.
+
+    Several matches with the same end index may be yielded; no grouping or
+    de-duplication is done here.
+
+    NOTE(review): None is treated as 0 only when summing the three limits;
+    the comparisons and the subtraction further down use the arguments as
+    given, so the subtraction on max_deletions would fail for None if that
+    line were reached -- confirm the intended handling of None.
+    """
+    if not subsequence:
+        raise ValueError('Given subsequence is empty!')
+
+    # optimization: prepare some often used things in advance
+    _subseq_len = len(subsequence)
+
+    # the per-kind limits bound the total: clamp max_l_dist to their sum
+    maxes_sum = sum(
+        (x if x is not None else 0)
+        for x in [max_substitutions, max_insertions, max_deletions]
+    )
+    if max_l_dist is None or max_l_dist >= maxes_sum:
+        max_l_dist = maxes_sum
+
+    # Two fixed-size buffers used alternately: `candidates` holds the
+    # candidates being processed for the current sequence item, and
+    # `new_candidates` collects those that survive to the next item.
+    # NOTE(review): each buffer holds 1000 entries and nothing below checks
+    # n_candidates / n_new_candidates against that capacity before writing.
+    cdef GenericSearchCandidate[1000] _candidates1
+    cdef GenericSearchCandidate[1000] _candidates2
+    cdef GenericSearchCandidate* candidates = _candidates1
+    cdef GenericSearchCandidate* new_candidates = _candidates2
+    cdef GenericSearchCandidate* _tmp
+    cdef GenericSearchCandidate cand
+    cdef int n_candidates = 0
+    cdef int n_new_candidates = 0
+    cdef int n_cand
+
+    for index, char in enumerate(sequence):
+        # every sequence position may be the start of a match: add a fresh
+        # candidate starting here with no edits used
+        candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
+        n_candidates += 1
+
+        for n_cand in xrange(n_candidates):
+            cand = candidates[n_cand]
+            # if this sequence char is the candidate's next expected char
+            if char == subsequence[cand.subseq_index]:
+                # if reached the end of the subsequence, return a match
+                if cand.subseq_index + 1 == _subseq_len:
+                    yield Match(cand.start, index + 1, cand.l_dist)
+                # otherwise, update the candidate's subseq_index and keep it
+                else:
+                    new_candidates[n_new_candidates] = GenericSearchCandidate(
+                        cand.start, cand.subseq_index + 1,
+                        cand.l_dist, cand.n_subs,
+                        cand.n_ins, cand.n_dels,
+                    )
+                    n_new_candidates += 1
+
+            # if this sequence char is *not* the candidate's next expected char
+            else:
+                # we can try skipping a sequence or sub-sequence char (or both),
+                # unless this candidate has already skipped the maximum allowed
+                # number of characters
+                if cand.l_dist == max_l_dist:
+                    continue
+
+                if cand.n_ins < max_insertions:
+                    # add a candidate skipping a sequence char
+                    new_candidates[n_new_candidates] = GenericSearchCandidate(
+                        cand.start, cand.subseq_index,
+                        cand.l_dist + 1, cand.n_subs,
+                        cand.n_ins + 1, cand.n_dels,
+                    )
+                    n_new_candidates += 1
+
+                # the expected char is not the last one of the subsequence
+                if cand.subseq_index + 1 < _subseq_len:
+                    if cand.n_subs < max_substitutions:
+                        # add a candidate skipping both a sequence char and a
+                        # subsequence char
+                        new_candidates[n_new_candidates] = GenericSearchCandidate(
+                            cand.start, cand.subseq_index + 1,
+                            cand.l_dist + 1, cand.n_subs + 1,
+                            cand.n_ins, cand.n_dels,
+                        )
+                        n_new_candidates += 1
+                    elif cand.n_dels < max_deletions and cand.n_ins < max_insertions:
+                        # no substitutions left: take the same step (skip one
+                        # sequence char and one subsequence char) but count it
+                        # as one insertion plus one deletion; note that l_dist
+                        # still grows by only 1
+                        new_candidates[n_new_candidates] = GenericSearchCandidate(
+                            cand.start, cand.subseq_index + 1,
+                            cand.l_dist + 1, cand.n_subs,
+                            cand.n_ins + 1, cand.n_dels + 1,
+                        )
+                        n_new_candidates += 1
+                else:
+                    # cand.subseq_index == _subseq_len - 1
+                    # replacing the last expected char (by a substitution, or
+                    # by an insertion plus a deletion) completes the match
+                    if (
+                            cand.n_subs < max_substitutions or
+                            (
+                                cand.n_dels < max_deletions and
+                                cand.n_ins < max_insertions
+                            )
+                    ):
+                        yield Match(cand.start, index + 1, cand.l_dist + 1)
+
+                # try skipping subsequence chars
+                # (at most as many as both the remaining deletions and the
+                # remaining total distance allow)
+                for n_skipped in xrange(1, min(max_deletions - cand.n_dels, max_l_dist - cand.l_dist) + 1):
+                    # if skipping n_skipped sub-sequence chars reaches the end
+                    # of the sub-sequence, yield a match
+                    if cand.subseq_index + n_skipped == _subseq_len:
+                        yield Match(cand.start, index + 1,
+                                    cand.l_dist + n_skipped)
+                        break
+                    # otherwise, if skipping n_skipped sub-sequence chars
+                    # reaches a sub-sequence char identical to this sequence
+                    # char ...
+                    elif subsequence[cand.subseq_index + n_skipped] == char:
+                        # if this is the last char of the sub-sequence, yield
+                        # a match
+                        if cand.subseq_index + n_skipped + 1 == _subseq_len:
+                            yield Match(cand.start, index + 1,
+                                        cand.l_dist + n_skipped)
+                        # otherwise add a candidate skipping n_skipped
+                        # subsequence chars
+                        else:
+                            new_candidates[n_new_candidates] = GenericSearchCandidate(
+                                cand.start, cand.subseq_index + 1 + n_skipped,
+                                cand.l_dist + n_skipped, cand.n_subs,
+                                cand.n_ins, cand.n_dels + n_skipped,
+                            )
+                            n_new_candidates += 1
+                        break
+                # note: if the above loop ends without a break, that means that
+                # no candidate could be added / yielded by skipping sub-sequence
+                # chars
+
+        # swap the buffers: the surviving candidates become the current ones,
+        # and the old buffer is reused (emptied by resetting its count to 0)
+        _tmp = candidates
+        candidates = new_candidates
+        new_candidates = _tmp
+        n_candidates = n_new_candidates
+        n_new_candidates = 0
+
+    # the sequence is exhausted: a remaining candidate still matches if all of
+    # its not-yet-matched subsequence chars can be counted as deletions
+    # (for an empty sequence n_candidates is 0, so `index` is never read here)
+    for n_cand in xrange(n_candidates):
+        cand = candidates[n_cand]
+        # note: index + 1 == length(sequence)
+        n_skipped = _subseq_len - cand.subseq_index
+        if cand.n_dels + n_skipped <= max_deletions and \
+           cand.l_dist + n_skipped <= max_l_dist:
+            yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
--- a/setup.py
+++ b/setup.py
@ -3,7 +3,7 @@

 import os
 import sys
-
+from Cython.Build import cythonize

 try:
    from setuptools import setup
@ -29,9 +29,9 @@ setup(
        'fuzzysearch',
    ],
    package_dir={'fuzzysearch': 'fuzzysearch'},
+    ext_modules=cythonize("fuzzysearch/_generic_search.pyx"),
    include_package_data=True,
-    install_requires=[
-    ],
+    install_requires=[],
    use_2to3=True,
    license="MIT",
    zip_safe=False,
--- a/tests/test_generic_search.py
+++ b/tests/test_generic_search.py
@ -1,9 +1,9 @@
 from tests.compat import unittest
 from tests.test_levenshtein import TestFindNearMatchesLevenshteinBase
 from fuzzysearch.common import Match, get_best_match_in_group, group_matches
+from tests.test_substitutions_only import TestSubstitionsOnlyBase
 from fuzzysearch.generic_search import \
    find_near_matches_generic_linear_programming as fnm_generic_lp
-from tests.test_substitutions_only import TestSubstitionsOnlyBase


 class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
--- a/tests/test_generic_search_cython.py
+++ b/tests/test_generic_search_cython.py
@ -0,0 +1,153 @@
+from tests.compat import unittest
+from tests.test_levenshtein import TestFindNearMatchesLevenshteinBase
+from fuzzysearch.common import Match, get_best_match_in_group, group_matches
+from tests.test_substitutions_only import TestSubstitionsOnlyBase
+
+# Install the pyximport import hooks only around the import of the Cython
+# module, then remove them again using the importer objects returned by
+# install(), so that the hooks do not stay active for the rest of the tests.
+import pyximport
+pyimporter, pyximporter = pyximport.install()
+from fuzzysearch._generic_search import \
+    find_near_matches_generic_linear_programming as fnm_generic_lp
+pyximport.uninstall(pyimporter, pyximporter)
+
+
+# Runs the Cython generic search through the search() hook used by
+# TestFindNearMatchesLevenshteinBase (presumably the shared Levenshtein test
+# cases live in that base class -- it is not visible here).
+class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
+                                     unittest.TestCase):
+    def search(self, subsequence, sequence, max_l_dist):
+        # max_l_dist is passed as every per-kind limit and as the total limit,
+        # so only the total distance restricts the search; the raw matches are
+        # then grouped and one best match per group is returned
+        return [
+            get_best_match_in_group(group)
+            for group in group_matches(
+                fnm_generic_lp(subsequence, sequence, max_l_dist,
+                               max_l_dist, max_l_dist, max_l_dist)
+            )
+        ]
+
+
+# Runs the Cython generic search through the search() hook used by
+# TestSubstitionsOnlyBase (presumably the shared substitutions-only test cases
+# live in that base class -- it is not visible here).
+class TestGenericSearchAsSubstitutionsOnly(TestSubstitionsOnlyBase,
+                                           unittest.TestCase):
+    def search(self, subsequence, sequence, max_subs):
+        # insertions and deletions are disabled (both limits are 0); every
+        # match yielded by the search is returned, without grouping
+        return list(
+            fnm_generic_lp(subsequence, sequence, max_subs, 0, 0, max_subs)
+        )
+
+
+# Direct tests of the Cython generic search with separate limits for
+# substitutions, insertions and deletions.
+class TestGenericSearch(unittest.TestCase):
+    def search(self, pattern, sequence, max_subs, max_ins, max_dels,
+               max_l_dist=None):
+        # collect everything the generator yields, in the order it is yielded
+        return list(fnm_generic_lp(pattern, sequence, max_subs,
+                                   max_ins, max_dels, max_l_dist))
+
+    def test_empty_sequence(self):
+        self.assertEqual([], self.search('PATTERN', '', 0, 0, 0))
+
+    def test_empty_subsequence_exeption(self):
+        with self.assertRaises(ValueError):
+            self.search('', 'TEXT', 0, 0, 0)
+
+    def test_match_identical_sequence(self):
+        self.assertEqual(
+            [Match(start=0, end=len('PATTERN'), dist=0)],
+            self.search('PATTERN', 'PATTERN', 0, 0, 0),
+        )
+
+    def test_substring(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
+        expected_match = Match(start=10, end=17, dist=0)
+
+        self.assertEqual(
+            [expected_match],
+            self.search(substring, text, 0, 0, 0)
+        )
+
+    def test_double_first_item(self):
+        # sequence = 'abcddefg'
+        # pattern = 'def'
+
+        # no edits allowed: only the exact occurrence 'def' at [4:7]
+        self.assertEqual(
+            [Match(start=4, end=7, dist=0)],
+            self.search('def', 'abcddefg', 0, 0, 0),
+        )
+
+        # one substitution allowed
+        self.assertEqual(
+            [Match(start=4, end=7, dist=0)],
+            self.search('def', 'abcddefg', 1, 0, 0),
+        )
+
+        # one insertion allowed: 'ddef' at [3:7] also matches, with the second
+        # 'd' counted as an inserted item
+        self.assertListEqual(
+            [Match(start=3, end=7, dist=1), Match(start=4, end=7, dist=0)],
+            self.search('def', 'abcddefg', 0, 1, 0),
+        )
+
+        # one deletion allowed: only require that the exact match is present
+        self.assertIn(
+            Match(start=4, end=7, dist=0),
+            self.search('def', 'abcddefg', 0, 0, 1),
+        )
+
+        # one insertion allowed, but max_l_dist=0 forbids using it
+        self.assertEqual(
+            [Match(start=4, end=7, dist=0)],
+            self.search('def', 'abcddefg', 0, 1, 0, 0),
+        )
+
+    def test_missing_second_item(self):
+        # sequence = 'abcdefg'
+        # pattern = 'bde'
+
+        # one insertion allowed: 'bcde' at [1:5], with 'c' as the inserted item
+        self.assertEqual(
+            self.search('bde', 'abcdefg', 0, 1, 0),
+            [Match(start=1, end=5, dist=1)],
+        )
+
+        # no edits allowed: 'bde' does not occur exactly
+        self.assertEqual(
+            self.search('bde', 'abcdefg', 0, 0, 0),
+            [],
+        )
+
+        # one substitution allowed: 'cde' at [2:5], with 'c' in place of 'b'
+        self.assertEqual(
+            self.search('bde', 'abcdefg', 1, 0, 0),
+            [Match(start=2, end=5, dist=1)],
+        )
+
+        # one deletion allowed: 'de' at [3:5], with 'b' deleted
+        self.assertEqual(
+            self.search('bde', 'abcdefg', 0, 0, 1),
+            [Match(start=3, end=5, dist=1)],
+        )
+
+        # one edit of any kind, total distance limited to 1: all three of the
+        # above matches, in this order
+        self.assertListEqual(
+            self.search('bde', 'abcdefg', 1, 1, 1, 1),
+            [Match(start=1, end=5, dist=1),
+             Match(start=2, end=5, dist=1),
+             Match(start=3, end=5, dist=1)],
+        )
+
+        # total distance up to 3: the result must at least contain these
+        # matches (other matches may be yielded as well)
+        self.assertTrue(
+            set([
+                Match(start=1, end=5, dist=1),
+                Match(start=2, end=5, dist=1),
+                Match(start=3, end=5, dist=1),
+                Match(start=2, end=5, dist=3),
+            ]).issubset(set(
+                self.search('bde', 'abcdefg', 1, 1, 1, 3),
+            ))
+        )
+
+    def test_argument_handling(self):
+        # check that no exception is raised when some values are None
+        # (in each call exactly one of the four limits is 0, the rest are None)
+        self.assertEqual(
+            self.search('a', 'b', 0, None, None, None),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, 0, None, None),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, None, 0, None),
+            [],
+        )
+
+        self.assertEqual(
+            self.search('a', 'b', None, None, None, 0),
+            [],
+        )