diff --git a/fuzzysearch/_generic_search.pyx b/fuzzysearch/_generic_search.pyx
new file mode 100644
index 0000000..892d811
--- /dev/null
+++ b/fuzzysearch/_generic_search.pyx
@@ -0,0 +1,208 @@
+from libc.stdlib cimport free, malloc, realloc
+
+from fuzzysearch.common import Match
+
+cdef struct GenericSearchCandidate:
+    int start, subseq_index, l_dist, n_subs, n_ins, n_dels
+
+
+def find_near_matches_generic_linear_programming(subsequence, sequence,
+                                                 max_substitutions,
+                                                 max_insertions,
+                                                 max_deletions,
+                                                 max_l_dist=None):
+    """Search for near-matches of subsequence in sequence.
+
+    This is a generator yielding a ``Match(start, end, dist)`` for each
+    near-match found.  The nearly-matching parts of the sequence must meet
+    the following limitations (relative to the subsequence):
+
+    * the maximum allowed number of character substitutions
+    * the maximum allowed number of new characters inserted
+    * the maximum allowed number of character deletions
+    * the maximum allowed total number of substitutions, insertions and
+      deletions (``max_l_dist``)
+
+    A limit given as ``None`` is treated as zero.  ``max_l_dist`` is capped
+    at the sum of the three specific limits, and defaults to that sum.
+
+    Raises ``ValueError`` (once iteration starts) if ``subsequence`` is
+    empty, and ``MemoryError`` if the candidate buffers cannot be allocated.
+    """
+    if not subsequence:
+        raise ValueError('Given subsequence is empty!')
+
+    # optimization: prepare some often used things in advance
+    cdef int _subseq_len = len(subsequence)
+
+    # Normalize the limits to C ints once, treating None as zero.  Using a
+    # None limit directly in the arithmetic and comparisons below (e.g.
+    # ``max_deletions - cand.n_dels``) would raise a TypeError.
+    cdef int max_subs = 0 if max_substitutions is None else max_substitutions
+    cdef int max_ins = 0 if max_insertions is None else max_insertions
+    cdef int max_dels = 0 if max_deletions is None else max_deletions
+    cdef int maxes_sum = max_subs + max_ins + max_dels
+    cdef int max_dist = maxes_sum
+    if max_l_dist is not None and max_l_dist < maxes_sum:
+        max_dist = max_l_dist
+
+    # The candidate buffers are heap-allocated and grown on demand: each
+    # candidate examined for a sequence char can add up to 3 new candidates,
+    # so fixed-size buffers could be written past their end.
+    cdef int capacity = 1000
+    cdef int needed
+    cdef GenericSearchCandidate* candidates = <GenericSearchCandidate*> malloc(
+        capacity * sizeof(GenericSearchCandidate))
+    cdef GenericSearchCandidate* new_candidates = <GenericSearchCandidate*> malloc(
+        capacity * sizeof(GenericSearchCandidate))
+    cdef GenericSearchCandidate* _tmp
+    cdef GenericSearchCandidate cand
+    cdef int n_candidates = 0
+    cdef int n_new_candidates = 0
+    cdef int n_cand
+    cdef int n_skipped
+
+    try:
+        if candidates == NULL or new_candidates == NULL:
+            raise MemoryError()
+
+        for index, char in enumerate(sequence):
+            # make room for the candidate starting at this index, plus up
+            # to 3 new candidates for every candidate examined below
+            needed = 3 * (n_candidates + 1)
+            if needed > capacity:
+                while capacity < needed:
+                    capacity *= 2
+                _tmp = <GenericSearchCandidate*> realloc(
+                    candidates, capacity * sizeof(GenericSearchCandidate))
+                if _tmp == NULL:
+                    raise MemoryError()
+                candidates = _tmp
+                _tmp = <GenericSearchCandidate*> realloc(
+                    new_candidates, capacity * sizeof(GenericSearchCandidate))
+                if _tmp == NULL:
+                    raise MemoryError()
+                new_candidates = _tmp
+
+            candidates[n_candidates] = GenericSearchCandidate(
+                index, 0, 0, 0, 0, 0)
+            n_candidates += 1
+
+            for n_cand in range(n_candidates):
+                cand = candidates[n_cand]
+                # if this sequence char is the candidate's next expected char
+                if char == subsequence[cand.subseq_index]:
+                    # if reached the end of the subsequence, yield a match
+                    if cand.subseq_index + 1 == _subseq_len:
+                        yield Match(cand.start, index + 1, cand.l_dist)
+                    # otherwise, update the candidate's subseq_index and
+                    # keep it
+                    else:
+                        new_candidates[n_new_candidates] = GenericSearchCandidate(
+                            cand.start, cand.subseq_index + 1,
+                            cand.l_dist, cand.n_subs,
+                            cand.n_ins, cand.n_dels,
+                        )
+                        n_new_candidates += 1
+
+                # if this sequence char is *not* the candidate's next
+                # expected char
+                else:
+                    # we can try skipping a sequence or sub-sequence char (or
+                    # both), unless this candidate has already skipped the
+                    # maximum allowed number of characters
+                    if cand.l_dist == max_dist:
+                        continue
+
+                    if cand.n_ins < max_ins:
+                        # add a candidate skipping a sequence char
+                        new_candidates[n_new_candidates] = GenericSearchCandidate(
+                            cand.start, cand.subseq_index,
+                            cand.l_dist + 1, cand.n_subs,
+                            cand.n_ins + 1, cand.n_dels,
+                        )
+                        n_new_candidates += 1
+
+                    if cand.subseq_index + 1 < _subseq_len:
+                        if cand.n_subs < max_subs:
+                            # add a candidate skipping both a sequence char
+                            # and a subsequence char (a substitution)
+                            new_candidates[n_new_candidates] = GenericSearchCandidate(
+                                cand.start, cand.subseq_index + 1,
+                                cand.l_dist + 1, cand.n_subs + 1,
+                                cand.n_ins, cand.n_dels,
+                            )
+                            n_new_candidates += 1
+                        elif cand.n_dels < max_dels and cand.n_ins < max_ins:
+                            # add a candidate skipping both a sequence char
+                            # and a subsequence char, counted as an insertion
+                            # plus a deletion
+                            new_candidates[n_new_candidates] = GenericSearchCandidate(
+                                cand.start, cand.subseq_index + 1,
+                                cand.l_dist + 1, cand.n_subs,
+                                cand.n_ins + 1, cand.n_dels + 1,
+                            )
+                            n_new_candidates += 1
+                    else:
+                        # cand.subseq_index == _subseq_len - 1
+                        if (
+                            cand.n_subs < max_subs or
+                            (
+                                cand.n_dels < max_dels and
+                                cand.n_ins < max_ins
+                            )
+                        ):
+                            yield Match(cand.start, index + 1, cand.l_dist + 1)
+
+                    # try skipping subsequence chars
+                    for n_skipped in range(
+                            1,
+                            min(max_dels - cand.n_dels,
+                                max_dist - cand.l_dist) + 1):
+                        # if skipping n_skipped sub-sequence chars reaches
+                        # the end of the sub-sequence, yield a match
+                        if cand.subseq_index + n_skipped == _subseq_len:
+                            yield Match(cand.start, index + 1,
+                                        cand.l_dist + n_skipped)
+                            break
+                        # otherwise, if skipping n_skipped sub-sequence chars
+                        # reaches a sub-sequence char identical to this
+                        # sequence char ...
+                        elif subsequence[cand.subseq_index + n_skipped] == char:
+                            # if this is the last char of the sub-sequence,
+                            # yield a match
+                            if cand.subseq_index + n_skipped + 1 == _subseq_len:
+                                yield Match(cand.start, index + 1,
+                                            cand.l_dist + n_skipped)
+                            # otherwise add a candidate skipping n_skipped
+                            # subsequence chars
+                            else:
+                                new_candidates[n_new_candidates] = GenericSearchCandidate(
+                                    cand.start, cand.subseq_index + 1 + n_skipped,
+                                    cand.l_dist + n_skipped, cand.n_subs,
+                                    cand.n_ins, cand.n_dels + n_skipped,
+                                )
+                                n_new_candidates += 1
+                            break
+                    # note: if the above loop ends without a break, that
+                    # means that no candidate could be added / yielded by
+                    # skipping sub-sequence chars
+
+            # candidates = new_candidates; new_candidates = []
+            _tmp = candidates
+            candidates = new_candidates
+            new_candidates = _tmp
+            n_candidates = n_new_candidates
+            n_new_candidates = 0
+
+        for n_cand in range(n_candidates):
+            cand = candidates[n_cand]
+            # note: index + 1 == length(sequence)
+            n_skipped = _subseq_len - cand.subseq_index
+            if cand.n_dels + n_skipped <= max_dels and \
+               cand.l_dist + n_skipped <= max_dist:
+                yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
+    finally:
+        # runs when the generator is exhausted, closed or discarded
+        free(candidates)
+        free(new_candidates)
diff --git a/setup.py b/setup.py
index 5e9b777..ef84b57 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 import os
 import sys
-
+from Cython.Build import cythonize
 
 try:
     from setuptools import setup
@@ -29,9 +29,9 @@ setup(
         'fuzzysearch',
     ],
     package_dir={'fuzzysearch': 'fuzzysearch'},
+    ext_modules=cythonize("fuzzysearch/_generic_search.pyx"),
     include_package_data=True,
-    install_requires=[
-    ],
+    install_requires=[],
     use_2to3=True,
     license="MIT",
     zip_safe=False,
diff --git 
a/tests/test_generic_search.py b/tests/test_generic_search.py
index d70d28d..5eec30d 100644
--- a/tests/test_generic_search.py
+++ b/tests/test_generic_search.py
@@ -1,9 +1,9 @@
 from tests.compat import unittest
 from tests.test_levenshtein import TestFindNearMatchesLevenshteinBase
 from fuzzysearch.common import Match, get_best_match_in_group, group_matches
+from tests.test_substitutions_only import TestSubstitionsOnlyBase
 from fuzzysearch.generic_search import \
     find_near_matches_generic_linear_programming as fnm_generic_lp
-from tests.test_substitutions_only import TestSubstitionsOnlyBase
 
 
 class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
diff --git a/tests/test_generic_search_cython.py b/tests/test_generic_search_cython.py
new file mode 100644
index 0000000..1458814
--- /dev/null
+++ b/tests/test_generic_search_cython.py
@@ -0,0 +1,117 @@
+from tests.compat import unittest
+from tests.test_levenshtein import TestFindNearMatchesLevenshteinBase
+from fuzzysearch.common import Match, get_best_match_in_group, group_matches
+from tests.test_substitutions_only import TestSubstitionsOnlyBase
+
+# import the Cython module through pyximport, removing the import hooks
+# again as soon as the module has been loaded
+import pyximport
+pyimporter, pyximporter = pyximport.install()
+from fuzzysearch._generic_search import \
+    find_near_matches_generic_linear_programming as fnm_generic_lp
+pyximport.uninstall(pyimporter, pyximporter)
+
+
+class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
+                                     unittest.TestCase):
+    """Run the Levenshtein search test cases against the Cython search."""
+
+    def search(self, subsequence, sequence, max_l_dist):
+        # every specific limit is set to the overall distance limit
+        found = fnm_generic_lp(subsequence, sequence, max_l_dist,
+                               max_l_dist, max_l_dist, max_l_dist)
+        best_matches = []
+        for match_group in group_matches(found):
+            best_matches.append(get_best_match_in_group(match_group))
+        return best_matches
+
+
+class TestGenericSearchAsSubstitutionsOnly(TestSubstitionsOnlyBase,
+                                           unittest.TestCase):
+    """Run the substitutions-only test cases against the Cython search."""
+
+    def search(self, subsequence, sequence, max_subs):
+        # no insertions or deletions are allowed
+        found = fnm_generic_lp(subsequence, sequence, max_subs, 0, 0,
+                               max_subs)
+        return list(found)
+
+
+class TestGenericSearch(unittest.TestCase):
+    """Test cases exercising the Cython generic search directly."""
+
+    def search(self, pattern, sequence, max_subs, max_ins, max_dels,
+               max_l_dist=None):
+        found = fnm_generic_lp(pattern, sequence, max_subs,
+                               max_ins, max_dels, max_l_dist)
+        return list(found)
+
+    def test_empty_sequence(self):
+        found = self.search('PATTERN', '', 0, 0, 0)
+        self.assertEqual([], found)
+
+    def test_empty_subsequence_exeption(self):
+        self.assertRaises(ValueError, self.search, '', 'TEXT', 0, 0, 0)
+
+    def test_match_identical_sequence(self):
+        text = 'PATTERN'
+        whole_text = Match(start=0, end=len(text), dist=0)
+        self.assertEqual([whole_text], self.search(text, text, 0, 0, 0))
+
+    def test_substring(self):
+        found = self.search('PATTERN', 'aaaaaaaaaaPATTERNaaaaaaaaa', 0, 0, 0)
+        self.assertEqual([Match(start=10, end=17, dist=0)], found)
+
+    def test_double_first_item(self):
+        # sequence = 'abcddefg'
+        # pattern = 'def'
+        pattern, text = 'def', 'abcddefg'
+        exact = Match(start=4, end=7, dist=0)
+
+        self.assertEqual([exact], self.search(pattern, text, 0, 0, 0))
+        self.assertEqual([exact], self.search(pattern, text, 1, 0, 0))
+        self.assertListEqual(
+            [Match(start=3, end=7, dist=1), exact],
+            self.search(pattern, text, 0, 1, 0),
+        )
+        self.assertIn(exact, self.search(pattern, text, 0, 0, 1))
+        self.assertEqual([exact], self.search(pattern, text, 0, 1, 0, 0))
+
+    def test_missing_second_item(self):
+        # sequence = 'abcdefg'
+        # pattern = 'bde'
+        pattern, text = 'bde', 'abcdefg'
+        via_insertion = Match(start=1, end=5, dist=1)
+        via_substitution = Match(start=2, end=5, dist=1)
+        via_deletion = Match(start=3, end=5, dist=1)
+
+        self.assertEqual(self.search(pattern, text, 0, 1, 0),
+                         [via_insertion])
+        self.assertEqual(self.search(pattern, text, 0, 0, 0), [])
+        self.assertEqual(self.search(pattern, text, 1, 0, 0),
+                         [via_substitution])
+        self.assertEqual(self.search(pattern, text, 0, 0, 1),
+                         [via_deletion])
+        self.assertListEqual(
+            self.search(pattern, text, 1, 1, 1, 1),
+            [via_insertion, via_substitution, via_deletion],
+        )
+
+        expected = set([
+            via_insertion,
+            via_substitution,
+            via_deletion,
+            Match(start=2, end=5, dist=3),
+        ])
+        found = set(self.search(pattern, text, 1, 1, 1, 3))
+        self.assertTrue(expected <= found)
+
+    def test_argument_handling(self):
+        # check that no exception is raised when some values are None
+        for limits in [
+            (0, None, None, None),
+            (None, 0, None, None),
+            (None, None, 0, None),
+            (None, None, None, 0),
+        ]:
+            self.assertEqual(self.search('a', 'b', *limits), [])