diff --git a/fuzzysearch/no_deletions.py b/fuzzysearch/no_deletions.py new file mode 100644 index 0000000..76d832e --- /dev/null +++ b/fuzzysearch/no_deletions.py @@ -0,0 +1,121 @@ +"""fuzzy searching allowing subsitutions and insertions, but no deletions""" + +__all__ = [ + 'find_near_matches_no_deletions_ngrams', +] + +import array + +from fuzzysearch.common import Ngram, search_exact, Match + + +def _expand(subsequence, sequence, max_substitutions, max_insertions, + max_l_dist): + if not subsequence: + return (0, 0) + + n_subs = array.array('L', [0] * (max_insertions + 1)) + for subseq_index, char in enumerate(subsequence): + n_subs[0] += (char != sequence[subseq_index]) + for n_ins in range(1, max_insertions + 1): + n_subs[n_ins] = min( + n_subs[n_ins] + (char != sequence[subseq_index + n_ins]), + n_subs[n_ins - 1] + ) + + matches = [ + (_n_subs, _n_ins) for (_n_ins, _n_subs) in enumerate(n_subs) + if _n_subs <= max_substitutions + and _n_ins + _n_subs <= max_l_dist + ] + return [ + match for (i, match) in enumerate(matches) + if i == 0 or match[0] < matches[i-1][0] + ] + + +def find_near_matches_no_deletions_ngrams(subsequence, sequence, + max_substitutions, max_insertions, + max_l_dist=None): + """search for near-matches of subsequence in sequence + + This searches for near-matches, where the nearly-matching parts of the + sequence must meet the following limitations (relative to the subsequence): + + * the number of character substitutions must be less than max_substitutions + * no deletions or insertions are allowed + """ + if not subsequence: + raise ValueError('Given subsequence is empty!') + + _SUBSEQ_LEN = len(subsequence) + _SEQ_LEN = len(sequence) + + if max_l_dist is None or max_l_dist > max_substitutions + max_insertions: + max_l_dist = max_substitutions + max_insertions + max_substitutions = min(max_substitutions, max_l_dist) + max_insertions = min(max_insertions, max_l_dist) + + ngram_len = _SUBSEQ_LEN // (max_substitutions + max_insertions + 1) + if ngram_len == 0: + raise ValueError( + "The subsequence's length must be greater than max_subs + max_ins!" + ) + + ngrams = [ + Ngram(start, start + ngram_len) + for start in range(0, len(subsequence) - ngram_len + 1, ngram_len) + ] + matches = [] + matched_indexes = set() + + for ngram in ngrams: + _subseq_before = subsequence[:ngram.start] + _subseq_before_reversed = _subseq_before[::-1] + _subseq_after = subsequence[ngram.end:] + start_index = max(0, ngram.start - max_insertions) + end_index = min(_SEQ_LEN, _SEQ_LEN - (_SUBSEQ_LEN - ngram.end) + max_insertions) + + for index in search_exact(subsequence[ngram.start:ngram.end], sequence, start_index, end_index): + if index - ngram.start in matched_indexes: + continue + + seq_after = sequence[index + ngram_len:index + _SUBSEQ_LEN - ngram.start + max_insertions] + if seq_after.startswith(_subseq_after): + matches_after = [(0, 0)] + else: + matches_after = _expand(_subseq_after, seq_after, + max_substitutions, max_insertions, max_l_dist) + if not matches_after: + continue + + _max_substitutions = max_substitutions - min(m[0] for m in matches_after) + _max_insertions = max_insertions - min(m[1] for m in matches_after) + _max_l_dist = max_l_dist - min(m[0] + m[1] for m in matches_after) + seq_before = sequence[index - ngram.start - _max_insertions:index] + if seq_before.endswith(_subseq_before): + matches_before = [(0, 0)] + else: + matches_before = _expand( + _subseq_before_reversed, seq_before[::-1], + _max_substitutions, _max_insertions, _max_l_dist, + ) + + for (subs_before, ins_before) in matches_before: + for (subs_after, ins_after) in matches_after: + if ( + subs_before + subs_after <= max_substitutions and + ins_before + ins_after <= max_insertions and + subs_before + subs_after + ins_before + ins_after <= max_l_dist + ): + matches.append(Match( + start=index - ngram.start - ins_before, + end=index - ngram.start + _SUBSEQ_LEN + ins_after, + dist=subs_before + subs_after + ins_before + ins_after, + )) + matched_indexes |= set(range( + index - ngram.start - ins_before, + index - ngram.start - ins_before + max_insertions + 1, + )) + + return sorted(matches, key=lambda match: match.start) diff --git a/tests/test_no_deletions.py b/tests/test_no_deletions.py new file mode 100644 index 0000000..af7a599 --- /dev/null +++ b/tests/test_no_deletions.py @@ -0,0 +1,110 @@ +from tests.compat import unittest, mock + +from fuzzysearch.common import Match +from fuzzysearch.no_deletions import _expand, \ + find_near_matches_no_deletions_ngrams as fnm_nodels_ngrams +from tests.test_substitutions_only import TestFindNearMatchesSubstitionsNgrams + + +class TestExpand(unittest.TestCase): + def test_identical(self): + self.assertEqual(_expand('abc', 'abc', 0, 0, 0), [(0, 0)]) + + def test_startswith(self): + self.assertEqual(_expand('abc', 'abcdef', 0, 0, 0), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 1, 0, 1), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 2, 0, 2), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 0, 1, 1), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 0, 2, 2), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 1, 1, 1), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 1, 1, 2), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcdef', 2, 2, 2), [(0, 0)]) + + def test_one_missing(self): + # first item missing + self.assertEqual(_expand('abcd', 'bcd---', 0, 1, 1), []) + self.assertEqual(_expand('abcd', 'bcd---', 1, 0, 1), []) + self.assertEqual(_expand('abcd', 'bcd---', 1, 1, 2), []) + + # second item missing + self.assertEqual(_expand('abcd', 'acd---', 0, 1, 1), []) + self.assertEqual(_expand('abcd', 'acd---', 1, 0, 1), []) + self.assertEqual(_expand('abcd', 'acd---', 1, 1, 2), []) + + # last item missing + self.assertEqual(_expand('abcd', 'abc---', 0, 1, 1), []) + self.assertEqual(_expand('abcd', 'abc---', 1, 0, 1), [(1, 0)]) + self.assertEqual(_expand('abcd', 'abc---', 1, 1, 2), [(1, 0)]) + + def test_no_result(self): + self.assertEqual(_expand('abc', 'def', 0, 0, 0), []) + self.assertEqual(_expand('abc', 'defg', 1, 1, 1), []) + self.assertEqual(_expand('abc', 'defg', 1, 1, 2), []) + + def test_one_extra(self): + # extra first item + self.assertEqual(_expand('bcd', 'abcd', 0, 0, 0), []) + self.assertEqual(_expand('bcd', 'abcd', 0, 1, 1), [(0, 1)]) + + # extra third item + self.assertEqual(_expand('abd', 'abcd', 0, 0, 0), []) + self.assertEqual(_expand('abd', 'abcd', 0, 1, 1), [(0, 1)]) + + # extra last item + self.assertEqual(_expand('abc', 'abcd', 0, 0, 0), [(0, 0)]) + self.assertEqual(_expand('abc', 'abcd', 0, 1, 1), [(0, 0)]) + + def test_insert_and_substitute(self): + self.assertEqual(_expand('abcdefg', 'abc-def----', 1, 1, 2), [(1, 1)]) + self.assertEqual(_expand('abcdefg', 'abc-def----', 1, 1, 1), []) + self.assertEqual(_expand('abcdefg', 'abc-def----', 1, 0, 1), []) + self.assertEqual(_expand('abcdefg', 'abc-def----', 0, 1, 1), []) + + def test_double_first_item(self): + self.assertEqual(_expand('abc', 'aabc', 1, 1, 1), [(0, 1)]) + + def test_two_insertions(self): + self.assertEqual(_expand('abc', 'a--bc', 0, 2, 2), [(0, 2)]) + self.assertEqual(_expand('abc', 'a--bc', 2, 0, 2), [(2, 0)]) + self.assertEqual(_expand('abc', 'a--bc', 2, 2, 2), [(2, 0), (0, 2)]) + self.assertEqual(_expand('abc', 'a--bc', 1, 1, 2), []) + + +class TestFindNearMatchesNoDeletionsNgramsAsNoSubstituions( + TestFindNearMatchesSubstitionsNgrams, unittest.TestCase): + def search(self, subsequence, sequence, max_subs): + return fnm_nodels_ngrams(subsequence, sequence, max_subs, 0) + + +class TestFindNearMatchesNoDeletionsNgrams(unittest.TestCase): + def test_one_sub_one_ins(self): + sequence = 'abcdefghij' + pattern = 'bceXghi' + expected_match = Match(start=1, end=9, dist=2) + self.assertEqual(fnm_nodels_ngrams(pattern, sequence, 0, 0, 0), []) + self.assertEqual(fnm_nodels_ngrams(pattern, sequence, 0, 1, 2), []) + self.assertEqual(fnm_nodels_ngrams(pattern, sequence, 1, 0, 2), []) + self.assertEqual(fnm_nodels_ngrams(pattern, sequence, 1, 1, 1), []) + self.assertEqual( + fnm_nodels_ngrams(pattern, sequence, 1, 1, 2), + [expected_match], + ) + + def test_two_extra(self): + sequence = '--abc--de--' + pattern = 'abcde' + + self.assertEqual( + fnm_nodels_ngrams(pattern, sequence, 0, 2, 2), + [Match(start=2, end=9, dist=2)], + ) + + self.assertEqual( + fnm_nodels_ngrams(pattern, sequence, 2, 0, 2), + [Match(start=2, end=7, dist=2)], + ) + + self.assertEqual( + fnm_nodels_ngrams(pattern, sequence, 2, 2, 2), + [Match(start=2, end=7, dist=2), Match(start=2, end=9, dist=2)], + )