added fuzzy search allowing insertions and substitutions but no deletions

This commit is contained in:
Tal Einat 2014-03-17 17:07:59 +02:00
parent 10815b726b
commit 26f927dbec
2 changed files with 231 additions and 0 deletions

121
fuzzysearch/no_deletions.py Normal file
View File

@ -0,0 +1,121 @@
"""fuzzy searching allowing subsitutions and insertions, but no deletions"""
__all__ = [
'find_near_matches_no_deletions_ngrams',
]
import array
from fuzzysearch.common import Ngram, search_exact, Match
def _expand(subsequence, sequence, max_substitutions, max_insertions,
max_l_dist):
if not subsequence:
return (0, 0)
n_subs = array.array('L', [0] * (max_insertions + 1))
for subseq_index, char in enumerate(subsequence):
n_subs[0] += (char != sequence[subseq_index])
for n_ins in range(1, max_insertions + 1):
n_subs[n_ins] = min(
n_subs[n_ins] + (char != sequence[subseq_index + n_ins]),
n_subs[n_ins - 1]
)
matches = [
(_n_subs, _n_ins) for (_n_ins, _n_subs) in enumerate(n_subs)
if _n_subs <= max_substitutions
and _n_ins + _n_subs <= max_l_dist
]
return [
match for (i, match) in enumerate(matches)
if i == 0 or match[0] < matches[i-1][0]
]
def find_near_matches_no_deletions_ngrams(subsequence, sequence,
                                          max_substitutions, max_insertions,
                                          max_l_dist=None):
    """Search for near-matches of subsequence in sequence.

    A part of the sequence is a near-match if it can be obtained from the
    subsequence within all of the following limits:

    * at most max_substitutions items are substituted
    * at most max_insertions extra items are inserted
    * substitutions plus insertions total at most max_l_dist
    * no item of the subsequence is deleted

    max_l_dist defaults to, and is capped at, max_substitutions +
    max_insertions.

    Returns a list of Match objects sorted by their start index.

    Raises ValueError if the subsequence is empty, or if it is too short to
    be split into max_substitutions + max_insertions + 1 non-empty n-grams.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    _SUBSEQ_LEN = len(subsequence)
    _SEQ_LEN = len(sequence)

    # Normalize the limits so that none of them is looser than the others
    # allow.
    if max_l_dist is None or max_l_dist > max_substitutions + max_insertions:
        max_l_dist = max_substitutions + max_insertions
    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    # With at most (max_substitutions + max_insertions) edits, at least one
    # of (max_substitutions + max_insertions + 1) disjoint n-grams of the
    # subsequence must appear unchanged in any near-match.
    ngram_len = _SUBSEQ_LEN // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    ngrams = [
        Ngram(start, start + ngram_len)
        for start in range(0, _SUBSEQ_LEN - ngram_len + 1, ngram_len)
    ]

    matches = []
    matched_indexes = set()
    for ngram in ngrams:
        _subseq_before = subsequence[:ngram.start]
        _subseq_before_reversed = _subseq_before[::-1]
        _subseq_after = subsequence[ngram.end:]

        # NOTE(review): search_exact is assumed to yield the indexes at which
        # the n-gram occurs within sequence[start_index:end_index] -- confirm
        # against fuzzysearch.common.
        start_index = max(0, ngram.start - max_insertions)
        end_index = min(
            _SEQ_LEN,
            _SEQ_LEN - (_SUBSEQ_LEN - ngram.end) + max_insertions,
        )
        for index in search_exact(subsequence[ngram.start:ngram.end],
                                  sequence, start_index, end_index):
            match_start = index - ngram.start

            # Deletions are not allowed, so the whole part of the
            # subsequence before the n-gram must fit before this hit.
            if match_start < 0:
                continue
            if match_start in matched_indexes:
                continue

            # Try to expand forwards, after the n-gram.
            seq_after = sequence[
                index + ngram_len:
                match_start + _SUBSEQ_LEN + max_insertions
            ]
            n_spare_after = len(seq_after) - len(_subseq_after)
            if n_spare_after < 0:
                # Too close to the end of the sequence for the remainder of
                # the subsequence to fit.
                continue
            if seq_after.startswith(_subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(
                    _subseq_after, seq_after,
                    max_substitutions,
                    # never allow more insertions than there are spare items
                    min(max_insertions, n_spare_after),
                    max_l_dist,
                )
                if not matches_after:
                    continue

            # Whatever the cheapest forward expansion used up is no longer
            # available for expanding backwards.
            _max_substitutions = \
                max_substitutions - min(m[0] for m in matches_after)
            _max_insertions = \
                max_insertions - min(m[1] for m in matches_after)
            _max_l_dist = \
                max_l_dist - min(m[0] + m[1] for m in matches_after)

            # Try to expand backwards, before the n-gram.  The slice start
            # is clamped at zero, since a negative start index would wrap
            # around to the end of the sequence.
            before_start = max(0, match_start - _max_insertions)
            seq_before = sequence[before_start:index]
            if seq_before.endswith(_subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    _subseq_before_reversed, seq_before[::-1],
                    _max_substitutions,
                    # only the items actually present can be insertions
                    min(_max_insertions, match_start - before_start),
                    _max_l_dist,
                )

            # Combine every backward expansion with every forward expansion
            # and keep the combinations which satisfy the overall limits.
            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    n_subs = subs_before + subs_after
                    n_ins = ins_before + ins_after
                    if (
                        n_subs <= max_substitutions and
                        n_ins <= max_insertions and
                        n_subs + n_ins <= max_l_dist
                    ):
                        matches.append(Match(
                            start=match_start - ins_before,
                            end=match_start + _SUBSEQ_LEN + ins_after,
                            dist=n_subs + n_ins,
                        ))
                        # Remember the alignments covered by this match, so
                        # that hits of other n-grams which belong to it are
                        # skipped.
                        matched_indexes.update(range(
                            match_start - ins_before,
                            match_start - ins_before + max_insertions + 1,
                        ))

    return sorted(matches, key=lambda match: match.start)

110
tests/test_no_deletions.py Normal file
View File

@ -0,0 +1,110 @@
from tests.compat import unittest, mock
from fuzzysearch.common import Match
from fuzzysearch.no_deletions import _expand, \
find_near_matches_no_deletions_ngrams as fnm_nodels_ngrams
from tests.test_substitutions_only import TestFindNearMatchesSubstitionsNgrams
class TestExpand(unittest.TestCase):
    """Unit tests for the _expand() helper."""

    def _check(self, cases):
        """Assert, in order, that _expand(*args) == expected for each case."""
        for args, expected in cases:
            self.assertEqual(_expand(*args), expected)

    def test_identical(self):
        self._check([
            (('abc', 'abc', 0, 0, 0), [(0, 0)]),
        ])

    def test_startswith(self):
        # the result must be a perfect match whatever the limits are
        all_limits = [
            (0, 0, 0),
            (1, 0, 1),
            (2, 0, 2),
            (0, 1, 1),
            (0, 2, 2),
            (1, 1, 1),
            (1, 1, 2),
            (2, 2, 2),
        ]
        self._check([
            (('abc', 'abcdef') + limits, [(0, 0)])
            for limits in all_limits
        ])

    def test_one_missing(self):
        self._check([
            # first item missing
            (('abcd', 'bcd---', 0, 1, 1), []),
            (('abcd', 'bcd---', 1, 0, 1), []),
            (('abcd', 'bcd---', 1, 1, 2), []),
            # second item missing
            (('abcd', 'acd---', 0, 1, 1), []),
            (('abcd', 'acd---', 1, 0, 1), []),
            (('abcd', 'acd---', 1, 1, 2), []),
            # last item missing
            (('abcd', 'abc---', 0, 1, 1), []),
            (('abcd', 'abc---', 1, 0, 1), [(1, 0)]),
            (('abcd', 'abc---', 1, 1, 2), [(1, 0)]),
        ])

    def test_no_result(self):
        self._check([
            (('abc', 'def', 0, 0, 0), []),
            (('abc', 'defg', 1, 1, 1), []),
            (('abc', 'defg', 1, 1, 2), []),
        ])

    def test_one_extra(self):
        self._check([
            # extra first item
            (('bcd', 'abcd', 0, 0, 0), []),
            (('bcd', 'abcd', 0, 1, 1), [(0, 1)]),
            # extra third item
            (('abd', 'abcd', 0, 0, 0), []),
            (('abd', 'abcd', 0, 1, 1), [(0, 1)]),
            # extra last item
            (('abc', 'abcd', 0, 0, 0), [(0, 0)]),
            (('abc', 'abcd', 0, 1, 1), [(0, 0)]),
        ])

    def test_insert_and_substitute(self):
        subseq, seq = 'abcdefg', 'abc-def----'
        self._check([
            ((subseq, seq, 1, 1, 2), [(1, 1)]),
            ((subseq, seq, 1, 1, 1), []),
            ((subseq, seq, 1, 0, 1), []),
            ((subseq, seq, 0, 1, 1), []),
        ])

    def test_double_first_item(self):
        self._check([
            (('abc', 'aabc', 1, 1, 1), [(0, 1)]),
        ])

    def test_two_insertions(self):
        subseq, seq = 'abc', 'a--bc'
        self._check([
            ((subseq, seq, 0, 2, 2), [(0, 2)]),
            ((subseq, seq, 2, 0, 2), [(2, 0)]),
            ((subseq, seq, 2, 2, 2), [(2, 0), (0, 2)]),
            ((subseq, seq, 1, 1, 2), []),
        ])
class TestFindNearMatchesNoDeletionsNgramsAsNoSubstituions(
        TestFindNearMatchesSubstitionsNgrams, unittest.TestCase):
    """Run the inherited test cases against the no-deletions search.

    NOTE(review): the inherited tests presumably call self.search() -- see
    tests.test_substitutions_only to confirm.
    """

    def search(self, subsequence, sequence, max_subs):
        # insertions are disabled, leaving only substitutions
        return fnm_nodels_ngrams(
            subsequence,
            sequence,
            max_substitutions=max_subs,
            max_insertions=0,
        )
class TestFindNearMatchesNoDeletionsNgrams(unittest.TestCase):
    """Tests of the search with both substitutions and insertions allowed."""

    def test_one_sub_one_ins(self):
        haystack = 'abcdefghij'
        needle = 'bceXghi'

        # each of these sets of limits is too strict for anything to match
        too_strict = [
            (0, 0, 0),
            (0, 1, 2),
            (1, 0, 2),
            (1, 1, 1),
        ]
        for limits in too_strict:
            self.assertEqual(fnm_nodels_ngrams(needle, haystack, *limits), [])

        # one substitution plus one insertion is exactly enough
        self.assertEqual(
            fnm_nodels_ngrams(needle, haystack, 1, 1, 2),
            [Match(start=1, end=9, dist=2)],
        )

    def test_two_extra(self):
        haystack = '--abc--de--'
        needle = 'abcde'

        with_insertions = Match(start=2, end=9, dist=2)
        with_substitutions = Match(start=2, end=7, dist=2)

        cases = [
            ((0, 2, 2), [with_insertions]),
            ((2, 0, 2), [with_substitutions]),
            ((2, 2, 2), [with_substitutions, with_insertions]),
        ]
        for limits, expected in cases:
            self.assertEqual(
                fnm_nodels_ngrams(needle, haystack, *limits),
                expected,
            )