added fuzzy search allowing insertions and substitutions but no deletions
This commit is contained in:
parent
10815b726b
commit
26f927dbec
|
@ -0,0 +1,121 @@
|
|||
"""fuzzy searching allowing substitutions and insertions, but no deletions"""
|
||||
|
||||
# Names exported by ``from <this module> import *``.
__all__ = [
    'find_near_matches_no_deletions_ngrams',
]
|
||||
|
||||
import array
|
||||
|
||||
from fuzzysearch.common import Ngram, search_exact, Match
|
||||
|
||||
|
||||
def _expand(subsequence, sequence, max_substitutions, max_insertions,
|
||||
max_l_dist):
|
||||
if not subsequence:
|
||||
return (0, 0)
|
||||
|
||||
n_subs = array.array('L', [0] * (max_insertions + 1))
|
||||
for subseq_index, char in enumerate(subsequence):
|
||||
n_subs[0] += (char != sequence[subseq_index])
|
||||
for n_ins in range(1, max_insertions + 1):
|
||||
n_subs[n_ins] = min(
|
||||
n_subs[n_ins] + (char != sequence[subseq_index + n_ins]),
|
||||
n_subs[n_ins - 1]
|
||||
)
|
||||
|
||||
matches = [
|
||||
(_n_subs, _n_ins) for (_n_ins, _n_subs) in enumerate(n_subs)
|
||||
if _n_subs <= max_substitutions
|
||||
and _n_ins + _n_subs <= max_l_dist
|
||||
]
|
||||
return [
|
||||
match for (i, match) in enumerate(matches)
|
||||
if i == 0 or match[0] < matches[i-1][0]
|
||||
]
|
||||
|
||||
|
||||
def find_near_matches_no_deletions_ngrams(subsequence, sequence,
                                          max_substitutions, max_insertions,
                                          max_l_dist=None):
    """Search for near-matches of subsequence in sequence.

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the
    subsequence):

    * at most max_substitutions items are substituted
    * at most max_insertions extra items are inserted
    * no items are deleted
    * the number of substitutions plus insertions is at most max_l_dist;
      if max_l_dist is None (or larger than the sum of the two other
      limits) it is set to max_substitutions + max_insertions

    The subsequence is split into non-overlapping n-grams; every exact
    occurrence of an n-gram is then expanded to both sides with _expand().

    Returns a list of Match objects sorted by their start index.

    Raises ValueError if the subsequence is empty, or if it is too short
    to be split into max_substitutions + max_insertions + 1 n-grams.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    _SUBSEQ_LEN = len(subsequence)
    _SEQ_LEN = len(sequence)

    # Normalize the limits so that no single limit exceeds the total one.
    if max_l_dist is None or max_l_dist > max_substitutions + max_insertions:
        max_l_dist = max_substitutions + max_insertions
    max_substitutions = min(max_substitutions, max_l_dist)
    max_insertions = min(max_insertions, max_l_dist)

    ngram_len = _SUBSEQ_LEN // (max_substitutions + max_insertions + 1)
    if ngram_len == 0:
        raise ValueError(
            "The subsequence's length must be greater than max_subs + max_ins!"
        )

    ngrams = [
        Ngram(start, start + ngram_len)
        for start in range(0, len(subsequence) - ngram_len + 1, ngram_len)
    ]

    matches = []
    matched_indexes = set()

    for ngram in ngrams:
        _subseq_before = subsequence[:ngram.start]
        _subseq_before_reversed = _subseq_before[::-1]
        _subseq_after = subsequence[ngram.end:]
        # Without deletions, all ngram.start items preceding the n-gram in
        # the subsequence must also precede it in the sequence, so an
        # occurrence at a lower index can never be part of a match.
        start_index = ngram.start
        end_index = min(
            _SEQ_LEN,
            _SEQ_LEN - (_SUBSEQ_LEN - ngram.end) + max_insertions,
        )

        for index in search_exact(subsequence[ngram.start:ngram.end],
                                  sequence, start_index, end_index):
            if index - ngram.start in matched_indexes:
                continue

            # --- expand forwards, after the n-gram ---
            seq_after = sequence[
                index + ngram_len:
                index + _SUBSEQ_LEN - ngram.start + max_insertions
            ]
            # The slice is silently truncated at the end of the sequence;
            # room_after is the number of insertions which actually fit.
            room_after = len(seq_after) - len(_subseq_after)
            if room_after < 0:
                continue
            if seq_after.startswith(_subseq_after):
                matches_after = [(0, 0)]
            else:
                matches_after = _expand(
                    _subseq_after, seq_after,
                    max_substitutions, min(max_insertions, room_after),
                    max_l_dist,
                )
                if not matches_after:
                    continue

            # --- expand backwards, before the n-gram ---
            # Optimistic budgets left over by the cheapest forward
            # expansions; the exact limits are enforced when combining.
            _max_substitutions = \
                max_substitutions - min(m[0] for m in matches_after)
            _max_insertions = \
                max_insertions - min(m[1] for m in matches_after)
            _max_l_dist = \
                max_l_dist - min(m[0] + m[1] for m in matches_after)
            # Clamp at zero: a negative slice start would wrap around and
            # count from the end of the sequence.
            before_start = max(0, index - ngram.start - _max_insertions)
            seq_before = sequence[before_start:index]
            room_before = len(seq_before) - len(_subseq_before)
            if room_before < 0:
                continue
            if seq_before.endswith(_subseq_before):
                matches_before = [(0, 0)]
            else:
                matches_before = _expand(
                    _subseq_before_reversed, seq_before[::-1],
                    _max_substitutions, min(_max_insertions, room_before),
                    _max_l_dist,
                )

            # --- combine both sides, enforcing the overall limits ---
            for (subs_before, ins_before) in matches_before:
                for (subs_after, ins_after) in matches_after:
                    n_subs = subs_before + subs_after
                    n_ins = ins_before + ins_after
                    if (
                        n_subs <= max_substitutions and
                        n_ins <= max_insertions and
                        n_subs + n_ins <= max_l_dist
                    ):
                        match_start = index - ngram.start - ins_before
                        matches.append(Match(
                            start=match_start,
                            end=index - ngram.start + _SUBSEQ_LEN + ins_after,
                            dist=n_subs + n_ins,
                        ))
                        # Remember the alignments covered by this match so
                        # hits of the following n-grams do not report it
                        # again.
                        matched_indexes |= set(range(
                            match_start,
                            match_start + max_insertions + 1,
                        ))

    return sorted(matches, key=lambda match: match.start)
|
|
@ -0,0 +1,110 @@
|
|||
from tests.compat import unittest, mock
|
||||
|
||||
from fuzzysearch.common import Match
|
||||
from fuzzysearch.no_deletions import _expand, \
|
||||
find_near_matches_no_deletions_ngrams as fnm_nodels_ngrams
|
||||
from tests.test_substitutions_only import TestFindNearMatchesSubstitionsNgrams
|
||||
|
||||
|
||||
class TestExpand(unittest.TestCase):
    """Tests for _expand(), which aligns a subsequence with a sequence start.

    Each case is a pair of ``(max_substitutions, max_insertions, max_l_dist)``
    limits and the expected list of ``(n_substitutions, n_insertions)``.
    """

    def _check(self, subsequence, sequence, cases):
        """Assert the result of _expand() for every (limits, expected) pair."""
        for (limits, expected) in cases:
            self.assertEqual(
                _expand(subsequence, sequence, *limits),
                expected,
            )

    def test_identical(self):
        self._check('abc', 'abc', [
            ((0, 0, 0), [(0, 0)]),
        ])

    def test_startswith(self):
        # an exact prefix match is found regardless of the allowed limits
        all_limits = [
            (0, 0, 0),
            (1, 0, 1),
            (2, 0, 2),
            (0, 1, 1),
            (0, 2, 2),
            (1, 1, 1),
            (1, 1, 2),
            (2, 2, 2),
        ]
        self._check('abc', 'abcdef', [
            (limits, [(0, 0)]) for limits in all_limits
        ])

    def test_one_missing(self):
        # first item missing
        self._check('abcd', 'bcd---', [
            ((0, 1, 1), []),
            ((1, 0, 1), []),
            ((1, 1, 2), []),
        ])

        # second item missing
        self._check('abcd', 'acd---', [
            ((0, 1, 1), []),
            ((1, 0, 1), []),
            ((1, 1, 2), []),
        ])

        # last item missing
        self._check('abcd', 'abc---', [
            ((0, 1, 1), []),
            ((1, 0, 1), [(1, 0)]),
            ((1, 1, 2), [(1, 0)]),
        ])

    def test_no_result(self):
        self._check('abc', 'def', [
            ((0, 0, 0), []),
        ])
        self._check('abc', 'defg', [
            ((1, 1, 1), []),
            ((1, 1, 2), []),
        ])

    def test_one_extra(self):
        # extra first item
        self._check('bcd', 'abcd', [
            ((0, 0, 0), []),
            ((0, 1, 1), [(0, 1)]),
        ])

        # extra third item
        self._check('abd', 'abcd', [
            ((0, 0, 0), []),
            ((0, 1, 1), [(0, 1)]),
        ])

        # extra last item
        self._check('abc', 'abcd', [
            ((0, 0, 0), [(0, 0)]),
            ((0, 1, 1), [(0, 0)]),
        ])

    def test_insert_and_substitute(self):
        self._check('abcdefg', 'abc-def----', [
            ((1, 1, 2), [(1, 1)]),
            ((1, 1, 1), []),
            ((1, 0, 1), []),
            ((0, 1, 1), []),
        ])

    def test_double_first_item(self):
        self._check('abc', 'aabc', [
            ((1, 1, 1), [(0, 1)]),
        ])

    def test_two_insertions(self):
        self._check('abc', 'a--bc', [
            ((0, 2, 2), [(0, 2)]),
            ((2, 0, 2), [(2, 0)]),
            ((2, 2, 2), [(2, 0), (0, 2)]),
            ((1, 1, 2), []),
        ])
|
||||
|
||||
|
||||
class TestFindNearMatchesNoDeletionsNgramsAsNoSubstituions(
        TestFindNearMatchesSubstitionsNgrams, unittest.TestCase):
    """Run the inherited substitutions-only tests with insertions disabled.

    NOTE(review): presumably the inherited test methods call search();
    confirm against tests.test_substitutions_only.
    """

    def search(self, subsequence, sequence, max_subs):
        """Search with `max_subs` substitutions and zero insertions."""
        no_insertions = 0
        return fnm_nodels_ngrams(
            subsequence, sequence, max_subs, no_insertions,
        )
|
||||
|
||||
|
||||
class TestFindNearMatchesNoDeletionsNgrams(unittest.TestCase):
    """Tests for searches combining substitutions and insertions."""

    def test_one_sub_one_ins(self):
        text = 'abcdefghij'
        needle = 'bceXghi'

        # limits given as (max_substitutions, max_insertions, max_l_dist);
        # each of these is too strict to allow the match below
        insufficient_limits = [
            (0, 0, 0),
            (0, 1, 2),
            (1, 0, 2),
            (1, 1, 1),
        ]
        for limits in insufficient_limits:
            self.assertEqual(fnm_nodels_ngrams(needle, text, *limits), [])

        found = fnm_nodels_ngrams(needle, text, 1, 1, 2)
        self.assertEqual(found, [Match(start=1, end=9, dist=2)])

    def test_two_extra(self):
        text = '--abc--de--'
        needle = 'abcde'

        with_insertions = Match(start=2, end=9, dist=2)
        with_substitutions = Match(start=2, end=7, dist=2)

        # only insertions allowed
        self.assertEqual(
            fnm_nodels_ngrams(needle, text, 0, 2, 2),
            [with_insertions],
        )

        # only substitutions allowed
        self.assertEqual(
            fnm_nodels_ngrams(needle, text, 2, 0, 2),
            [with_substitutions],
        )

        # both allowed: both ways of matching are reported
        self.assertEqual(
            fnm_nodels_ngrams(needle, text, 2, 2, 2),
            [with_substitutions, with_insertions],
        )
|
Loading…
Reference in New Issue