Optimized the n-gram versions of the Levenshtein and substitutions-only searches
This commit is contained in:
parent
8ba0cc8dc3
commit
ef17fce7c5
|
@ -36,41 +36,37 @@ def _expand(subsequence, sequence, max_l_dist):
|
||||||
return (min_score, min_score_idx + 1) if min_score is not None and min_score <= max_l_dist else (None, None)
|
return (min_score, min_score_idx + 1) if min_score is not None and min_score <= max_l_dist else (None, None)
|
||||||
|
|
||||||
|
|
||||||
def _choose_search_range(subseq_len, seq_len, ngram, max_l_dist):
|
|
||||||
start_index = max(0, ngram.start - max_l_dist)
|
|
||||||
end_index = min(seq_len, seq_len - subseq_len + ngram.end + max_l_dist)
|
|
||||||
return start_index, end_index
|
|
||||||
|
|
||||||
|
|
||||||
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
|
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
|
||||||
ngram_len = len(subsequence) // (max_l_dist + 1)
|
subseq_len = len(subsequence)
|
||||||
|
seq_len = len(sequence)
|
||||||
|
|
||||||
|
ngram_len = subseq_len // (max_l_dist + 1)
|
||||||
if ngram_len == 0:
|
if ngram_len == 0:
|
||||||
raise ValueError('the subsequence length must be greater than max_l_dist')
|
raise ValueError('the subsequence length must be greater than max_l_dist')
|
||||||
|
|
||||||
ngrams = [
|
|
||||||
Ngram(start, start + ngram_len)
|
|
||||||
for start in range(0, len(subsequence) - ngram_len + 1, ngram_len)
|
|
||||||
]
|
|
||||||
|
|
||||||
matches = []
|
matches = []
|
||||||
for ngram in ngrams:
|
for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len):
|
||||||
start_index, end_index = _choose_search_range(len(subsequence), len(sequence), ngram, max_l_dist)
|
ngram_end = ngram_start + ngram_len
|
||||||
for index in search_exact(subsequence[ngram.start:ngram.end], sequence, start_index, end_index):
|
subseq_before_reversed = subsequence[:ngram_start][::-1]
|
||||||
|
subseq_after = subsequence[ngram_end:]
|
||||||
|
start_index = max(0, ngram_start - max_l_dist)
|
||||||
|
end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist)
|
||||||
|
for index in search_exact(subsequence[ngram_start:ngram_end], sequence, start_index, end_index):
|
||||||
# try to expand left and/or right according to n_ngram
|
# try to expand left and/or right according to n_ngram
|
||||||
dist_left, left_expand_size = _expand(
|
dist_right, right_expand_size = _expand(
|
||||||
subsequence[:ngram.start][::-1],
|
subseq_after,
|
||||||
sequence[index - ngram.start - max_l_dist:index][::-1],
|
sequence[index + ngram_len:index - ngram_start + subseq_len + max_l_dist],
|
||||||
max_l_dist,
|
max_l_dist,
|
||||||
)
|
)
|
||||||
if dist_left is None:
|
|
||||||
continue
|
|
||||||
dist_right, right_expand_size = _expand(
|
|
||||||
subsequence[ngram.end:],
|
|
||||||
sequence[index + ngram_len:index - ngram.start + len(subsequence) + max_l_dist],
|
|
||||||
max_l_dist - dist_left,
|
|
||||||
)
|
|
||||||
if dist_right is None:
|
if dist_right is None:
|
||||||
continue
|
continue
|
||||||
|
dist_left, left_expand_size = _expand(
|
||||||
|
subseq_before_reversed,
|
||||||
|
sequence[max(0, index - ngram_start - (max_l_dist - dist_right)):index][::-1],
|
||||||
|
max_l_dist - dist_right,
|
||||||
|
)
|
||||||
|
if dist_left is None:
|
||||||
|
continue
|
||||||
assert dist_left + dist_right <= max_l_dist
|
assert dist_left + dist_right <= max_l_dist
|
||||||
|
|
||||||
matches.append(Match(
|
matches.append(Match(
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from collections import deque, defaultdict
|
from collections import deque, defaultdict
|
||||||
from itertools import islice, chain
|
from itertools import islice, chain
|
||||||
|
|
||||||
from fuzzysearch.common import Match, Ngram, search_exact
|
from fuzzysearch.common import Match, search_exact
|
||||||
|
|
||||||
|
|
||||||
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
|
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
|
||||||
|
@ -111,45 +111,73 @@ def find_near_matches_substitutions_ngrams(subsequence, sequence,
|
||||||
* the number of character substitutions must be less than max_substitutions
|
* the number of character substitutions must be less than max_substitutions
|
||||||
* no deletions or insertions are allowed
|
* no deletions or insertions are allowed
|
||||||
"""
|
"""
|
||||||
if not subsequence:
|
match_starts = set()
|
||||||
raise ValueError('Given subsequence is empty!')
|
matches = []
|
||||||
|
for match in _find_near_matches_substitutions_ngrams(subsequence, sequence,
|
||||||
|
max_substitutions):
|
||||||
|
if match.start not in match_starts:
|
||||||
|
match_starts.add(match.start)
|
||||||
|
matches.append(match)
|
||||||
|
return sorted(matches, key=lambda match: match.start)
|
||||||
|
|
||||||
_SUBSEQ_LEN = len(subsequence)
|
|
||||||
_SEQ_LEN = len(sequence)
|
|
||||||
|
|
||||||
ngram_len = _SUBSEQ_LEN // (max_substitutions + 1)
|
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
|
||||||
|
max_substitutions):
|
||||||
|
subseq_len = len(subsequence)
|
||||||
|
seq_len = len(sequence)
|
||||||
|
|
||||||
|
ngram_len = subseq_len // (max_substitutions + 1)
|
||||||
if ngram_len == 0:
|
if ngram_len == 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The subsequence's length must be greater than max_substitutions!"
|
"The subsequence's length must be greater than max_substitutions!"
|
||||||
)
|
)
|
||||||
|
|
||||||
ngrams = [
|
for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len):
|
||||||
Ngram(start, start + ngram_len)
|
ngram_end = ngram_start + ngram_len
|
||||||
for start in range(0, len(subsequence) - ngram_len + 1, ngram_len)
|
_subseq_before = subsequence[:ngram_start]
|
||||||
]
|
_subseq_after = subsequence[ngram_end:]
|
||||||
|
for index in search_exact(
|
||||||
matches = []
|
subsequence[ngram_start:ngram_end], sequence,
|
||||||
match_starts = set()
|
ngram_start, seq_len - (subseq_len - ngram_end),
|
||||||
for ngram in ngrams:
|
):
|
||||||
_subseq_before = subsequence[:ngram.start]
|
n_substitutions = 0
|
||||||
_subseq_after = subsequence[ngram.end:]
|
_seq_before = sequence[index - ngram_start:index]
|
||||||
start_index = ngram.start
|
if _subseq_before != _seq_before:
|
||||||
end_index = _SEQ_LEN - (_SUBSEQ_LEN - ngram.end)
|
n_substitutions += sum(
|
||||||
for index in search_exact(subsequence[ngram.start:ngram.end], sequence, start_index, end_index):
|
(a != b) for (a, b) in zip(_seq_before, _subseq_before)
|
||||||
if (index - ngram.start) in match_starts:
|
)
|
||||||
|
if n_substitutions > max_substitutions:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
n_substitutions = sum((a != b) for (a, b) in chain(
|
_seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len]
|
||||||
zip(sequence[index - ngram.start:index], _subseq_before),
|
if _subseq_after != _seq_after:
|
||||||
zip(sequence[index + ngram_len:index - ngram.start + _SUBSEQ_LEN], _subseq_after),
|
if n_substitutions == max_substitutions:
|
||||||
))
|
continue
|
||||||
|
n_substitutions += sum(
|
||||||
|
(a != b) for (a, b) in zip(_seq_after, _subseq_after)
|
||||||
|
)
|
||||||
|
if n_substitutions > max_substitutions:
|
||||||
|
continue
|
||||||
|
|
||||||
if n_substitutions <= max_substitutions:
|
yield Match(
|
||||||
matches.append(Match(
|
start=index - ngram_start,
|
||||||
start=index - ngram.start,
|
end=index - ngram_start + subseq_len,
|
||||||
end=index - ngram.start + _SUBSEQ_LEN,
|
|
||||||
dist=n_substitutions,
|
dist=n_substitutions,
|
||||||
))
|
)
|
||||||
match_starts.add(index - ngram.start)
|
|
||||||
|
|
||||||
return sorted(matches, key=lambda match: match.start)
|
|
||||||
|
def has_near_match_substitutions_ngrams(subsequence, sequence,
                                        max_substitutions):
    """Check whether sequence contains any near-match of subsequence.

    A near-match is a part of the sequence that meets the following
    limitations (relative to the subsequence):

    * the number of character substitutions must be at most
      max_substitutions
    * no deletions or insertions are allowed

    Returns True as soon as the first near-match is found, and False if
    there is none.

    Raises ValueError (from the underlying n-gram search) when the
    subsequence is too short to be split into max_substitutions + 1
    non-empty n-grams.
    """
    # The underlying search is a generator, so any() stops at the first
    # match instead of collecting every match in the sequence.
    return any(
        True for _ in _find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)
    )
|
||||||
|
|
Loading…
Reference in New Issue