From ef17fce7c561f8f6f48cc1a195609862108f3757 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Sat, 29 Mar 2014 14:06:55 +0300 Subject: [PATCH] optimized ngrams versions of levenshtein and subs-only searches --- fuzzysearch/levenshtein_ngram.py | 46 +++++++-------- fuzzysearch/susbstitutions_only.py | 92 +++++++++++++++++++----------- 2 files changed, 81 insertions(+), 57 deletions(-) diff --git a/fuzzysearch/levenshtein_ngram.py b/fuzzysearch/levenshtein_ngram.py index c859d1e..ab574f7 100644 --- a/fuzzysearch/levenshtein_ngram.py +++ b/fuzzysearch/levenshtein_ngram.py @@ -36,41 +36,37 @@ def _expand(subsequence, sequence, max_l_dist): return (min_score, min_score_idx + 1) if min_score is not None and min_score <= max_l_dist else (None, None) -def _choose_search_range(subseq_len, seq_len, ngram, max_l_dist): - start_index = max(0, ngram.start - max_l_dist) - end_index = min(seq_len, seq_len - subseq_len + ngram.end + max_l_dist) - return start_index, end_index - - def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist): - ngram_len = len(subsequence) // (max_l_dist + 1) + subseq_len = len(subsequence) + seq_len = len(sequence) + + ngram_len = subseq_len // (max_l_dist + 1) if ngram_len == 0: raise ValueError('the subsequence length must be greater than max_l_dist') - ngrams = [ - Ngram(start, start + ngram_len) - for start in range(0, len(subsequence) - ngram_len + 1, ngram_len) - ] - matches = [] - for ngram in ngrams: - start_index, end_index = _choose_search_range(len(subsequence), len(sequence), ngram, max_l_dist) - for index in search_exact(subsequence[ngram.start:ngram.end], sequence, start_index, end_index): + for ngram_start in xrange(0, subseq_len - ngram_len + 1, ngram_len): + ngram_end = ngram_start + ngram_len + subseq_before_reversed = subsequence[:ngram_start][::-1] + subseq_after = subsequence[ngram_end:] + start_index = max(0, ngram_start - max_l_dist) + end_index = min(seq_len, seq_len - subseq_len + ngram_end + max_l_dist) + for 
index in search_exact(subsequence[ngram_start:ngram_end], sequence, start_index, end_index): # try to expand left and/or right according to n_ngram - dist_left, left_expand_size = _expand( - subsequence[:ngram.start][::-1], - sequence[index - ngram.start - max_l_dist:index][::-1], + dist_right, right_expand_size = _expand( + subseq_after, + sequence[index + ngram_len:index - ngram_start + subseq_len + max_l_dist], max_l_dist, ) - if dist_left is None: - continue - dist_right, right_expand_size = _expand( - subsequence[ngram.end:], - sequence[index + ngram_len:index - ngram.start + len(subsequence) + max_l_dist], - max_l_dist - dist_left, - ) if dist_right is None: continue + dist_left, left_expand_size = _expand( + subseq_before_reversed, + sequence[max(0, index - ngram_start - (max_l_dist - dist_right)):index][::-1], + max_l_dist - dist_right, + ) + if dist_left is None: + continue assert dist_left + dist_right <= max_l_dist matches.append(Match( diff --git a/fuzzysearch/susbstitutions_only.py b/fuzzysearch/susbstitutions_only.py index 0430e90..ce3f295 100644 --- a/fuzzysearch/susbstitutions_only.py +++ b/fuzzysearch/susbstitutions_only.py @@ -1,7 +1,7 @@ from collections import deque, defaultdict from itertools import islice, chain -from fuzzysearch.common import Match, Ngram, search_exact +from fuzzysearch.common import Match, search_exact def find_near_matches_substitutions(subsequence, sequence, max_substitutions): @@ -111,45 +111,73 @@ def find_near_matches_substitutions_ngrams(subsequence, sequence, * the number of character substitutions must be less than max_substitutions * no deletions or insertions are allowed """ - if not subsequence: - raise ValueError('Given subsequence is empty!') + match_starts = set() + matches = [] + for match in _find_near_matches_substitutions_ngrams(subsequence, sequence, + max_substitutions): + if match.start not in match_starts: + match_starts.add(match.start) + matches.append(match) + return sorted(matches, key=lambda match: 
match.start) - _SUBSEQ_LEN = len(subsequence) - _SEQ_LEN = len(sequence) - ngram_len = _SUBSEQ_LEN // (max_substitutions + 1) +def _find_near_matches_substitutions_ngrams(subsequence, sequence, + max_substitutions): + subseq_len = len(subsequence) + seq_len = len(sequence) + + ngram_len = subseq_len // (max_substitutions + 1) if ngram_len == 0: raise ValueError( "The subsequence's length must be greater than max_substitutions!" ) - ngrams = [ - Ngram(start, start + ngram_len) - for start in range(0, len(subsequence) - ngram_len + 1, ngram_len) - ] + for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len): + ngram_end = ngram_start + ngram_len + _subseq_before = subsequence[:ngram_start] + _subseq_after = subsequence[ngram_end:] + for index in search_exact( + subsequence[ngram_start:ngram_end], sequence, + ngram_start, seq_len - (subseq_len - ngram_end), + ): + n_substitutions = 0 + _seq_before = sequence[index - ngram_start:index] + if _subseq_before != _seq_before: + n_substitutions += sum( + (a != b) for (a, b) in zip(_seq_before, _subseq_before) + ) + if n_substitutions > max_substitutions: + continue - matches = [] - match_starts = set() - for ngram in ngrams: - _subseq_before = subsequence[:ngram.start] - _subseq_after = subsequence[ngram.end:] - start_index = ngram.start - end_index = _SEQ_LEN - (_SUBSEQ_LEN - ngram.end) - for index in search_exact(subsequence[ngram.start:ngram.end], sequence, start_index, end_index): - if (index - ngram.start) in match_starts: - continue + _seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len] + if _subseq_after != _seq_after: + if n_substitutions == max_substitutions: + continue + n_substitutions += sum( + (a != b) for (a, b) in zip(_seq_after, _subseq_after) + ) + if n_substitutions > max_substitutions: + continue - n_substitutions = sum((a != b) for (a, b) in chain( - zip(sequence[index - ngram.start:index], _subseq_before), - zip(sequence[index + ngram_len:index - ngram.start + 
_SUBSEQ_LEN], _subseq_after), - )) + yield Match( + start=index - ngram_start, + end=index - ngram_start + subseq_len, + dist=n_substitutions, + ) - if n_substitutions <= max_substitutions: - matches.append(Match( - start=index - ngram.start, - end=index - ngram.start + _SUBSEQ_LEN, - dist=n_substitutions, - )) - match_starts.add(index - ngram.start) - return sorted(matches, key=lambda match: match.start) +def has_near_match_substitutions_ngrams(subsequence, sequence, + max_substitutions): + """check whether sequence contains a near-match of subsequence + + This searches for near-matches, where the nearly-matching parts of the + sequence must meet the following limitations (relative to the subsequence): + + * the number of character substitutions must not exceed max_substitutions + * no deletions or insertions are allowed + * hence the total number of substitutions, insertions and deletions equals the number of substitutions + """ + for match in _find_near_matches_substitutions_ngrams(subsequence, sequence, + max_substitutions): + return True + return False