From c873b767e7774f3b1f187372f06f65597abb06e1 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Thu, 7 May 2020 16:45:52 +0300 Subject: [PATCH] fix Levenshtein ngrams search sometimes returning too short matches --- src/fuzzysearch/_levenshtein_ngrams.c | 12 ++++++------ src/fuzzysearch/_levenshtein_ngrams.pyx | 2 +- tests/test_levenshtein.py | 4 ++++ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/fuzzysearch/_levenshtein_ngrams.c b/src/fuzzysearch/_levenshtein_ngrams.c index efee709..763fd1b 100644 --- a/src/fuzzysearch/_levenshtein_ngrams.c +++ b/src/fuzzysearch/_levenshtein_ngrams.c @@ -803,7 +803,7 @@ static const char *__pyx_filename; static const char *__pyx_f[] = { - "src\\fuzzysearch\\_levenshtein_ngrams.pyx", + "src/fuzzysearch/_levenshtein_ngrams.pyx", }; /*--- Type declarations ---*/ @@ -1180,7 +1180,7 @@ static const char __pyx_k_needle_idx_range_start[] = "needle_idx_range_start"; static const char __pyx_k_new_needle_idx_range_end[] = "new_needle_idx_range_end"; static const char __pyx_k_new_needle_idx_range_start[] = "new_needle_idx_range_start"; static const char __pyx_k_fuzzysearch__levenshtein_ngrams[] = "fuzzysearch._levenshtein_ngrams"; -static const char __pyx_k_src_fuzzysearch__levenshtein_ngr[] = "src\\fuzzysearch\\_levenshtein_ngrams.pyx"; +static const char __pyx_k_src_fuzzysearch__levenshtein_ngr[] = "src/fuzzysearch/_levenshtein_ngrams.pyx"; static PyObject *__pyx_n_s_MemoryError; static PyObject *__pyx_n_s_a; static PyObject *__pyx_n_s_all; @@ -1725,16 +1725,16 @@ static PyObject *__pyx_pf_11fuzzysearch_19_levenshtein_ngrams_c_expand_short(CYT /* "fuzzysearch/_levenshtein_ngrams.pyx":64 * * # bail early when it is impossible to find a better expansion - * if min_intermediate_score >= min_score: # <<<<<<<<<<<<<< + * if min_intermediate_score > min_score: # <<<<<<<<<<<<<< * break * */ - __pyx_t_2 = ((__pyx_v_min_intermediate_score >= __pyx_v_min_score) != 0); + __pyx_t_2 = ((__pyx_v_min_intermediate_score > __pyx_v_min_score) != 0); if (__pyx_t_2) { /* "fuzzysearch/_levenshtein_ngrams.pyx":65 * # bail early when it is impossible to find a better expansion - * if min_intermediate_score >= min_score: + * if min_intermediate_score > min_score: * break # <<<<<<<<<<<<<< * * # keep the minimum score found for matches of the entire sub-sequence @@ -1744,7 +1744,7 @@ static PyObject *__pyx_pf_11fuzzysearch_19_levenshtein_ngrams_c_expand_short(CYT /* "fuzzysearch/_levenshtein_ngrams.pyx":64 * * # bail early when it is impossible to find a better expansion - * if min_intermediate_score >= min_score: # <<<<<<<<<<<<<< + * if min_intermediate_score > min_score: # <<<<<<<<<<<<<< * break * */ diff --git a/src/fuzzysearch/_levenshtein_ngrams.pyx b/src/fuzzysearch/_levenshtein_ngrams.pyx index 6ca1dc4..80a7a59 100644 --- a/src/fuzzysearch/_levenshtein_ngrams.pyx +++ b/src/fuzzysearch/_levenshtein_ngrams.pyx @@ -61,7 +61,7 @@ def c_expand_short(subsequence, sequence, max_l_dist): min_intermediate_score = c # bail early when it is impossible to find a better expansion - if min_intermediate_score >= min_score: + if min_intermediate_score > min_score: break # keep the minimum score found for matches of the entire sub-sequence diff --git a/tests/test_levenshtein.py b/tests/test_levenshtein.py index a42df92..2483f46 100644 --- a/tests/test_levenshtein.py +++ b/tests/test_levenshtein.py @@ -243,6 +243,10 @@ class TestFindNearMatchesLevenshteinBase(object): 'one missing at end': ('defg', 'abcdef', [ (1, [(3, 6, 1)]), ]), + 'highly repetetive': ('a' * 9, 'a' * 7 + 'xx', [ + (1, []), + (2, [(0, 9, 2)]), + ]), 'DNA search': ( 'TGCACTGTAGGGATAACAAT', longstr('''