Fix Levenshtein n-grams search sometimes returning matches that are too short

2020-05-07 16:45:52 +03:00 · 2020-05-07 16:45:52 +03:00 · c873b767e7
parent e352398d29
commit c873b767e7
3 changed files with 11 additions and 7 deletions
--- a/src/fuzzysearch/_levenshtein_ngrams.c
+++ b/src/fuzzysearch/_levenshtein_ngrams.c
@@ -803,7 +803,7 @@ static const char *__pyx_filename;


 static const char *__pyx_f[] = {
-  "src\\fuzzysearch\\_levenshtein_ngrams.pyx",
+  "src/fuzzysearch/_levenshtein_ngrams.pyx",
 };

 /*--- Type declarations ---*/
@@ -1180,7 +1180,7 @@ static const char __pyx_k_needle_idx_range_start[] = "needle_idx_range_start";
 static const char __pyx_k_new_needle_idx_range_end[] = "new_needle_idx_range_end";
 static const char __pyx_k_new_needle_idx_range_start[] = "new_needle_idx_range_start";
 static const char __pyx_k_fuzzysearch__levenshtein_ngrams[] = "fuzzysearch._levenshtein_ngrams";
-static const char __pyx_k_src_fuzzysearch__levenshtein_ngr[] = "src\\fuzzysearch\\_levenshtein_ngrams.pyx";
+static const char __pyx_k_src_fuzzysearch__levenshtein_ngr[] = "src/fuzzysearch/_levenshtein_ngrams.pyx";
 static PyObject *__pyx_n_s_MemoryError;
 static PyObject *__pyx_n_s_a;
 static PyObject *__pyx_n_s_all;
@@ -1725,16 +1725,16 @@ static PyObject *__pyx_pf_11fuzzysearch_19_levenshtein_ngrams_c_expand_short(CYT
      /* "fuzzysearch/_levenshtein_ngrams.pyx":64
 * 
 *             # bail early when it is impossible to find a better expansion
- *             if min_intermediate_score >= min_score:             # <<<<<<<<<<<<<<
+ *             if min_intermediate_score > min_score:             # <<<<<<<<<<<<<<
 *                 break
 * 
 */
-      __pyx_t_2 = ((__pyx_v_min_intermediate_score >= __pyx_v_min_score) != 0);
+      __pyx_t_2 = ((__pyx_v_min_intermediate_score > __pyx_v_min_score) != 0);
      if (__pyx_t_2) {

        /* "fuzzysearch/_levenshtein_ngrams.pyx":65
 *             # bail early when it is impossible to find a better expansion
- *             if min_intermediate_score >= min_score:
+ *             if min_intermediate_score > min_score:
 *                 break             # <<<<<<<<<<<<<<
 * 
 *             # keep the minimum score found for matches of the entire sub-sequence
@@ -1744,7 +1744,7 @@ static PyObject *__pyx_pf_11fuzzysearch_19_levenshtein_ngrams_c_expand_short(CYT
        /* "fuzzysearch/_levenshtein_ngrams.pyx":64
 * 
 *             # bail early when it is impossible to find a better expansion
- *             if min_intermediate_score >= min_score:             # <<<<<<<<<<<<<<
+ *             if min_intermediate_score > min_score:             # <<<<<<<<<<<<<<
 *                 break
 * 
 */
--- a/src/fuzzysearch/_levenshtein_ngrams.pyx
+++ b/src/fuzzysearch/_levenshtein_ngrams.pyx
@@ -61,7 +61,7 @@ def c_expand_short(subsequence, sequence, max_l_dist):
                    min_intermediate_score = c

            # bail early when it is impossible to find a better expansion
-            if min_intermediate_score >= min_score:
+            if min_intermediate_score > min_score:
                break

            # keep the minimum score found for matches of the entire sub-sequence
--- a/tests/test_levenshtein.py
+++ b/tests/test_levenshtein.py
@@ -243,6 +243,10 @@ class TestFindNearMatchesLevenshteinBase(object):
        'one missing at end': ('defg', 'abcdef', [
            (1, [(3, 6, 1)]),
        ]),
+        'highly repetetive': ('a' * 9, 'a' * 7 + 'xx', [
+            (1, []),
+            (2, [(0, 9, 2)]),
+        ]),
        'DNA search': (
            'TGCACTGTAGGGATAACAAT',
            longstr('''