diff --git a/fuzzysearch/_substitutions_only.c b/fuzzysearch/_substitutions_only.c
index e108aa3..1f044b3 100644
--- a/fuzzysearch/_substitutions_only.c
+++ b/fuzzysearch/_substitutions_only.c
@@ -1,4 +1,5 @@
 #include <Python.h>
+#include "fuzzysearch/kmp.h"
 
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
@@ -75,11 +76,132 @@ substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
     Py_RETURN_FALSE;
 }
 
+/*
+ * Report whether `sequence` contains `subsequence` with at most
+ * `max_substitutions` substituted characters (no insertions or deletions).
+ *
+ * The subsequence is cut into consecutive n-grams of length
+ * subseq_len / (max_substitutions + 1). There are at least
+ * max_substitutions + 1 of them, so any match within the allowed number of
+ * substitutions must contain at least one n-gram verbatim. Each n-gram is
+ * looked up in the sequence with the KMP helpers from kmp.h, and every hit is
+ * verified by comparing the characters before and after it.
+ *
+ * Python arguments: (subsequence, sequence, max_substitutions).
+ * Returns True or False. Raises ValueError if max_substitutions is negative
+ * or is not smaller than the subsequence's length.
+ */
+static PyObject *
+substitutions_only_has_near_matches_ngrams_byteslike(PyObject *self, PyObject *args)
+{
+    /* input params */
+    const char *subsequence;
+    const char *sequence;
+    int subseq_len, seq_len, max_substitutions;
+
+    int ngram_len, ngram_start, subseq_len_after_ngram;
+    const char *match_ptr, *seq_ptr, *subseq_ptr, *subseq_end;
+    int *kmpNext;
+    struct KMPstate kmp_state;
+    int n_differences;
+
+    if (!PyArg_ParseTuple(
+        args, "s#s#i",
+        &subsequence, &subseq_len,
+        &sequence, &seq_len,
+        &max_substitutions
+    )) {
+        return NULL;
+    }
+
+    if (seq_len < subseq_len) {
+        Py_RETURN_FALSE;
+    }
+
+    /* A negative limit would make the n-gram length computation below divide
+     * by zero (-1) or come out negative (anything smaller). */
+    if (max_substitutions < 0) {
+        PyErr_SetString(PyExc_ValueError,
+            "max_substitutions must not be negative!"
+        );
+        return NULL;
+    }
+
+    /* Equivalent to the n-gram length being zero, but checked up front so that
+     * max_substitutions + 1 is never evaluated when it could overflow. */
+    if (subseq_len <= max_substitutions) {
+        PyErr_SetString(PyExc_ValueError,
+            "The subsequence's length must be greater than max_substitutions!"
+        );
+        return NULL;
+    }
+    ngram_len = subseq_len / (max_substitutions + 1);
+
+    subseq_end = subsequence + subseq_len;
+
+    kmpNext = (int *) malloc(ngram_len * sizeof(int));
+    if (kmpNext == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    for (ngram_start = 0; ngram_start <= subseq_len - ngram_len; ngram_start += ngram_len) {
+        subseq_len_after_ngram = subseq_len - (ngram_start + ngram_len);
+
+        /* The window handed to KMP_init skips the first ngram_start
+         * characters of the sequence and stops subseq_len_after_ngram
+         * characters before its end, so a full match fits around any hit. */
+        preKMP(subsequence + ngram_start, ngram_len, kmpNext);
+        kmp_state = KMP_init(subsequence + ngram_start,
+                             ngram_len,
+                             sequence + ngram_start,
+                             seq_len - ngram_start - subseq_len_after_ngram,
+                             kmpNext);
+
+        match_ptr = KMP_find_next(&kmp_state);
+        while (match_ptr != NULL) {
+            /* Remaining budget plus one: reaching zero means the limit was
+             * exceeded. */
+            n_differences = max_substitutions + 1;
+
+            /* Compare backwards from the n-gram to the subsequence's start. */
+            subseq_ptr = subsequence + ngram_start;
+            seq_ptr = match_ptr;
+            while (subseq_ptr != subsequence && n_differences) {
+                n_differences -= *(--subseq_ptr) != *(--seq_ptr);
+            }
+
+            if (n_differences) {
+                /* Then compare forwards from the n-gram's end. */
+                subseq_ptr = subseq_end - subseq_len_after_ngram;
+                seq_ptr = match_ptr + ngram_len;
+                while (subseq_ptr != subseq_end && n_differences) {
+                    n_differences -= (*subseq_ptr++) != (*seq_ptr++);
+                }
+
+                if (n_differences) {
+                    free(kmpNext);
+                    Py_RETURN_TRUE;
+                }
+            }
+
+            match_ptr = KMP_find_next(&kmp_state);
+        }
+    }
+
+    free(kmpNext);
+    Py_RETURN_FALSE;
+}
+
+
 static PyMethodDef substitutions_only_methods[] = {
     {"substitutions_only_has_near_matches_byteslike",
      substitutions_only_has_near_matches_byteslike,
      METH_VARARGS,
      "DOCSTRING."},
+    {"substitutions_only_has_near_matches_ngrams_byteslike",
+     substitutions_only_has_near_matches_ngrams_byteslike,
+     METH_VARARGS,
+     "DOCSTRING."},
     {NULL, NULL, 0, NULL} /* Sentinel */
 };
 
diff --git a/setup.py b/setup.py
index bccba4c..d4ec900 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,8 @@ history = open('HISTORY.rst').read().replace('.. :changelog:', '')
 
 _substitutions_only_module = Extension(
     'fuzzysearch._substitutions_only',
-    sources=['fuzzysearch/_substitutions_only.c'],
+    sources=['fuzzysearch/_substitutions_only.c', 'fuzzysearch/kmp.c'],
+    include_dirs=['.'],
 )
 _common_module = Extension(
     'fuzzysearch._common',
diff --git a/tests/test_substitutions_only.py b/tests/test_substitutions_only.py
index 0fda733..81f4aca 100644
--- a/tests/test_substitutions_only.py
+++ b/tests/test_substitutions_only.py
@@ -298,7 +298,9 @@ class TestHasNearMatchSubstitionsOnly(TestHasNearMatchSubstitionsOnlyBase,
 
 try:
     from fuzzysearch._substitutions_only import \
-        substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
+        substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike, \
+        substitutions_only_has_near_matches_ngrams_byteslike as \
+            hnm_subs_ngrams_byteslike
 except ImportError:
     pass
 else:
@@ -309,3 +311,9 @@ else:
         def search(self, subsequence, sequence, max_subs):
             return hnm_subs_byteslike(subsequence, sequence, max_subs)
 
+    class TestHasNearMatchesSubstitionsNgramsByteslike(
+        TestHasNearMatchSubstitionsOnlyBase,
+        unittest.TestCase
+    ):
+        def search(self, subsequence, sequence, max_subs):
+            return hnm_subs_ngrams_byteslike(subsequence, sequence, max_subs)