added C implementation of subs-only hnm_byteslike
This commit is contained in:
parent
087beb37d6
commit
e1a7b0cc30
|
@ -1,4 +1,5 @@
|
|||
#include <Python.h>
|
||||
#include "fuzzysearch/kmp.h"
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define IS_PY3K
|
||||
|
@ -75,11 +76,99 @@ substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
|
|||
Py_RETURN_FALSE;
|
||||
}
|
||||
|
||||
/*
 * substitutions_only_has_near_matches_ngrams_byteslike(subsequence, sequence,
 *                                                      max_substitutions)
 *
 * Report whether `sequence` contains a window of len(subsequence) bytes that
 * differs from `subsequence` in at most `max_substitutions` positions.
 *
 * The subsequence is cut into consecutive n-grams of length
 * subseq_len / (max_substitutions + 1).  Each n-gram is searched for exactly
 * (KMP), and every exact hit is then extended to the left and to the right
 * while counting mismatching bytes.
 *
 * Returns True or False.  Returns NULL with an exception set when:
 *   - argument parsing fails (exception set by PyArg_ParseTuple),
 *   - max_substitutions is negative (ValueError),
 *   - the subsequence is not longer than max_substitutions (ValueError),
 *   - the KMP table cannot be allocated (MemoryError).
 *
 * As before, a sequence shorter than the subsequence yields False before any
 * of the max_substitutions checks are made.
 *
 * NOTE(review): the lengths are parsed into plain ints; "s#" only fills ints
 * when PY_SSIZE_T_CLEAN is not defined before including Python.h -- confirm
 * this matches the Python versions the module is built for.
 */
static PyObject *
substitutions_only_has_near_matches_ngrams_byteslike(PyObject *self, PyObject *args)
{
    /* input params */
    const char *subsequence;
    const char *sequence;
    int subseq_len, seq_len, max_substitutions;

    int ngram_len, ngram_start, subseq_len_after_ngram;
    const char *match_ptr, *seq_ptr, *subseq_ptr, *subseq_end;
    int *kmpNext;
    struct KMPstate kmp_state;
    int n_differences;

    if (!PyArg_ParseTuple(
        args, "s#s#i",
        &subsequence, &subseq_len,
        &sequence, &seq_len,
        &max_substitutions
    )) {
        return NULL;
    }

    if (seq_len < subseq_len) {
        Py_RETURN_FALSE;
    }

    /* A negative value used to reach the division below: -1 divided by zero,
     * anything smaller produced a negative n-gram length. */
    if (max_substitutions < 0) {
        PyErr_SetString(PyExc_ValueError,
            "max_substitutions must not be negative!"
        );
        return NULL;
    }

    /* Same condition as "ngram_len == 0", but tested before the division so
     * that max_substitutions + 1 can never overflow. */
    if (max_substitutions >= subseq_len) {
        PyErr_SetString(PyExc_ValueError,
            "The subsequence's length must be greater than max_substitutions!"
        );
        return NULL;
    }

    ngram_len = subseq_len / (max_substitutions + 1);
    subseq_end = subsequence + subseq_len;

    /* One KMP failure table, reused for every n-gram (all have ngram_len). */
    kmpNext = (int *) malloc((size_t) ngram_len * sizeof(int));
    if (kmpNext == NULL) {
        return PyErr_NoMemory();
    }

    for (ngram_start = 0; ngram_start <= subseq_len - ngram_len; ngram_start += ngram_len) {
        subseq_len_after_ngram = subseq_len - (ngram_start + ngram_len);

        /* Search only the part of the sequence where a hit leaves room for
         * the ngram_start bytes before the n-gram and the
         * subseq_len_after_ngram bytes after it. */
        preKMP(subsequence + ngram_start, ngram_len, kmpNext);
        kmp_state = KMP_init(subsequence + ngram_start,
                             ngram_len,
                             sequence + ngram_start,
                             seq_len - ngram_start - subseq_len_after_ngram,
                             kmpNext);

        match_ptr = KMP_find_next(&kmp_state);
        while (match_ptr != NULL) {
            /* Remaining budget + 1: reaching zero means the candidate has
             * more than max_substitutions mismatches. */
            n_differences = max_substitutions + 1;

            /* Compare the part before the n-gram, walking backwards. */
            subseq_ptr = subsequence + ngram_start;
            seq_ptr = match_ptr;
            while (subseq_ptr != subsequence && n_differences) {
                n_differences -= *(--subseq_ptr) != *(--seq_ptr);
            }

            if (n_differences) {
                /* Compare the part after the n-gram, walking forwards. */
                subseq_ptr = subseq_end - subseq_len_after_ngram;
                seq_ptr = match_ptr + ngram_len;
                while (subseq_ptr != subseq_end && n_differences) {
                    n_differences -= (*subseq_ptr++) != (*seq_ptr++);
                }

                if (n_differences) {
                    free(kmpNext);
                    Py_RETURN_TRUE;
                }
            }

            match_ptr = KMP_find_next(&kmp_state);
        }
    }

    free(kmpNext);
    Py_RETURN_FALSE;
}
|
||||
|
||||
|
||||
static PyMethodDef substitutions_only_methods[] = {
|
||||
{"substitutions_only_has_near_matches_byteslike",
|
||||
substitutions_only_has_near_matches_byteslike,
|
||||
METH_VARARGS,
|
||||
"DOCSTRING."},
|
||||
{"substitutions_only_has_near_matches_ngrams_byteslike",
|
||||
substitutions_only_has_near_matches_ngrams_byteslike,
|
||||
METH_VARARGS,
|
||||
"DOCSTRING."},
|
||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
|
3
setup.py
3
setup.py
|
@ -17,7 +17,8 @@ history = open('HISTORY.rst').read().replace('.. :changelog:', '')
|
|||
|
||||
_substitutions_only_module = Extension(
|
||||
'fuzzysearch._substitutions_only',
|
||||
sources=['fuzzysearch/_substitutions_only.c'],
|
||||
sources=['fuzzysearch/_substitutions_only.c', 'fuzzysearch/kmp.c'],
|
||||
include_dirs=['.'],
|
||||
)
|
||||
_common_module = Extension(
|
||||
'fuzzysearch._common',
|
||||
|
|
|
@ -298,7 +298,9 @@ class TestHasNearMatchSubstitionsOnly(TestHasNearMatchSubstitionsOnlyBase,
|
|||
|
||||
try:
|
||||
from fuzzysearch._substitutions_only import \
|
||||
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
|
||||
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike, \
|
||||
substitutions_only_has_near_matches_ngrams_byteslike as \
|
||||
hnm_subs_ngrams_byteslike
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
|
@ -309,3 +311,9 @@ else:
|
|||
def search(self, subsequence, sequence, max_subs):
    """Delegate the search to the C bytes-like implementation."""
    found = hnm_subs_byteslike(subsequence, sequence, max_subs)
    return found
|
||||
|
||||
class TestHasNearMatchesSubstitionsNgramsByteslike(
        TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
    """Run the shared substitutions-only test cases against the C n-gram
    bytes-like implementation."""

    def search(self, subsequence, sequence, max_subs):
        """Delegate the search to the C n-gram bytes-like implementation."""
        found = hnm_subs_ngrams_byteslike(subsequence, sequence, max_subs)
        return found
|
||||
|
|
Loading…
Reference in New Issue