Added a C implementation of the n-grams-based, substitutions-only has_near_matches for bytes-like input (hnm_subs_ngrams_byteslike)

This commit is contained in:
Tal Einat 2014-05-11 02:37:32 +03:00
parent 087beb37d6
commit e1a7b0cc30
3 changed files with 100 additions and 2 deletions

View File

@ -1,4 +1,5 @@
#include <Python.h>
#include "fuzzysearch/kmp.h"
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
@ -75,11 +76,99 @@ substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
Py_RETURN_FALSE;
}
/*
 * substitutions_only_has_near_matches_ngrams_byteslike(subsequence, sequence,
 *                                                      max_substitutions)
 *
 * Return True if `sequence` contains a window of len(subsequence) bytes that
 * differs from `subsequence` in at most `max_substitutions` positions
 * (substitutions only, no insertions or deletions); otherwise return False.
 *
 * Method: the subsequence is cut into (max_substitutions + 1) consecutive
 * n-grams of equal length.  By the pigeonhole principle, any window with at
 * most max_substitutions mismatches must contain at least one of those
 * n-grams verbatim, so each n-gram is searched for exactly (via the KMP
 * helpers from kmp.h) and every hit is then verified by counting mismatches
 * to the left and to the right of the n-gram.
 *
 * Arguments (parsed from `args` with the "s#s#i" format):
 *   subsequence        bytes-like pattern
 *   sequence           bytes-like text to search in
 *   max_substitutions  maximum number of mismatching positions allowed
 *
 * Returns a new reference to True or False, or NULL with an exception set:
 *   ValueError   if max_substitutions is negative, or if the subsequence is
 *                not longer than max_substitutions (this includes an empty
 *                subsequence);
 *   MemoryError  if the KMP table cannot be allocated.
 * As before, a sequence shorter than the subsequence yields False before
 * max_substitutions is validated.
 *
 * NOTE(review): the "s#" lengths are received into `int` variables, which is
 * only valid when PY_SSIZE_T_CLEAN is not defined before Python.h is
 * included; newer CPython versions require that macro (with Py_ssize_t
 * lengths) for '#' formats -- confirm which versions must be supported.
 */
static PyObject *
substitutions_only_has_near_matches_ngrams_byteslike(PyObject *self, PyObject *args)
{
    /* input params */
    const char *subsequence;
    const char *sequence;
    int subseq_len, seq_len, max_substitutions;

    int ngram_len, ngram_start, subseq_len_after_ngram;
    const char *match_ptr, *seq_ptr, *subseq_ptr, *subseq_end;
    int *kmpNext;
    struct KMPstate kmp_state;
    int n_differences;  /* remaining mismatch budget + 1; 0 means "too many" */

    if (!PyArg_ParseTuple(
        args, "s#s#i",
        &subsequence, &subseq_len,
        &sequence, &seq_len,
        &max_substitutions
    )) {
        return NULL;
    }

    if (seq_len < subseq_len) {
        Py_RETURN_FALSE;
    }

    /*
     * Reject negative values explicitly: -1 would make the division below
     * divide by zero, and anything smaller would yield a negative n-gram
     * length that is then used as an allocation size and search length.
     */
    if (max_substitutions < 0) {
        PyErr_SetString(PyExc_ValueError,
            "max_substitutions must not be negative!"
        );
        return NULL;
    }

    /*
     * Equivalent to "subseq_len / (max_substitutions + 1) == 0" for
     * non-negative values, but checked first so that max_substitutions + 1
     * can never overflow.
     */
    if (max_substitutions >= subseq_len) {
        PyErr_SetString(PyExc_ValueError,
            "The subsequence's length must be greater than max_substitutions!"
        );
        return NULL;
    }

    ngram_len = subseq_len / (max_substitutions + 1);  /* >= 1 here */
    subseq_end = subsequence + subseq_len;

    /* Scratch table filled by preKMP() for each n-gram in turn. */
    kmpNext = malloc((size_t) ngram_len * sizeof *kmpNext);
    if (kmpNext == NULL) {
        return PyErr_NoMemory();
    }

    for (ngram_start = 0; ngram_start <= subseq_len - ngram_len; ngram_start += ngram_len) {
        subseq_len_after_ngram = subseq_len - (ngram_start + ngram_len);

        /*
         * Search only the part of the sequence where this n-gram could sit
         * inside a full-length window: skip the first ngram_start bytes and
         * leave room for the subseq_len_after_ngram bytes that follow it.
         * This also keeps the verification loops below inside the sequence.
         */
        preKMP(subsequence + ngram_start, ngram_len, kmpNext);
        kmp_state = KMP_init(subsequence + ngram_start,
                             ngram_len,
                             sequence + ngram_start,
                             seq_len - ngram_start - subseq_len_after_ngram,
                             kmpNext);

        match_ptr = KMP_find_next(&kmp_state);
        while (match_ptr != NULL) {
            n_differences = max_substitutions + 1;

            /* Count mismatches before the n-gram, walking backwards. */
            subseq_ptr = subsequence + ngram_start;
            seq_ptr = match_ptr;
            while (subseq_ptr != subsequence && n_differences) {
                n_differences -= *(--subseq_ptr) != *(--seq_ptr);
            }

            if (n_differences) {
                /* Count mismatches after the n-gram, walking forwards. */
                subseq_ptr = subseq_end - subseq_len_after_ngram;
                seq_ptr = match_ptr + ngram_len;
                while (subseq_ptr != subseq_end && n_differences) {
                    n_differences -= (*subseq_ptr++) != (*seq_ptr++);
                }

                if (n_differences) {
                    /* Budget not exhausted: this window is a near match. */
                    free(kmpNext);
                    Py_RETURN_TRUE;
                }
            }

            match_ptr = KMP_find_next(&kmp_state);
        }
    }

    free(kmpNext);
    Py_RETURN_FALSE;
}
/*
 * Method table for this extension module.
 * Each entry lists, in order: the Python-visible function name, the C
 * function implementing it, the calling-convention flags and the docstring.
 * Both functions use METH_VARARGS, i.e. they receive their arguments as a
 * tuple (the n-grams variant unpacks it with PyArg_ParseTuple).
 * TODO: the "DOCSTRING." strings are placeholders; replace them with real
 * documentation.
 */
static PyMethodDef substitutions_only_methods[] = {
{"substitutions_only_has_near_matches_byteslike",
substitutions_only_has_near_matches_byteslike,
METH_VARARGS,
"DOCSTRING."},
/* N-grams based variant; exact n-gram search is done with the KMP helpers. */
{"substitutions_only_has_near_matches_ngrams_byteslike",
substitutions_only_has_near_matches_ngrams_byteslike,
METH_VARARGS,
"DOCSTRING."},
{NULL, NULL, 0, NULL} /* Sentinel */
};

View File

@ -17,7 +17,8 @@ history = open('HISTORY.rst').read().replace('.. :changelog:', '')
# C extension 'fuzzysearch._substitutions_only'. kmp.c is compiled into it
# because _substitutions_only.c includes "fuzzysearch/kmp.h" and calls the
# KMP helpers declared there.
# include_dirs=['.'] presumably lets that project-relative include resolve
# from the repository root -- confirm.
# NOTE(review): two sources= lines appear below (old and new line of the
# change shown together); only the second one, which adds kmp.c, is meant to
# remain.
_substitutions_only_module = Extension(
'fuzzysearch._substitutions_only',
sources=['fuzzysearch/_substitutions_only.c'],
sources=['fuzzysearch/_substitutions_only.c', 'fuzzysearch/kmp.c'],
include_dirs=['.'],
)
_common_module = Extension(
'fuzzysearch._common',

View File

@ -298,7 +298,9 @@ class TestHasNearMatchSubstitionsOnly(TestHasNearMatchSubstitionsOnlyBase,
try:
from fuzzysearch._substitutions_only import \
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike, \
substitutions_only_has_near_matches_ngrams_byteslike as \
hnm_subs_ngrams_byteslike
except ImportError:
pass
else:
@ -309,3 +311,9 @@ else:
# Forwards the arguments unchanged to the C extension function imported as
# hnm_subs_byteslike and returns its result.
# Presumably this is the hook the shared base-class tests call -- confirm.
def search(self, subsequence, sequence, max_subs):
return hnm_subs_byteslike(subsequence, sequence, max_subs)
# Test case for the C n-grams implementation imported as
# hnm_subs_ngrams_byteslike. The test methods themselves presumably come from
# TestHasNearMatchSubstitionsOnlyBase (defined outside this view) -- confirm.
class TestHasNearMatchesSubstitionsNgramsByteslike(
TestHasNearMatchSubstitionsOnlyBase,
unittest.TestCase
):
# Forwards the arguments unchanged to the C function and returns its result.
def search(self, subsequence, sequence, max_subs):
return hnm_subs_ngrams_byteslike(subsequence, sequence, max_subs)