From 8be3d79a00aa9dd002d29f996348b89e24736d3e Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Sat, 27 Jun 2020 22:17:57 +0300 Subject: [PATCH] fix handling of inputs in bytes-only C extension functions --- MANIFEST.in | 1 + src/fuzzysearch/_c_ext_base.h | 32 +++++ src/fuzzysearch/_common.c | 126 ++++++++++-------- src/fuzzysearch/_generic_search.c | 126 +++++++++++++----- src/fuzzysearch/_levenshtein_ngrams.c | 67 +++++++--- src/fuzzysearch/_substitutions_only.c | 17 +-- .../_substitutions_only_lp_template.h | 82 ++++++------ .../_substitutions_only_ngrams_template.h | 67 +++++----- src/fuzzysearch/generic_search.py | 24 ++-- src/fuzzysearch/substitutions_only.py | 69 ++++------ 10 files changed, 361 insertions(+), 250 deletions(-) create mode 100644 src/fuzzysearch/_c_ext_base.h diff --git a/MANIFEST.in b/MANIFEST.in index aff3e9b..e4de98b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,6 +4,7 @@ include HISTORY.rst include LICENSE include README.rst include src/fuzzysearch/memmem.h +include src/fuzzysearch/_c_ext_base.h include src/fuzzysearch/_substitutions_only_lp_template.h include src/fuzzysearch/_substitutions_only_ngrams_template.h include src/fuzzysearch/wordlen_memmem.h diff --git a/src/fuzzysearch/_c_ext_base.h b/src/fuzzysearch/_c_ext_base.h new file mode 100644 index 0000000..10208c9 --- /dev/null +++ b/src/fuzzysearch/_c_ext_base.h @@ -0,0 +1,32 @@ +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef unlikely + #ifdef __GNUC__ + /* Test for GCC > 2.95 */ + #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) + #else /* __GNUC__ > 2 ... */ + #define likely(x) (x) + #define unlikely(x) (x) + #endif /* __GNUC__ > 2 ... 
*/ + #else /* __GNUC__ */ + #define likely(x) (x) + #define unlikely(x) (x) + #endif /* __GNUC__ */ +#endif + + +inline static int is_simple_buffer(Py_buffer pybuf) { + return ( + pybuf.itemsize == 1 && + pybuf.ndim == 1 && + (pybuf.strides == NULL || pybuf.strides[0] == 1) && + pybuf.suboffsets == NULL + ); +} diff --git a/src/fuzzysearch/_common.c b/src/fuzzysearch/_common.c index a93b4e6..8018a2b 100644 --- a/src/fuzzysearch/_common.c +++ b/src/fuzzysearch/_common.c @@ -1,45 +1,17 @@ -#define PY_SSIZE_T_CLEAN -#include +#include "src/fuzzysearch/_c_ext_base.h" #include "src/fuzzysearch/memmem.h" -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -#ifdef __GNUC__ - /* Test for GCC > 2.95 */ - #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) - #define likely(x) __builtin_expect(!!(x), 1) - #define unlikely(x) __builtin_expect(!!(x), 0) - #else /* __GNUC__ > 2 ... */ - #define likely(x) (x) - #define unlikely(x) (x) - #endif /* __GNUC__ > 2 ... */ -#else /* __GNUC__ */ - #define likely(x) (x) - #define unlikely(x) (x) -#endif /* __GNUC__ */ - - -#ifdef IS_PY3K - #define ARG_TYPES_DEF "y#y#|ll:search_exact_byteslike" -#else - #if PY_HEX_VERSION >= 0x02070000 - #define ARG_TYPES_DEF "t#t#|ll:search_exact_byteslike" - #else - #define ARG_TYPES_DEF "s#s#|ll:search_exact_byteslike" - #endif -#endif static PyObject * search_exact_byteslike(PyObject *self, PyObject *args, PyObject *kwdict) { /* input params */ - const char *subseq, *seq; - Py_ssize_t subseq_len, seq_len; + Py_buffer subseq_pybuf, seq_pybuf; Py_ssize_t start_index=0, end_index=-1; static char *kwlist[] = {"subsequence", "sequence", "start_index", "end_index", NULL}; + const char *subseq, *seq; + Py_ssize_t subseq_len, seq_len; PyObject *results; PyObject *next_result; size_t next_match_index; @@ -47,36 +19,55 @@ search_exact_byteslike(PyObject *self, PyObject *args, PyObject *kwdict) { char *next_match_ptr; if (unlikely(!PyArg_ParseTupleAndKeywords( - args, kwdict, ARG_TYPES_DEF, kwlist, - 
&subseq, &subseq_len, - &seq, &seq_len, + args, kwdict, +#ifdef IS_PY3K + "y*y*|ll:search_exact_byteslike", +#else + "s*s*|ll:search_exact_byteslike", +#endif + kwlist, + &subseq_pybuf, + &seq_pybuf, &start_index, &end_index ))) { return NULL; } + if (unlikely(!( + is_simple_buffer(subseq_pybuf) && + is_simple_buffer(seq_pybuf) + ))) { + PyErr_SetString(PyExc_TypeError, "only contiguous sequences of single-byte values are supported"); + goto error; + } + + subseq = (const char*)(subseq_pybuf.buf); + seq = (const char*)(seq_pybuf.buf); + subseq_len = subseq_pybuf.len; + seq_len = seq_pybuf.len; + /* this is required because simple_memmem_with_needle_sum() returns the haystack if the needle is empty */ if (unlikely(subseq_len == 0)) { PyErr_SetString(PyExc_ValueError, "subsequence must not be empty"); - return NULL; + goto error; } if (unlikely(start_index < 0)) { PyErr_SetString(PyExc_ValueError, "start_index must be non-negative"); - return NULL; + goto error; } if (end_index == -1) end_index = seq_len; if (unlikely(end_index < 0)) { PyErr_SetString(PyExc_ValueError, "end_index must be non-negative"); - return NULL; + goto error; } results = PyList_New(0); if (unlikely(!results)) { - return NULL; + goto error; } seq_len = (end_index < seq_len ? end_index : seq_len); @@ -84,13 +75,14 @@ search_exact_byteslike(PyObject *self, PyObject *args, PyObject *kwdict) { seq_len -= (start_index <= seq_len ? 
start_index : seq_len); if (unlikely(seq_len < subseq_len)) { - return results; + next_match_ptr = NULL; + } else { + subseq_sum = calc_sum(subseq, subseq_len); + next_match_ptr = simple_memmem_with_needle_sum(seq, seq_len, + subseq, subseq_len, + subseq_sum); } - subseq_sum = calc_sum(subseq, subseq_len); - next_match_ptr = simple_memmem_with_needle_sum(seq, seq_len, - subseq, subseq_len, - subseq_sum); while (next_match_ptr != NULL) { next_match_index = (const char *)next_match_ptr - seq; #ifdef IS_PY3K @@ -99,10 +91,12 @@ search_exact_byteslike(PyObject *self, PyObject *args, PyObject *kwdict) { next_result = PyInt_FromLong(next_match_index + start_index); #endif if (unlikely(next_result == NULL)) { + Py_DECREF(results); goto error; } if (unlikely(PyList_Append(results, next_result) == -1)) { Py_DECREF(next_result); + Py_DECREF(results); goto error; } Py_DECREF(next_result); @@ -113,10 +107,13 @@ search_exact_byteslike(PyObject *self, PyObject *args, PyObject *kwdict) { subseq_sum); } + PyBuffer_Release(&subseq_pybuf); + PyBuffer_Release(&seq_pybuf); return results; error: - Py_DECREF(results); + PyBuffer_Release(&subseq_pybuf); + PyBuffer_Release(&seq_pybuf); return NULL; } @@ -125,35 +122,45 @@ static PyObject * count_differences_with_maximum_byteslike(PyObject *self, PyObject *args) { /* input params */ - const char *seq1, *seq2; - Py_ssize_t seq1_len, seq2_len; + Py_buffer seq1_pybuf, seq2_pybuf; int max_differences; + const char *seq1, *seq2; + Py_ssize_t seq1_len, seq2_len; Py_ssize_t i; int n_differences; if (!PyArg_ParseTuple( args, #ifdef IS_PY3K - "y#y#i", + "y*y*i", #else - #if PY_HEX_VERSION >= 0x02070000 - "t#t#i", - #else - "s#s#i", - #endif + "s*s*i", #endif - &seq1, &seq1_len, - &seq2, &seq2_len, + &seq1_pybuf, + &seq2_pybuf, &max_differences )) { return NULL; } + if (unlikely(!( + is_simple_buffer(seq1_pybuf) && + is_simple_buffer(seq2_pybuf) + ))) { + PyErr_SetString(PyExc_TypeError, "only contiguous sequences of single-byte values are 
supported"); + goto error; + } + + seq1 = (const char*)(seq1_pybuf.buf); + seq2 = (const char*)(seq2_pybuf.buf); + seq1_len = seq1_pybuf.len; + seq2_len = seq2_pybuf.len; + if (seq1_len != seq2_len) { PyErr_SetString(PyExc_ValueError, "The lengths of the given sequences must be equal."); - return NULL; + goto error; } n_differences = max_differences; @@ -163,7 +170,14 @@ count_differences_with_maximum_byteslike(PyObject *self, PyObject *args) ++seq2; } + PyBuffer_Release(&seq1_pybuf); + PyBuffer_Release(&seq2_pybuf); return PyLong_FromLong((long) (max_differences - n_differences)); + +error: + PyBuffer_Release(&seq1_pybuf); + PyBuffer_Release(&seq2_pybuf); + return NULL; } static PyMethodDef _common_methods[] = { diff --git a/src/fuzzysearch/_generic_search.c b/src/fuzzysearch/_generic_search.c index a2de482..cb23186 100644 --- a/src/fuzzysearch/_generic_search.c +++ b/src/fuzzysearch/_generic_search.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.29.14 */ +/* Generated by Cython 0.29.20 */ #define PY_SSIZE_T_CLEAN #include "Python.h" @@ -7,8 +7,8 @@ #elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) #error Cython requires Python 2.6+ or Python 3.3+. 
#else -#define CYTHON_ABI "0_29_14" -#define CYTHON_HEX_VERSION 0x001D0EF0 +#define CYTHON_ABI "0_29_20" +#define CYTHON_HEX_VERSION 0x001D14F0 #define CYTHON_FUTURE_DIVISION 0 #include <stddef.h> #ifndef offsetof @@ -484,8 +484,10 @@ static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { #define PyString_Type PyUnicode_Type #define PyString_Check PyUnicode_Check #define PyString_CheckExact PyUnicode_CheckExact +#ifndef PyObject_Unicode #define PyObject_Unicode PyObject_Str #endif +#endif #if PY_MAJOR_VERSION >= 3 #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) @@ -496,6 +498,13 @@ static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { #ifndef PySet_CheckExact #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) #endif +#if PY_VERSION_HEX >= 0x030900A4 + #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size) +#else + #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size) +#endif #if CYTHON_ASSUME_SAFE_MACROS #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) #else @@ -576,11 +585,10 @@ static CYTHON_INLINE float __PYX_NAN() { #define __Pyx_truncl truncl #endif - +#define __PYX_MARK_ERR_POS(f_index, lineno) \ + { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; } #define __PYX_ERR(f_index, lineno, Ln_error) \ -{ \ - __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \ -} + { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; } #ifndef __PYX_EXTERN_C #ifdef __cplusplus @@ -1078,7 +1086,7 @@ static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) { if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) { Py_INCREF(x); PyList_SET_ITEM(list, len, x); - Py_SIZE(list) = len+1; + 
__Pyx_SET_SIZE(list, len + 1); return 0; } return PyList_Append(list, x); @@ -1093,7 +1101,7 @@ static CYTHON_INLINE void __Pyx_RaiseClosureNameError(const char *varname); /* FetchCommonType.proto */ static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type); -/* CythonFunction.proto */ +/* CythonFunctionShared.proto */ #define __Pyx_CyFunction_USED 1 #define __Pyx_CYFUNCTION_STATICMETHOD 0x01 #define __Pyx_CYFUNCTION_CLASSMETHOD 0x02 @@ -1121,6 +1129,7 @@ typedef struct { PyObject *func_classobj; void *defaults; int defaults_pyobjects; + size_t defaults_size; // used by FusedFunction for copying defaults int flags; PyObject *defaults_tuple; PyObject *defaults_kwdict; @@ -1129,9 +1138,7 @@ typedef struct { } __pyx_CyFunctionObject; static PyTypeObject *__pyx_CyFunctionType = 0; #define __Pyx_CyFunction_Check(obj) (__Pyx_TypeCheck(obj, __pyx_CyFunctionType)) -#define __Pyx_CyFunction_NewEx(ml, flags, qualname, self, module, globals, code)\ - __Pyx_CyFunction_New(__pyx_CyFunctionType, ml, flags, qualname, self, module, globals, code) -static PyObject *__Pyx_CyFunction_New(PyTypeObject *, PyMethodDef *ml, +static PyObject *__Pyx_CyFunction_Init(__pyx_CyFunctionObject* op, PyMethodDef *ml, int flags, PyObject* qualname, PyObject *self, PyObject *module, PyObject *globals, @@ -1147,6 +1154,13 @@ static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m, PyObject *dict); static int __pyx_CyFunction_init(void); +/* CythonFunction.proto */ +static PyObject *__Pyx_CyFunction_New(PyMethodDef *ml, + int flags, PyObject* qualname, + PyObject *closure, + PyObject *module, PyObject *globals, + PyObject* code); + /* GetException.proto */ #if CYTHON_FAST_THREAD_STATE #define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) @@ -1435,6 +1449,9 @@ static PyObject *__pyx_pw_11fuzzysearch_15_generic_search_1c_find_near_matches_g PyObject *__pyx_v_subsequence = 0; PyObject *__pyx_v_sequence = 0; PyObject *__pyx_v_search_params = 
0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("c_find_near_matches_generic_linear_programming (wrapper)", 0); @@ -1528,6 +1545,9 @@ static PyObject *__pyx_pf_11fuzzysearch_15_generic_search_c_find_near_matches_ge unsigned int __pyx_t_16; unsigned int __pyx_t_17; unsigned int __pyx_t_18; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("c_find_near_matches_generic_linear_programming", 0); /* "fuzzysearch/_generic_search.pyx":36 @@ -1881,6 +1901,9 @@ static PyObject *__pyx_pw_11fuzzysearch_15_generic_search_47_c_find_near_matches PyObject *__pyx_v_start = 0; PyObject *__pyx_v_end = 0; PyObject *__pyx_v_dist = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("add_match (wrapper)", 0); @@ -1961,6 +1984,9 @@ static PyObject *__pyx_pf_11fuzzysearch_15_generic_search_47_c_find_near_matches Py_ssize_t __pyx_t_7; Py_ssize_t __pyx_t_8; int __pyx_t_9; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("add_match", 0); __pyx_outer_scope = (struct __pyx_obj_11fuzzysearch_15_generic_search___pyx_scope_struct____pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_generic_linear_programming *) __Pyx_CyFunction_GetClosure(__pyx_self); __pyx_cur_scope = __pyx_outer_scope; @@ -2101,6 +2127,9 @@ static PyObject *__pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_ge PyObject *__pyx_t_25 = NULL; PyObject *__pyx_t_26 = NULL; PyObject *__pyx_t_27 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_c_find_near_matches_generic_linear_programming", 0); __pyx_cur_scope = (struct 
__pyx_obj_11fuzzysearch_15_generic_search___pyx_scope_struct____pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_generic_linear_programming *)__pyx_tp_new_11fuzzysearch_15_generic_search___pyx_scope_struct____pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_generic_linear_programming(__pyx_ptype_11fuzzysearch_15_generic_search___pyx_scope_struct____pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_generic_linear_programming, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_cur_scope)) { @@ -2258,7 +2287,7 @@ static PyObject *__pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_ge * matches.append(Match(start, end, dist, matched=sequence[start:end])) * */ - __pyx_t_5 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11fuzzysearch_15_generic_search_47_c_find_near_matches_generic_linear_programming_1add_match, 0, __pyx_n_s_c_find_near_matches_generic_lin, ((PyObject*)__pyx_cur_scope), __pyx_n_s_fuzzysearch__generic_search, __pyx_d, ((PyObject *)__pyx_codeobj__3)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 90, __pyx_L1_error) + __pyx_t_5 = __Pyx_CyFunction_New(&__pyx_mdef_11fuzzysearch_15_generic_search_47_c_find_near_matches_generic_linear_programming_1add_match, 0, __pyx_n_s_c_find_near_matches_generic_lin, ((PyObject*)__pyx_cur_scope), __pyx_n_s_fuzzysearch__generic_search, __pyx_d, ((PyObject *)__pyx_codeobj__3)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 90, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __pyx_v_add_match = __pyx_t_5; __pyx_t_5 = 0; @@ -3455,6 +3484,9 @@ static PyObject *__pyx_pw_11fuzzysearch_15_generic_search_3c_find_near_matches_g PyObject *__pyx_v_subsequence = 0; PyObject *__pyx_v_sequence = 0; PyObject *__pyx_v_search_params = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("c_find_near_matches_generic_ngrams (wrapper)", 0); @@ -3567,6 +3599,9 @@ static PyObject 
*__pyx_pf_11fuzzysearch_15_generic_search_2c_find_near_matches_g PyObject *__pyx_t_20 = NULL; PyObject *__pyx_t_21 = NULL; int __pyx_t_22; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("c_find_near_matches_generic_ngrams", 0); /* "fuzzysearch/_generic_search.pyx":248 @@ -4722,6 +4757,9 @@ static int __Pyx_modinit_function_export_code(void) { static int __Pyx_modinit_type_init_code(void) { __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); /*--- Type init code ---*/ if (PyType_Ready(&__pyx_type_11fuzzysearch_15_generic_search___pyx_scope_struct____pyx_f_11fuzzysearch_15_generic_search__c_find_near_matches_generic_linear_programming) < 0) __PYX_ERR(0, 61, __pyx_L1_error) @@ -4764,17 +4802,19 @@ static int __Pyx_modinit_function_import_code(void) { } -#if PY_MAJOR_VERSION < 3 -#ifdef CYTHON_NO_PYINIT_EXPORT -#define __Pyx_PyMODINIT_FUNC void -#else +#ifndef CYTHON_NO_PYINIT_EXPORT #define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#elif PY_MAJOR_VERSION < 3 +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" void +#else +#define __Pyx_PyMODINIT_FUNC void #endif #else -#ifdef CYTHON_NO_PYINIT_EXPORT -#define __Pyx_PyMODINIT_FUNC PyObject * +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" PyObject * #else -#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#define __Pyx_PyMODINIT_FUNC PyObject * #endif #endif @@ -4857,6 +4897,9 @@ static CYTHON_SMALL_CODE int __pyx_pymod_exec__generic_search(PyObject *__pyx_py { PyObject *__pyx_t_1 = NULL; PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannyDeclarations #if CYTHON_PEP489_MULTI_PHASE_INIT if (__pyx_m) { @@ -4945,14 +4988,14 @@ if (!__Pyx_RefNanny) { } #endif /*--- Builtin init code ---*/ - if (__Pyx_InitCachedBuiltins() < 0) goto __pyx_L1_error; + 
if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) /*--- Constants init code ---*/ - if (__Pyx_InitCachedConstants() < 0) goto __pyx_L1_error; + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) /*--- Global type/function init code ---*/ (void)__Pyx_modinit_global_init_code(); (void)__Pyx_modinit_variable_export_code(); (void)__Pyx_modinit_function_export_code(); - if (unlikely(__Pyx_modinit_type_init_code() != 0)) goto __pyx_L1_error; + if (unlikely(__Pyx_modinit_type_init_code() < 0)) __PYX_ERR(0, 1, __pyx_L1_error) (void)__Pyx_modinit_type_import_code(); (void)__Pyx_modinit_variable_import_code(); (void)__Pyx_modinit_function_import_code(); @@ -5194,7 +5237,7 @@ static int __Pyx_ParseOptionalKeywords( } name = first_kw_arg; #if PY_MAJOR_VERSION < 3 - if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) { + if (likely(PyString_Check(key))) { while (*name) { if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) && _PyString_Eq(**name, key)) { @@ -5221,7 +5264,7 @@ static int __Pyx_ParseOptionalKeywords( while (*name) { int cmp = (**name == key) ? 0 : #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 - (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 : + (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 : #endif PyUnicode_Compare(**name, key); if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; @@ -5237,7 +5280,7 @@ static int __Pyx_ParseOptionalKeywords( while (argname != first_kw_arg) { int cmp = (**argname == key) ? 0 : #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 - (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 : + (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 
1 : #endif PyUnicode_Compare(**argname, key); if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; @@ -5844,7 +5887,7 @@ bad: goto done; } -/* CythonFunction */ +/* CythonFunctionShared */ #include <structmember.h> static PyObject * __Pyx_CyFunction_get_doc(__pyx_CyFunctionObject *op, CYTHON_UNUSED void *closure) @@ -6151,10 +6194,9 @@ static PyMethodDef __pyx_CyFunction_methods[] = { #else #define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func.m_weakreflist) #endif -static PyObject *__Pyx_CyFunction_New(PyTypeObject *type, PyMethodDef *ml, int flags, PyObject* qualname, - PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) { - __pyx_CyFunctionObject *op = PyObject_GC_New(__pyx_CyFunctionObject, type); - if (op == NULL) +static PyObject *__Pyx_CyFunction_Init(__pyx_CyFunctionObject *op, PyMethodDef *ml, int flags, PyObject* qualname, + PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) { + if (unlikely(op == NULL)) return NULL; op->flags = flags; __Pyx_CyFunction_weakreflist(op) = NULL; @@ -6175,12 +6217,12 @@ static PyObject *__Pyx_CyFunction_New(PyTypeObject *type, PyMethodDef *ml, int f Py_XINCREF(code); op->func_code = code; op->defaults_pyobjects = 0; + op->defaults_size = 0; op->defaults = NULL; op->defaults_tuple = NULL; op->defaults_kwdict = NULL; op->defaults_getter = NULL; op->func_annotations = NULL; - PyObject_GC_Track(op); return (PyObject *) op; } static int @@ -6428,6 +6470,7 @@ static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *func, size_t return PyErr_NoMemory(); memset(m->defaults, 0, size); m->defaults_pyobjects = pyobjects; + m->defaults_size = size; return m->defaults; } static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *func, PyObject *tuple) { @@ -6446,6 +6489,19 @@ static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *func, Py Py_INCREF(dict); } +/* CythonFunction */ +static PyObject *__Pyx_CyFunction_New(PyMethodDef *ml, int flags, PyObject* qualname, + 
PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) { + PyObject *op = __Pyx_CyFunction_Init( + PyObject_GC_New(__pyx_CyFunctionObject, __pyx_CyFunctionType), + ml, flags, qualname, closure, module, globals, code + ); + if (likely(op)) { + PyObject_GC_Track(op); + } + return op; +} + /* GetException */ #if CYTHON_FAST_THREAD_STATE static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) @@ -6681,7 +6737,7 @@ static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { { #if PY_MAJOR_VERSION >= 3 if (level == -1) { - if (strchr(__Pyx_MODULE_NAME, '.')) { + if ((1) && (strchr(__Pyx_MODULE_NAME, '.'))) { module = PyImport_ImportModuleLevelObject( name, global_dict, empty_dict, list, 1); if (!module) { @@ -6732,7 +6788,7 @@ static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) { /* CLineInTraceback */ #ifndef CYTHON_CLINE_IN_TRACEBACK -static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line) { +static int __Pyx_CLineForTraceback(CYTHON_NCP_UNUSED PyThreadState *tstate, int c_line) { PyObject *use_cline; PyObject *ptype, *pvalue, *ptraceback; #if CYTHON_COMPILING_IN_CPYTHON @@ -6836,7 +6892,7 @@ static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { if (__pyx_code_cache.count == __pyx_code_cache.max_count) { int new_max = __pyx_code_cache.max_count + 64; entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( - __pyx_code_cache.entries, (size_t)new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry)); if (unlikely(!entries)) { return; } diff --git a/src/fuzzysearch/_levenshtein_ngrams.c b/src/fuzzysearch/_levenshtein_ngrams.c index 763fd1b..d6738ee 100644 --- a/src/fuzzysearch/_levenshtein_ngrams.c +++ b/src/fuzzysearch/_levenshtein_ngrams.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.29.14 */ +/* Generated by Cython 0.29.20 */ #define PY_SSIZE_T_CLEAN #include 
"Python.h" @@ -7,8 +7,8 @@ #elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) #error Cython requires Python 2.6+ or Python 3.3+. #else -#define CYTHON_ABI "0_29_14" -#define CYTHON_HEX_VERSION 0x001D0EF0 +#define CYTHON_ABI "0_29_20" +#define CYTHON_HEX_VERSION 0x001D14F0 #define CYTHON_FUTURE_DIVISION 0 #include #ifndef offsetof @@ -484,8 +484,10 @@ static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { #define PyString_Type PyUnicode_Type #define PyString_Check PyUnicode_Check #define PyString_CheckExact PyUnicode_CheckExact +#ifndef PyObject_Unicode #define PyObject_Unicode PyObject_Str #endif +#endif #if PY_MAJOR_VERSION >= 3 #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) @@ -496,6 +498,13 @@ static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { #ifndef PySet_CheckExact #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) #endif +#if PY_VERSION_HEX >= 0x030900A4 + #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size) +#else + #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size) +#endif #if CYTHON_ASSUME_SAFE_MACROS #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) #else @@ -576,11 +585,10 @@ static CYTHON_INLINE float __PYX_NAN() { #define __Pyx_truncl truncl #endif - +#define __PYX_MARK_ERR_POS(f_index, lineno) \ + { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; } #define __PYX_ERR(f_index, lineno, Ln_error) \ -{ \ - __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \ -} + { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; } #ifndef __PYX_EXTERN_C #ifdef __cplusplus @@ -1243,6 +1251,9 @@ static PyObject 
*__pyx_pw_11fuzzysearch_19_levenshtein_ngrams_1c_expand_short(Py PyObject *__pyx_v_subsequence = 0; PyObject *__pyx_v_sequence = 0; PyObject *__pyx_v_max_l_dist = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("c_expand_short (wrapper)", 0); @@ -1349,6 +1360,9 @@ static PyObject *__pyx_pf_11fuzzysearch_19_levenshtein_ngrams_c_expand_short(CYT PyObject *__pyx_t_23 = NULL; PyObject *__pyx_t_24 = NULL; PyObject *__pyx_t_25 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("c_expand_short", 0); /* "fuzzysearch/_levenshtein_ngrams.pyx":28 @@ -1938,6 +1952,9 @@ static PyObject *__pyx_pw_11fuzzysearch_19_levenshtein_ngrams_3c_expand_long(PyO PyObject *__pyx_v_subsequence = 0; PyObject *__pyx_v_sequence = 0; PyObject *__pyx_v_max_l_dist = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("c_expand_long (wrapper)", 0); @@ -2051,6 +2068,9 @@ static PyObject *__pyx_pf_11fuzzysearch_19_levenshtein_ngrams_2c_expand_long(CYT PyObject *__pyx_t_26 = NULL; PyObject *__pyx_t_27 = NULL; PyObject *__pyx_t_28 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannySetupContext("c_expand_long", 0); /* "fuzzysearch/_levenshtein_ngrams.pyx":87 @@ -3123,17 +3143,19 @@ static int __Pyx_modinit_function_import_code(void) { } -#if PY_MAJOR_VERSION < 3 -#ifdef CYTHON_NO_PYINIT_EXPORT -#define __Pyx_PyMODINIT_FUNC void -#else +#ifndef CYTHON_NO_PYINIT_EXPORT #define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#elif PY_MAJOR_VERSION < 3 +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" void +#else +#define __Pyx_PyMODINIT_FUNC void #endif #else -#ifdef CYTHON_NO_PYINIT_EXPORT -#define __Pyx_PyMODINIT_FUNC PyObject * +#ifdef __cplusplus 
+#define __Pyx_PyMODINIT_FUNC extern "C" PyObject * #else -#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#define __Pyx_PyMODINIT_FUNC PyObject * #endif #endif @@ -3215,6 +3237,9 @@ static CYTHON_SMALL_CODE int __pyx_pymod_exec__levenshtein_ngrams(PyObject *__py #endif { PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; __Pyx_RefNannyDeclarations #if CYTHON_PEP489_MULTI_PHASE_INIT if (__pyx_m) { @@ -3303,9 +3328,9 @@ if (!__Pyx_RefNanny) { } #endif /*--- Builtin init code ---*/ - if (__Pyx_InitCachedBuiltins() < 0) goto __pyx_L1_error; + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) /*--- Constants init code ---*/ - if (__Pyx_InitCachedConstants() < 0) goto __pyx_L1_error; + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) /*--- Global type/function init code ---*/ (void)__Pyx_modinit_global_init_code(); (void)__Pyx_modinit_variable_export_code(); @@ -3503,7 +3528,7 @@ static int __Pyx_ParseOptionalKeywords( } name = first_kw_arg; #if PY_MAJOR_VERSION < 3 - if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) { + if (likely(PyString_Check(key))) { while (*name) { if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) && _PyString_Eq(**name, key)) { @@ -3530,7 +3555,7 @@ static int __Pyx_ParseOptionalKeywords( while (*name) { int cmp = (**name == key) ? 0 : #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 - (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 : + (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 : #endif PyUnicode_Compare(**name, key); if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; @@ -3546,7 +3571,7 @@ static int __Pyx_ParseOptionalKeywords( while (argname != first_kw_arg) { int cmp = (**argname == key) ? 0 : #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 - (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 
1 : + (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 : #endif PyUnicode_Compare(**argname, key); if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; @@ -4262,7 +4287,7 @@ static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UIN /* CLineInTraceback */ #ifndef CYTHON_CLINE_IN_TRACEBACK -static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line) { +static int __Pyx_CLineForTraceback(CYTHON_NCP_UNUSED PyThreadState *tstate, int c_line) { PyObject *use_cline; PyObject *ptype, *pvalue, *ptraceback; #if CYTHON_COMPILING_IN_CPYTHON @@ -4366,7 +4391,7 @@ static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { if (__pyx_code_cache.count == __pyx_code_cache.max_count) { int new_max = __pyx_code_cache.max_count + 64; entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( - __pyx_code_cache.entries, (size_t)new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry)); if (unlikely(!entries)) { return; } diff --git a/src/fuzzysearch/_substitutions_only.c b/src/fuzzysearch/_substitutions_only.c index ddf7437..6ee66de 100644 --- a/src/fuzzysearch/_substitutions_only.c +++ b/src/fuzzysearch/_substitutions_only.c @@ -1,15 +1,10 @@ -#define PY_SSIZE_T_CLEAN -#include <Python.h> - -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif +#include "src/fuzzysearch/_c_ext_base.h" #define DECLARE_VARS #define PREPARE #define OUTPUT_VALUE(x) DO_FREES; Py_RETURN_TRUE -#define RETURN_AT_END Py_RETURN_FALSE +#define RETURN_AT_END DO_FREES; Py_RETURN_FALSE #define FUNCTION_NAME substitutions_only_has_near_matches_lp_byteslike #include "src/fuzzysearch/_substitutions_only_lp_template.h" #undef FUNCTION_NAME @@ -31,21 +26,21 @@ #define PREPARE \ results = PyList_New(0); \ if (unlikely(!results)) \ - return NULL; + goto error; #define OUTPUT_VALUE(x) do { \ next_result = PyInt_FromSsize_t((x)); \ if (unlikely(next_result == NULL)) { \ Py_DECREF(results); 
\ - return NULL; \ + goto error; \ } \ if (unlikely(PyList_Append(results, next_result) == -1)) { \ Py_DECREF(next_result); \ Py_DECREF(results); \ - return NULL; \ + goto error; \ } \ Py_DECREF(next_result); \ } while(0) -#define RETURN_AT_END return results +#define RETURN_AT_END DO_FREES; return results #define FUNCTION_NAME substitutions_only_find_near_matches_lp_byteslike #include "src/fuzzysearch/_substitutions_only_lp_template.h" #undef FUNCTION_NAME diff --git a/src/fuzzysearch/_substitutions_only_lp_template.h b/src/fuzzysearch/_substitutions_only_lp_template.h index 982a1db..3b8be4b 100644 --- a/src/fuzzysearch/_substitutions_only_lp_template.h +++ b/src/fuzzysearch/_substitutions_only_lp_template.h @@ -1,50 +1,34 @@ -#ifdef __GNUC__ - /* Test for GCC > 2.95 */ - #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) - #define likely(x) __builtin_expect(!!(x), 1) - #define unlikely(x) __builtin_expect(!!(x), 0) - #else /* __GNUC__ > 2 ... */ - #define likely(x) (x) - #define unlikely(x) (x) - #endif /* __GNUC__ > 2 ... 
*/ -#else /* __GNUC__ */ - #define likely(x) (x) - #define unlikely(x) (x) -#endif /* __GNUC__ */ +#define DO_FREES \ + free(sub_counts); \ + PyBuffer_Release(&subseq_pybuf); \ + PyBuffer_Release(&seq_pybuf) -#define DO_FREES free(sub_counts) - static PyObject * FUNCTION_NAME(PyObject *self, PyObject *args) { /* input params */ + Py_buffer subseq_pybuf, seq_pybuf; + int max_substitutions_input; + const char *subsequence; const char *sequence; Py_ssize_t subseq_len, seq_len; - int max_substitutions_input; unsigned int max_substitutions; - - unsigned int *sub_counts; + unsigned int *sub_counts = NULL; Py_ssize_t seq_idx, subseq_idx, count_idx; DECLARE_VARS; -#ifdef IS_PY3K - #define ARGSPEC "y#y#i" -#else - #if PY_HEX_VERSION >= 0x02070000 - #define ARGSPEC "t#t#i" - #else - #define ARGSPEC "s#s#i" - #endif -#endif - if (unlikely(!PyArg_ParseTuple( args, - ARGSPEC, - &subsequence, &subseq_len, - &sequence, &seq_len, +#ifdef IS_PY3K + "y*y*i", +#else + "s*s*i", +#endif + &subseq_pybuf, + &seq_pybuf, &max_substitutions_input ))) { return NULL; @@ -52,18 +36,32 @@ FUNCTION_NAME(PyObject *self, PyObject *args) if (unlikely(max_substitutions_input < 0)) { PyErr_SetString(PyExc_ValueError, "max_l_dist must be non-negative"); - return NULL; + goto error; } + /// TODO: check for overflow max_substitutions = (unsigned int) max_substitutions_input; + if (unlikely(!( + is_simple_buffer(subseq_pybuf) && + is_simple_buffer(seq_pybuf) + ))) { + PyErr_SetString(PyExc_TypeError, "only contiguous sequences of single-byte values are supported"); + goto error; + } + + subsequence = (const char*)(subseq_pybuf.buf); + sequence = (const char*)(seq_pybuf.buf); + subseq_len = subseq_pybuf.len; + seq_len = seq_pybuf.len; + if (unlikely(subseq_len < 0 || seq_len < 0)) { PyErr_SetString(PyExc_Exception, "an unknown error occurred"); - return NULL; + goto error; } if (unlikely(subseq_len == 0)) { PyErr_SetString(PyExc_ValueError, "subsequence must not be empty"); - return NULL; + goto error; } 
PREPARE; @@ -72,11 +70,6 @@ FUNCTION_NAME(PyObject *self, PyObject *args) RETURN_AT_END; } - sub_counts = (unsigned int *) malloc (sizeof(unsigned int) * subseq_len); - if (sub_counts == NULL) { - return PyErr_NoMemory(); - } - if (unlikely(max_substitutions >= subseq_len)) { for (seq_idx = 0; seq_idx <= seq_len - subseq_len; ++seq_idx) { OUTPUT_VALUE(seq_idx); @@ -84,6 +77,12 @@ FUNCTION_NAME(PyObject *self, PyObject *args) RETURN_AT_END; } + sub_counts = (unsigned int *) malloc (sizeof(unsigned int) * subseq_len); + if (sub_counts == NULL) { + DO_FREES; + return PyErr_NoMemory(); + } + for (seq_idx = 0; seq_idx < subseq_len - 1; ++seq_idx) { sub_counts[seq_idx] = 0; for (subseq_idx = 0; subseq_idx <= seq_idx; ++subseq_idx) { @@ -108,8 +107,11 @@ FUNCTION_NAME(PyObject *self, PyObject *args) sub_counts[count_idx] = 0; } - DO_FREES; RETURN_AT_END; + +error: + DO_FREES; + return NULL; } #undef DO_FREES diff --git a/src/fuzzysearch/_substitutions_only_ngrams_template.h b/src/fuzzysearch/_substitutions_only_ngrams_template.h index 0bf0b10..067240a 100644 --- a/src/fuzzysearch/_substitutions_only_ngrams_template.h +++ b/src/fuzzysearch/_substitutions_only_ngrams_template.h @@ -1,33 +1,22 @@ #include "src/fuzzysearch/memmem.h" -#ifdef __GNUC__ - /* Test for GCC > 2.95 */ - #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)) - #define likely(x) __builtin_expect(!!(x), 1) - #define unlikely(x) __builtin_expect(!!(x), 0) - #else /* __GNUC__ > 2 ... */ - #define likely(x) (x) - #define unlikely(x) (x) - #endif /* __GNUC__ > 2 ... 
*/ -#else /* __GNUC__ */ - #define likely(x) (x) - #define unlikely(x) (x) -#endif /* __GNUC__ */ +#define DO_FREES \ + PyBuffer_Release(&subseq_pybuf); \ + PyBuffer_Release(&seq_pybuf) -#define DO_FREES - static PyObject * FUNCTION_NAME(PyObject *self, PyObject *args) { /* input params */ + Py_buffer subseq_pybuf, seq_pybuf; + int max_substitutions_input; + const char *subsequence; const char *sequence; Py_ssize_t subseq_len, seq_len; - int max_substitutions_input; unsigned int max_substitutions; - unsigned int ngram_len, ngram_start, subseq_len_after_ngram; const char *match_ptr, *seq_ptr, *subseq_ptr, *subseq_end; int subseq_sum; @@ -35,21 +24,15 @@ FUNCTION_NAME(PyObject *self, PyObject *args) DECLARE_VARS; -#ifdef IS_PY3K - #define ARGSPEC "y#y#i" -#else - #if PY_HEX_VERSION >= 0x02070000 - #define ARGSPEC "t#t#i" - #else - #define ARGSPEC "s#s#i" - #endif -#endif - if (unlikely(!PyArg_ParseTuple( args, - ARGSPEC, - &subsequence, &subseq_len, - &sequence, &seq_len, +#ifdef IS_PY3K + "y*y*i", +#else + "s*s*i", +#endif + &subseq_pybuf, + &seq_pybuf, &max_substitutions_input ))) { return NULL; @@ -57,20 +40,34 @@ FUNCTION_NAME(PyObject *self, PyObject *args) if (unlikely(max_substitutions_input < 0)) { PyErr_SetString(PyExc_ValueError, "max_l_dist must be non-negative"); - return NULL; + goto error; } + /// TODO: check for overflow max_substitutions = (unsigned int) max_substitutions_input; + if (unlikely(!( + is_simple_buffer(subseq_pybuf) && + is_simple_buffer(seq_pybuf) + ))) { + PyErr_SetString(PyExc_TypeError, "only contiguous sequences of single-byte values are supported"); + goto error; + } + + subsequence = (const char*)(subseq_pybuf.buf); + sequence = (const char*)(seq_pybuf.buf); + subseq_len = subseq_pybuf.len; + seq_len = seq_pybuf.len; + if (unlikely(subseq_len < 0 || seq_len < 0)) { PyErr_SetString(PyExc_Exception, "an unknown error occurred"); - return NULL; + goto error; } /* this is required because simple_memmem_with_needle_sum() returns the 
haystack if the needle is empty */ if (unlikely(subseq_len == 0)) { PyErr_SetString(PyExc_ValueError, "subsequence must not be empty"); - return NULL; + goto error; } PREPARE; @@ -138,6 +135,10 @@ FUNCTION_NAME(PyObject *self, PyObject *args) } RETURN_AT_END; + +error: + DO_FREES; + return NULL; } #undef DO_FREES diff --git a/src/fuzzysearch/generic_search.py b/src/fuzzysearch/generic_search.py index 16cc7ca..838e965 100644 --- a/src/fuzzysearch/generic_search.py +++ b/src/fuzzysearch/generic_search.py @@ -1,10 +1,11 @@ from collections import namedtuple +from functools import wraps import attr from fuzzysearch.common import FuzzySearchBase, Match, \ consolidate_overlapping_matches -from fuzzysearch.compat import text_type, xrange +from fuzzysearch.compat import xrange from fuzzysearch.search_exact import search_exact @@ -184,20 +185,15 @@ except ImportError: find_near_matches_generic_linear_programming = \ _find_near_matches_generic_linear_programming else: + @wraps(_find_near_matches_generic_linear_programming) def find_near_matches_generic_linear_programming(subsequence, sequence, search_params): - if not ( - isinstance(subsequence, text_type) or - isinstance(sequence, text_type) - ): - try: - for match in c_fnm_generic_lp(subsequence, sequence, search_params): - yield match - except TypeError: - pass - - for match in _find_near_matches_generic_linear_programming( - subsequence, sequence, search_params): - yield match + try: + for match in c_fnm_generic_lp(subsequence, sequence, search_params): + yield match + except (TypeError, UnicodeEncodeError): + for match in _find_near_matches_generic_linear_programming( + subsequence, sequence, search_params): + yield match def find_near_matches_generic_ngrams(subsequence, sequence, search_params): diff --git a/src/fuzzysearch/substitutions_only.py b/src/fuzzysearch/substitutions_only.py index f7089f4..24ebbdb 100644 --- a/src/fuzzysearch/substitutions_only.py +++ b/src/fuzzysearch/substitutions_only.py @@ -4,7 +4,6 @@ 
from functools import wraps from fuzzysearch.common import FuzzySearchBase, Match, \ count_differences_with_maximum, get_best_match_in_group, group_matches -from fuzzysearch.compat import text_type from fuzzysearch.search_exact import search_exact @@ -246,51 +245,41 @@ else: @wraps(py_has_near_match_substitutions_ngrams) def has_near_match_substitutions_ngrams(subsequence, sequence, max_substitutions): - if not ( - isinstance(subsequence, text_type) or - isinstance(sequence, text_type) - ): - try: - return substitutions_only_has_near_matches_ngrams_byteslike( - subsequence, sequence, max_substitutions) - except TypeError: - pass - - return py_has_near_match_substitutions_ngrams( - subsequence, sequence, max_substitutions) + try: + return substitutions_only_has_near_matches_ngrams_byteslike( + subsequence, sequence, max_substitutions) + except (TypeError, UnicodeEncodeError): + return py_has_near_match_substitutions_ngrams( + subsequence, sequence, max_substitutions) py_find_near_matches_substitutions_ngrams = \ find_near_matches_substitutions_ngrams @wraps(py_find_near_matches_substitutions_ngrams) def find_near_matches_substitutions_ngrams(subsequence, sequence, max_substitutions): - if not ( - isinstance(subsequence, text_type) or - isinstance(sequence, text_type) - ): - try: - results = _subs_only_fnm_ngram_byteslike( - subsequence, sequence, max_substitutions) - except TypeError: - pass - else: - matches = [ - Match( - index, - index + len(subsequence), - count_differences_with_maximum( - sequence[index:index+len(subsequence)], - subsequence, - max_substitutions + 1, - ), - matched=sequence[index:index + len(subsequence)], - ) - for index in results - ] - return [ - get_best_match_in_group(group) - for group in group_matches(matches) - ] + try: + results = _subs_only_fnm_ngram_byteslike( + subsequence, sequence, max_substitutions) + except (TypeError, UnicodeEncodeError): + pass + else: + matches = [ + Match( + index, + index + len(subsequence), + 
count_differences_with_maximum( + sequence[index:index+len(subsequence)], + subsequence, + max_substitutions + 1, + ), + matched=sequence[index:index + len(subsequence)], + ) + for index in results + ] + return [ + get_best_match_in_group(group) + for group in group_matches(matches) + ] return py_find_near_matches_substitutions_ngrams( subsequence, sequence, max_substitutions)