diff --git a/.github/workflows/pythonbuild.yml b/.github/workflows/pythonbuild.yml index cff0fff..e2aad76 100644 --- a/.github/workflows/pythonbuild.yml +++ b/.github/workflows/pythonbuild.yml @@ -9,11 +9,11 @@ on: jobs: test_python: - name: run linting, tests and benchmarks for the python module - runs-on: ubuntu-latest + name: linting and tests on Python ${{ matrix.python-version }} + runs-on: ubuntu-18.04 strategy: matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8] + python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 @@ -41,7 +41,7 @@ jobs: - name: Run Unit Tests run: | pip install . - pip install pytest + pip install pytest hypothesis pytest diff --git a/.gitignore b/.gitignore index 3238a04..e530e50 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,7 @@ site/ # benchmark results bench_results/ + +# Hypothesis results +.hypothesis/ + diff --git a/VERSION b/VERSION index 43c2417..54d1a4f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.12.5 +0.13.0 diff --git a/setup.py b/setup.py index c31f050..16a373b 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" c_opts = { 'msvc': ['/EHsc', '/O2', '/std:c++14'], - 'unix': ['-O3', '-std=c++14'], + 'unix': ['-O3', '-std=c++14', '-Wextra', '-Wall'], } l_opts = { 'msvc': [], diff --git a/src/py2_utils.hpp b/src/py2_utils.hpp index 62e617b..469ba7e 100644 --- a/src/py2_utils.hpp +++ b/src/py2_utils.hpp @@ -21,10 +21,14 @@ bool valid_str(PyObject* str, const char* name) Py_InitModule3(#name, methods, doc); \ } +using python_string = + mpark::variant, std::basic_string, + rapidfuzz::basic_string_view, rapidfuzz::basic_string_view>; + using python_string_view = mpark::variant, rapidfuzz::basic_string_view>; -python_string_view decode_python_string(PyObject* py_str) +python_string decode_python_string(PyObject* py_str) { if (PyObject_TypeCheck(py_str, &PyString_Type)) { Py_ssize_t len = PyString_GET_SIZE(py_str); @@ -38,12 +42,27 @@ python_string_view decode_python_string(PyObject* py_str) } } -PyObject* encode_python_string(std::basic_string str) +python_string_view decode_python_string_view(PyObject* py_str) +{ + if (PyObject_TypeCheck(py_str, &PyString_Type)) { + Py_ssize_t len = PyString_GET_SIZE(py_str); + uint8_t* str = reinterpret_cast(PyString_AS_STRING(py_str)); + return rapidfuzz::basic_string_view(str, len); + } + else { + Py_ssize_t len = PyUnicode_GET_SIZE(py_str); + Py_UNICODE* str = PyUnicode_AS_UNICODE(py_str); + return rapidfuzz::basic_string_view(str, len); + } +} + + +PyObject* encode_python_string(rapidfuzz::basic_string_view str) { return PyString_FromStringAndSize(reinterpret_cast(str.data()), str.size()); } -PyObject* encode_python_string(std::basic_string str) +PyObject* encode_python_string(rapidfuzz::basic_string_view str) { return PyUnicode_FromUnicode(str.data(), str.size()); } \ No newline at end of file diff --git a/src/py3_utils.hpp b/src/py3_utils.hpp index 89e02a6..3bf0981 100644 --- a/src/py3_utils.hpp +++ b/src/py3_utils.hpp @@ -6,12 +6,6 @@ #include "details/types.hpp" #include -// PEP 623 deprecates legacy strings and therefor -// deprecates e.g. PyUnicode_READY in Python 3.10 -#if PY_VERSION_HEX < 0x030A0000 -#define PY_BELOW_3_10 -#endif - bool valid_str(PyObject* str, const char* name) { if (!PyUnicode_Check(str)) { @@ -19,7 +13,9 @@ bool valid_str(PyObject* str, const char* name) return false; } -#ifdef PY_BELOW_3_10 + // PEP 623 deprecates legacy strings and therefor + // deprecates e.g. PyUnicode_READY in Python 3.10 +#if PY_VERSION_HEX < PYTHON_VERSION(3,10,0) if (PyUnicode_READY(str)) { return false; } @@ -36,11 +32,16 @@ bool valid_str(PyObject* str, const char* name) return PyModule_Create(&moduledef); \ } +using python_string = + mpark::variant, std::basic_string, std::basic_string, + rapidfuzz::basic_string_view, rapidfuzz::basic_string_view, + rapidfuzz::basic_string_view>; + using python_string_view = mpark::variant, rapidfuzz::basic_string_view, rapidfuzz::basic_string_view>; -python_string_view decode_python_string(PyObject* py_str) +python_string decode_python_string(PyObject* py_str) { Py_ssize_t len = PyUnicode_GET_LENGTH(py_str); void* str = PyUnicode_DATA(py_str); @@ -55,17 +56,32 @@ python_string_view decode_python_string(PyObject* py_str) } } -PyObject* encode_python_string(std::basic_string str) +python_string_view decode_python_string_view(PyObject* py_str) +{ + Py_ssize_t len = PyUnicode_GET_LENGTH(py_str); + void* str = PyUnicode_DATA(py_str); + + switch (PyUnicode_KIND(py_str)) { + case PyUnicode_1BYTE_KIND: + return rapidfuzz::basic_string_view(static_cast(str), len); + case PyUnicode_2BYTE_KIND: + return rapidfuzz::basic_string_view(static_cast(str), len); + default: + return rapidfuzz::basic_string_view(static_cast(str), len); + } +} + +PyObject* encode_python_string(rapidfuzz::basic_string_view str) { return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, str.data(), str.size()); } -PyObject* encode_python_string(std::basic_string str) +PyObject* encode_python_string(rapidfuzz::basic_string_view str) { return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, str.data(), str.size()); } -PyObject* encode_python_string(std::basic_string str) +PyObject* encode_python_string(rapidfuzz::basic_string_view str) { return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str.data(), str.size()); } diff --git a/src/py_abstraction.cpp b/src/py_abstraction.cpp index 3cc22ce..937429b 100644 --- a/src/py_abstraction.cpp +++ b/src/py_abstraction.cpp @@ -22,7 +22,7 @@ static inline bool non_default_process(PyObject* processor) { if (processor) { if (PyCFunction_Check(processor)) { - if (PyCFunction_GetFunction(processor) == (PyCFunction)(void (*)(void))default_process) { + if (PyCFunction_GetFunction(processor) == PY_FUNC_CAST(default_process)) { return false; } } @@ -31,8 +31,21 @@ static inline bool non_default_process(PyObject* processor) return PyCallable_Check(processor); } +static inline void free_owner_list(const std::vector& owner_list) +{ + for (const auto owned : owner_list) { + Py_DecRef(owned); + } +} + +template +static inline python_string default_process_string(Sentence&& str) +{ + return rutils::default_process(std::forward(str)); +} + template -static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds) +static inline PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds) { PyObject* py_s1; PyObject* py_s2; @@ -50,10 +63,6 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key return PyFloat_FromDouble(0); } - if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) { - return NULL; - } - if (non_default_process(processor)) { PyObject* proc_s1 = PyObject_CallFunctionObjArgs(processor, py_s2, NULL); if (proc_s1 == NULL) { @@ -66,8 +75,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key return NULL; } - auto s1_view = decode_python_string(proc_s1); - auto s2_view = decode_python_string(proc_s2); + if (!valid_str(proc_s1, "s1") || !valid_str(proc_s2, "s2")) { + return NULL; + } + + auto s1_view = decode_python_string_view(proc_s1); + auto s2_view = decode_python_string_view(proc_s2); double result = mpark::visit( [score_cutoff](auto&& val1, auto&& val2) { @@ -81,8 +94,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key return PyFloat_FromDouble(result); } - auto s1_view = decode_python_string(py_s1); - auto s2_view = decode_python_string(py_s2); + if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) { + return NULL; + } + + auto s1_view = decode_python_string_view(py_s1); + auto s2_view = decode_python_string_view(py_s2); double result; if (use_preprocessing(processor, processor_default)) { @@ -118,7 +135,24 @@ struct name##_func { \ static PyObject* name(PyObject* /*self*/, PyObject* args, PyObject* keywds) \ { \ return fuzz_call(process_default, args, keywds); \ -} +} + +struct CachedFuzz { + virtual void str1_set(python_string str) { + m_str1 = std::move(str); + } + + virtual void str2_set(python_string str) { + m_str2 = std::move(str); + } + + virtual double call(double score_cutoff) = 0; + +protected: + python_string m_str1; + python_string m_str2; +}; + FUZZ_FUNC( ratio, false, @@ -140,6 +174,17 @@ FUZZ_FUNC( " 96.55171966552734" ) +struct CachedRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + + FUZZ_FUNC( partial_ratio, false, "partial_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -160,6 +205,15 @@ FUZZ_FUNC( " 100" ) +struct CachedPartialRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::partial_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; FUZZ_FUNC( token_sort_ratio, true, @@ -182,6 +236,26 @@ FUZZ_FUNC( " 100.0" ) +struct CachedTokenSortRatio : public CachedFuzz { + void str1_set(python_string str) override { + m_str1 = mpark::visit( + [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str); + } + + virtual void str2_set(python_string str) override { + m_str2 = mpark::visit( + [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str); + } + + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( partial_token_sort_ratio, true, "partial_token_sort_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -200,6 +274,26 @@ FUZZ_FUNC( " float: ratio between s1 and s2 as a float between 0 and 100" ) +struct CachedPartialTokenSortRatio : public CachedFuzz { + void str1_set(python_string str) override { + m_str1 = mpark::visit( + [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str); + } + + virtual void str2_set(python_string str) override { + m_str2 = mpark::visit( + [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str); + } + + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::partial_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( token_set_ratio, true, "token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -224,6 +318,16 @@ FUZZ_FUNC( " 100.0" ) +struct CachedTokenSetRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::token_set_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( partial_token_set_ratio, true, "partial_token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -243,6 +347,16 @@ FUZZ_FUNC( " float: ratio between s1 and s2 as a float between 0 and 100" ) +struct CachedPartialTokenSetRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::partial_token_set_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( token_ratio, true, "token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -262,6 +376,16 @@ FUZZ_FUNC( " float: ratio between s1 and s2 as a float between 0 and 100" ) +struct CachedTokenRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::token_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( partial_token_ratio, true, "partial_token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -282,6 +406,16 @@ FUZZ_FUNC( " float: ratio between s1 and s2 as a float between 0 and 100" ) +struct CachedPartialTokenRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::partial_token_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( WRatio, true, "WRatio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -300,6 +434,16 @@ FUZZ_FUNC( " float: ratio between s1 and s2 as a float between 0 and 100" ) +struct CachedWRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::WRatio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( QRatio, true, "QRatio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -321,6 +465,16 @@ FUZZ_FUNC( " 96.55171966552734" ) +struct CachedQRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::QRatio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; + FUZZ_FUNC( quick_lev_ratio, true, "quick_lev_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n" @@ -343,7 +497,15 @@ FUZZ_FUNC( " float: ratio between s1 and s2 as a float between 0 and 100" ) - +struct CachedQuickLevRatio : public CachedFuzz { + double call(double score_cutoff) override { + return mpark::visit( + [score_cutoff](auto&& val1, auto&& val2) { + return rfuzz::quick_lev_ratio(val1, val2, score_cutoff); + }, + m_str1, m_str2); + } +}; constexpr const char* default_process_docstring = R"()"; @@ -360,13 +522,391 @@ static PyObject* default_process(PyObject* /*self*/, PyObject* args, PyObject* k return NULL; } - auto sentence_view = decode_python_string(py_sentence); - PyObject* processed = mpark::visit( - [](auto&& val1) { - return encode_python_string(rutils::default_process(val1));}, - sentence_view); + /* this is pretty verbose. However it is faster than std::variant + std::visit */ +#ifdef PYTHON_2 + if (PyObject_TypeCheck(py_sentence, &PyString_Type)) { + Py_ssize_t len = PyString_GET_SIZE(py_sentence); + char* str = PyString_AS_STRING(py_sentence); + + auto proc_str = rutils::default_process(rapidfuzz::basic_string_view(str, len)); + return PyString_FromStringAndSize(proc_str.data(), proc_str.size()); + } + else { + Py_ssize_t len = PyUnicode_GET_SIZE(py_sentence); + const Py_UNICODE* str = PyUnicode_AS_UNICODE(py_sentence); + + auto proc_str = rutils::default_process(rapidfuzz::basic_string_view(str, len)); + return PyUnicode_FromUnicode(proc_str.data(), proc_str.size()); + } +#else /* Python 3 */ + + Py_ssize_t len = PyUnicode_GET_LENGTH(py_sentence); + void* str = PyUnicode_DATA(py_sentence); + + switch (PyUnicode_KIND(py_sentence)) { + case PyUnicode_1BYTE_KIND: + { + auto proc_str = rutils::default_process( + rapidfuzz::basic_string_view(static_cast(str), len)); + return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, proc_str.data(), proc_str.size()); + } + case PyUnicode_2BYTE_KIND: + { + auto proc_str = rutils::default_process( + rapidfuzz::basic_string_view(static_cast(str), len)); + return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, proc_str.data(), proc_str.size()); + } + default: + { + auto proc_str = rutils::default_process( + rapidfuzz::basic_string_view(static_cast(str), len)); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, proc_str.data(), proc_str.size()); + } + } +#endif +} + +static inline bool process_string( + PyObject* py_str, const char* name, + PyObject* processor, bool processor_default, + python_string& proc_str, std::vector& owner_list) +{ + if (non_default_process(processor)) { + PyObject* proc_py_str = PyObject_CallFunctionObjArgs(processor, py_str, NULL); + if ((proc_py_str == NULL) || (!valid_str(proc_py_str, name))) { + return false; + } + + owner_list.push_back(proc_py_str); + proc_str = decode_python_string(proc_py_str); + return true; + } - return processed; + if (!valid_str(py_str, name)) { + return false; + } + + if (use_preprocessing(processor, processor_default)) { + proc_str = mpark::visit( + [](auto&& val1) { return default_process_string(val1);}, + decode_python_string(py_str)); + } else { + proc_str = decode_python_string(py_str); + } + + return true; +} + + + +std::unique_ptr get_matching_instance(PyObject* scorer) +{ + if (scorer) { + if (PyCFunction_Check(scorer)) { + auto scorer_func = PyCFunction_GetFunction(scorer); + if (scorer_func == PY_FUNC_CAST(ratio)) + { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(partial_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(token_sort_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(token_set_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(partial_token_sort_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(partial_token_set_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(token_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(partial_token_ratio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(WRatio)) { + return std::make_unique(); + } else if (scorer_func == PY_FUNC_CAST(QRatio)) { + return std::make_unique(); + } + } + /* call python function */ + return nullptr; + /* default is fuzz.WRatio */ + } else { + return std::make_unique(); + } +} + + +static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices, + PyObject* scorer, PyObject* processor, double score_cutoff) +{ + bool match_found = false; + PyObject* result_choice = NULL; + PyObject* choice_key = NULL; + std::vector outer_owner_list; + + bool is_dict = false; + + PyObject* py_score_cutoff = PyFloat_FromDouble(score_cutoff); + if (!py_score_cutoff) { + return NULL; + } + + python_string query; + if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) { + Py_DecRef(py_score_cutoff); + return NULL; + } + + py_query = mpark::visit( + [](auto&& val) {return encode_python_string(val);}, + query); + + if (!py_query) { + Py_DecRef(py_score_cutoff); + free_owner_list(outer_owner_list); + return NULL; + } + outer_owner_list.push_back(py_query); + + /* dict like container */ + if (PyObject_HasAttrString(py_choices, "items")) { + is_dict = true; + py_choices = PyObject_CallMethod(py_choices, "items", NULL); + if (!py_choices) { + free_owner_list(outer_owner_list); + return NULL; + } + outer_owner_list.push_back(py_choices); + } + + PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings"); + if (!choices) { + Py_DecRef(py_score_cutoff); + free_owner_list(outer_owner_list); + return NULL; + } + outer_owner_list.push_back(choices); + + std::size_t choice_count = PySequence_Fast_GET_SIZE(choices); + + + for (std::size_t i = 0; i < choice_count; ++i) { + PyObject* py_choice = NULL; + PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i); + + if (is_dict) { + if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice)) + { + Py_DecRef(py_score_cutoff); + free_owner_list(outer_owner_list); + return NULL; + } + } + + if (py_match_choice == Py_None) { + continue; + } + + std::vector inner_owner_list; + python_string choice; + + if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) { + Py_DecRef(py_score_cutoff); + free_owner_list(outer_owner_list); + return NULL; + } + + PyObject* py_proc_choice = mpark::visit( + [](auto&& val) {return encode_python_string(val);}, + choice); + + if (!py_proc_choice) { + Py_DecRef(py_score_cutoff); + free_owner_list(outer_owner_list); + return NULL; + } + inner_owner_list.push_back(py_proc_choice); + + PyObject* score = PyObject_CallFunction(scorer, "OOO", + py_query, py_proc_choice, py_score_cutoff); + + if (!score) { + Py_DecRef(py_score_cutoff); + free_owner_list(outer_owner_list); + free_owner_list(inner_owner_list); + return NULL; + } + + int comp = PyObject_RichCompareBool(score, py_score_cutoff, Py_GE); + if (comp == 1) { + Py_DecRef(py_score_cutoff); + py_score_cutoff = score; + match_found = true; + result_choice = py_match_choice; + choice_key = py_choice; + } else if (comp == 0) { + Py_DecRef(score); + } else if (comp == -1) { + Py_DecRef(py_score_cutoff); + Py_DecRef(score); + free_owner_list(outer_owner_list); + free_owner_list(inner_owner_list); + return NULL; + } + free_owner_list(inner_owner_list); + } + + free_owner_list(outer_owner_list); + + if (!match_found) { + Py_DecRef(py_score_cutoff); + Py_RETURN_NONE; + } + + if (score_cutoff > 100) { + score_cutoff = 100; + } + + PyObject* result = is_dict + ? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key) + : Py_BuildValue("(OO)", result_choice, py_score_cutoff); + + Py_DecRef(py_score_cutoff); + return result; +} + + +constexpr const char* extractOne_docstring = + "extractOne($module, query, choices, scorer = 'fuzz.WRatio', processor = 'utils.default_process', score_cutoff = 0)\n" + "--\n\n" + "Find the best match in a list of choices\n\n" + "Args:\n" + " query (str): string we want to find\n" + " choices (Iterable): list of all strings the query should be compared with or dict with a mapping\n" + " {: }\n" + " scorer (Callable): optional callable that is used to calculate the matching score between\n" + " the query and each choice. WRatio is used by default\n" + " processor (Callable): optional callable that reformats the strings. utils.default_process\n" + " is used by default, which lowercases the strings and trims whitespace\n" + " score_cutoff (float): Optional argument for a score threshold. Matches with\n" + " a lower score than this number will not be returned. Defaults to 0\n\n" + "Returns:\n" + " Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is\n" + " no match with a score >= score_cutoff\n" + " Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match\n" + " in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n" + " be in the form`(, )` when `choices` is a list of strings\n" + " or `(, , )` when `choices` is a mapping."; + +static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds) +{ + bool match_found = false; + PyObject* result_choice = NULL; + PyObject* choice_key = NULL; + double result_score; + std::vector outer_owner_list; + python_string query; + bool is_dict = false; + + PyObject* py_query; + PyObject* py_choices; + PyObject* processor = NULL; + PyObject* py_scorer = NULL; + double score_cutoff = 0; + static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast(kwlist), &py_query, + &py_choices, &py_scorer, &processor, &score_cutoff)) + { + return NULL; + } + + if (py_query == Py_None) { + return PyFloat_FromDouble(0); + } + + auto scorer = get_matching_instance(py_scorer); + if (!scorer) { + // todo this is mostly code duplication + return py_extractOne(py_query, py_choices, py_scorer, processor, score_cutoff); + } + + if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) { + return NULL; + } + + scorer->str1_set(query); + PyObject* py_items; + + /* dict like container */ + if (PyObject_HasAttrString(py_choices, "items")) { + is_dict = true; + py_choices = PyObject_CallMethod(py_choices, "items", NULL); + if (!py_choices) { + free_owner_list(outer_owner_list); + return NULL; + } + outer_owner_list.push_back(py_choices); + } + + PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings"); + if (!choices) { + free_owner_list(outer_owner_list); + return NULL; + } + outer_owner_list.push_back(choices); + + std::size_t choice_count = PySequence_Fast_GET_SIZE(choices); + + for (std::size_t i = 0; i < choice_count; ++i) { + PyObject* py_choice = NULL; + PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i); + + if (is_dict) { + if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice)) + { + free_owner_list(outer_owner_list); + return NULL; + } + } + + if (py_match_choice == Py_None) { + continue; + } + + std::vector inner_owner_list; + python_string choice; + + if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) { + free_owner_list(outer_owner_list); + return NULL; + } + + scorer->str2_set(choice); + double score = scorer->call(score_cutoff); + + if (score >= score_cutoff) { + // increase the value by a small step so it might be able to exit early + score_cutoff = score + (float)0.00001; + result_score = score; + match_found = true; + result_choice = py_match_choice; + choice_key = py_choice; + } + free_owner_list(inner_owner_list); + } + + free_owner_list(outer_owner_list); + + if (!match_found) { + Py_RETURN_NONE; + } + + if (is_dict) { + return Py_BuildValue("(OdO)", result_choice, result_score, choice_key); + } else { + return Py_BuildValue("(Od)", result_choice, result_score); + } } static PyMethodDef methods[] = { @@ -386,6 +926,7 @@ static PyMethodDef methods[] = { PY_METHOD(QRatio), PY_METHOD(quick_lev_ratio), /* process */ + PY_METHOD(extractOne), /* sentinel */ {NULL, NULL, 0, NULL} }; diff --git a/src/py_process.cpp b/src/py_process.cpp deleted file mode 100644 index 6defca1..0000000 --- a/src/py_process.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include "fuzz.hpp" -#include "py_utils.hpp" -#include "utils.hpp" -#include - -namespace rfuzz = rapidfuzz::fuzz; -namespace utils = rapidfuzz::utils; - -PyObject* extractOne(PyObject* self, PyObject* args, PyObject* keywds) -{ - PyObject* py_query; - PyObject* py_choices; - PyObject* processor = NULL; - PyObject* scorer = NULL; - double score_cutoff = 0; - static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast(kwlist), &py_query, - &py_choices, &scorer, &processor, &score_cutoff)) - { - return NULL; - } - - if (py_query == Py_None) { - return PyFloat_FromDouble(0); - } - - if (PyObject_HasAttrString(py_choices, "items")) { - } - else { - } - - if (PySequence_Check(processor)) { - } - - if (!valid_str(py_query, "query")) { - return NULL; - } - - // if is list - - PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings"); - if (!choices) { - return NULL; - } - - std::size_t choice_count = PySequence_Fast_GET_SIZE(choices); - - bool match_found; - // PyObject* - - // processing missing - auto query_view = decode_python_string(py_query); - - for (std::size_t i = 0; i < choice_count; ++i) { - PyObject* py_choice = PySequence_Fast_GET_ITEM(choices, i); - - if (py_choice == Py_None) { - continue; - } - - if (!valid_str(py_choice, "choice")) { - Py_DECREF(choices); - return NULL; - } - - auto choice_view = decode_python_string(py_choice); - - double score = mpark::visit( - [score_cutoff](auto&& val1, auto&& val2) { - return rfuzz::WRatio(val1, val2, score_cutoff); - }, - query_view, choice_view); - /* - float score; - if (preprocess) { - score = fuzz::WRatio( - cleaned_query, - utils::default_process(choice), - score_cutoff); - } else { - score = fuzz::WRatio( - cleaned_query, - std::wstring_view(choice, wcslen(choice)), - score_cutoff); - }*/ - - if (score >= score_cutoff) { - // increase the value by a small step so it might be able to exit early - score_cutoff = score + (float)0.00001; - match_found = true; - result_choice = choice; - } - } - - Py_DECREF(choices); - - if (!match_found) { - Py_RETURN_NONE; - } - - if (score_cutoff > 100) { - score_cutoff = 100; - } - return Py_BuildValue("(ud)", result_choice, score_cutoff); -} diff --git a/src/py_utils.hpp b/src/py_utils.hpp index f089995..e325d33 100644 --- a/src/py_utils.hpp +++ b/src/py_utils.hpp @@ -1,21 +1,26 @@ /* SPDX-License-Identifier: MIT */ /* Copyright © 2020 Max Bachmann */ +#pragma once #define PY_SSIZE_T_CLEAN #include +#include +#include "utils.hpp" + +#define PY_FUNC_CAST(func) ((PyCFunction)(void (*)(void))func) + +#define PYTHON_VERSION(major, minor, micro) ((major << 24) | (minor << 16) | (micro << 8)) /* The cast of the function is necessary since PyCFunction values * only take two PyObject* parameters, and these functions take three. */ -#define PY_METHOD(x) \ - { \ -#x, (PyCFunction)(void (*)(void))x, METH_VARARGS | METH_KEYWORDS, x##_docstring \ - } +#define PY_METHOD(x) \ + { #x, PY_FUNC_CAST(x), METH_VARARGS | METH_KEYWORDS, x##_docstring } -#if PY_MAJOR_VERSION == 2 +#if PY_VERSION_HEX < PYTHON_VERSION(3,0,0) #define PYTHON_2 #include "py2_utils.hpp" #else #define PYTHON_3 #include "py3_utils.hpp" -#endif \ No newline at end of file +#endif diff --git a/src/rapidfuzz-cpp b/src/rapidfuzz-cpp index aa743d1..0cbbee6 160000 --- a/src/rapidfuzz-cpp +++ b/src/rapidfuzz-cpp @@ -1 +1 @@ -Subproject commit aa743d18e39a1b19f83fb745e580ab311487b727 +Subproject commit 0cbbee61bd9a2401e45c96a3d3d6ab640317ccce diff --git a/src/rapidfuzz/__init__.py b/src/rapidfuzz/__init__.py index 68085bd..94ebc15 100644 --- a/src/rapidfuzz/__init__.py +++ b/src/rapidfuzz/__init__.py @@ -3,6 +3,6 @@ rapid string matching library """ __author__ = "Max Bachmann" __license__ = "MIT" -__version__ = "0.12.5" +__version__ = "0.13.0" -from rapidfuzz import process, fuzz, levenshtein, utils +from rapidfuzz import process, fuzz, utils# levenshtein diff --git a/src/rapidfuzz/process.py b/src/rapidfuzz/process.py index d0c5f78..b7bb7b1 100644 --- a/src/rapidfuzz/process.py +++ b/src/rapidfuzz/process.py @@ -3,6 +3,7 @@ # Copyright © 2011 Adam Cohen from rapidfuzz import fuzz, utils +from rapidfuzz.cpp_impl import extractOne import heapq import numbers @@ -117,86 +118,3 @@ def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.defau def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0): return extract(query, choices, scorer, processor, limit, score_cutoff) - - -def extractOne(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0): - """ - Find the best match in a list of choices - - Args: - query (str): string we want to find - choices (Iterable): list of all strings the query should be compared with or dict with a mapping - {: } - scorer (Callable): optional callable that is used to calculate the matching score between - the query and each choice. WRatio is used by default - processor (Callable): optional callable that reformats the strings. utils.default_process - is used by default, which lowercases the strings and trims whitespace - score_cutoff (float): Optional argument for a score threshold. Matches with - a lower score than this number will not be returned. Defaults to 0 - - Returns: - Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is - no match with a score >= score_cutoff - Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match - in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will - be in the form`(, )` when `choices` is a list of strings - or `(, , )` when `choices` is a mapping. - """ - if query is None: - return None - - a = processor(query) if processor else query - - result_score = None - result_choice = "" - - if hasattr(choices, "items"): - choice_key = "" - for choice, match_choice in choices.items(): - if match_choice is None: - continue - b = processor(match_choice) if processor else match_choice - - score = scorer( - a, b, - processor=None, - score_cutoff=score_cutoff) - - if score >= score_cutoff: - # very small increment for the score_cutoff, so when multiple - # elements have the same score the first one is used - # only done when the score is a number - if isinstance(score, numbers.Number): - score_cutoff = score + 0.00001 - if score_cutoff > 100: - return (match_choice, score, choice) - else: - score_cutoff = score - - result_score = score - result_choice = match_choice - choice_key = choice - return (result_choice, result_score, choice_key) if not result_score is None else None - - for choice in choices: - if choice is None: - continue - b = processor(choice) if processor else choice - - score = scorer( - a, b, - processor=None, - score_cutoff=score_cutoff) - - if score >= score_cutoff: - if isinstance(score, numbers.Number): - score_cutoff = score + 0.00001 - if score_cutoff > 100: - return (choice, score) - else: - score_cutoff = score - - result_score = score - result_choice = choice - - return (result_choice, result_score) if not result_score is None else None diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py index 4d13bee..8f6a654 100644 --- a/tests/test_fuzz.py +++ b/tests/test_fuzz.py @@ -5,6 +5,19 @@ import unittest from rapidfuzz import process, fuzz, utils +scorers = [ + fuzz.ratio, + fuzz.partial_ratio, + fuzz.token_sort_ratio, + fuzz.token_set_ratio, + fuzz.token_ratio, + fuzz.partial_token_sort_ratio, + fuzz.partial_token_set_ratio, + fuzz.partial_token_ratio, + fuzz.WRatio, + fuzz.QRatio +] + class RatioTest(unittest.TestCase): def setUp(self): self.s1 = "new york mets" @@ -87,5 +100,27 @@ class RatioTest(unittest.TestCase): score = fuzz.QRatio(s1, s2) self.assertEqual(0, score) + def testWithProcessor(self): + """ + Any scorer should accept any type as s1 and s2, as long as it is a string + after preprocessing. + """ + s1 = ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"] + s2 = ["chicago cubs vs new york mets", "CitiFields", "2012-05-11", "9pm"] + + for scorer in scorers: + score = scorer(s1, s2, processor=lambda event: event[0]) + self.assertEqual(score, 100) + + def testHelp(self): + """ + test that all help texts can be printed without throwing an exception, + since they are implemented in C++ aswell + """ + + for scorer in scorers: + help(scorer) + + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py new file mode 100644 index 0000000..6f0eab3 --- /dev/null +++ b/tests/test_hypothesis.py @@ -0,0 +1,138 @@ +from itertools import product +from functools import partial +from string import ascii_letters, digits, punctuation + +from hypothesis import given, assume, settings +import hypothesis.strategies as st +import pytest + +from rapidfuzz import fuzz, process, utils +import random + + +HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation + +SCORERS = [ + fuzz.ratio, + fuzz.partial_ratio, + fuzz.token_set_ratio, + fuzz.token_sort_ratio, + fuzz.token_ratio, + fuzz.partial_token_set_ratio, + fuzz.partial_token_sort_ratio, + fuzz.partial_token_ratio, + fuzz.WRatio, + fuzz.QRatio +] + +FULL_SCORERS = [ + fuzz.ratio, + fuzz.WRatio, + fuzz.QRatio +] + +PROCESSORS = [ + lambda x: x, + utils.default_process +] + +@given(sentence=st.text()) +@settings(max_examples=200) +def test_multiple_processor_runs(sentence): + """ + Test that running a preprocessor on a sentence + a second time does not change the result + """ + assert utils.default_process(sentence) \ + == utils.default_process(utils.default_process(sentence)) + +''' + +def full_scorers_processors(): + """ + Generate a list of (scorer, processor) pairs for testing for scorers that use the full string only + :return: [(scorer, processor), ...] + """ + scorers = [fuzz.ratio] + processors = [lambda x: x, + partial(utils.full_process, force_ascii=False), + partial(utils.full_process, force_ascii=True)] + splist = list(product(scorers, processors)) + splist.extend( + [(fuzz.WRatio, partial(utils.full_process, force_ascii=True)), + (fuzz.QRatio, partial(utils.full_process, force_ascii=True)), + (fuzz.UWRatio, partial(utils.full_process, force_ascii=False)), + (fuzz.UQRatio, partial(utils.full_process, force_ascii=False))] + ) + + return splist + + +@pytest.mark.parametrize('scorer,processor', + scorers_processors()) +@given(data=st.data()) +@settings(max_examples=20, deadline=5000) +def test_identical_strings_extracted(scorer, processor, data): + """ + Test that identical strings will always return a perfect match. + :param scorer: + :param processor: + :param data: + :return: + """ + # Draw a list of random strings + strings = data.draw( + st.lists( + st.text(min_size=10, max_size=100, alphabet=HYPOTHESIS_ALPHABET), + min_size=1, + max_size=10 + ) + ) + # Draw a random integer for the index in that list + choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1))) + + # Extract our choice from the list + choice = strings[choiceidx] + + # Check process doesn't make our choice the empty string + assume(processor(choice) != '') + + # Extract all perfect matches + result = process.extractBests(choice, + strings, + scorer=scorer, + processor=processor, + score_cutoff=100, + limit=None) + + # Check we get a result + assert result != [] + + # Check the original is in the list + assert (choice, 100) in result +''' + +@pytest.mark.parametrize('scorer,processor', list(product(FULL_SCORERS, PROCESSORS))) +@given(choices=st.lists(st.text(), min_size=1)) +@settings(max_examples=20, deadline=5000) +def test_only_identical_strings_extracted(scorer, processor, choices): + """ + Test that only identical (post processing) strings score 100 on the test. + If two strings are not identical then using full comparison methods they should + not be a perfect (100) match. + :param scorer: + :param processor: + :param data: + :return: + """ + query = random.choice(choices) + assume(processor(query) != '') + + matches = process.extract(query, choices, + scorer=scorer, processor=processor, + score_cutoff=100, limit=None) + + assert matches != [] + + for match in matches: + assert processor(query) == processor(match[0]) \ No newline at end of file diff --git a/tests/test_process.py b/tests/test_process.py index 32fad2f..5c4eeee 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -14,6 +14,20 @@ class ProcessTest(unittest.TestCase): "braves vs mets", ] + def testExtractOneExceptions(self): + self.assertRaises(TypeError, process.extractOne) + self.assertRaises(TypeError, process.extractOne, 1) + self.assertRaises(TypeError, process.extractOne, 1, []) + self.assertRaises(TypeError, process.extractOne, '', [1]) + self.assertRaises(TypeError, process.extractOne, '', {1:1}) + + def testExtractExceptions(self): + self.assertRaises(TypeError, process.extract) + self.assertRaises(TypeError, process.extract, 1) + self.assertRaises(TypeError, process.extract, 1, []) + self.assertRaises(TypeError, process.extract, '', [1]) + self.assertRaises(TypeError, process.extract, '', {1:1}) + def testGetBestChoice1(self): query = "new york mets at atlanta braves" best = process.extractOne(query, self.baseball_strings) @@ -35,12 +49,16 @@ class ProcessTest(unittest.TestCase): self.assertEqual(best[0], self.baseball_strings[0]) def testWithProcessor(self): + """ + extractOne should accept any type as long as it is a string + after preprocessing + """ events = [ ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"], ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"], ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"], ] - query = "new york mets vs chicago cubs" + query = events[0] best = process.extractOne(query, events, processor=lambda event: event[0]) self.assertEqual(best[0], events[0])