implement process.extractOne in C++ ()

* start to simplify complexion

* start implementation

* add extractOne to C++

* fix a couple of bugs in the implementation

* start addressing performance issues
This commit is contained in:
Max Bachmann 2020-11-15 20:18:46 +01:00 committed by GitHub
parent eee513f2c5
commit 426fbb24e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 825 additions and 237 deletions

View File

@ -9,11 +9,11 @@ on:
jobs:
test_python:
name: run linting, tests and benchmarks for the python module
runs-on: ubuntu-latest
name: linting and tests on Python ${{ matrix.python-version }}
runs-on: ubuntu-18.04
strategy:
matrix:
python-version: [2.7, 3.5, 3.6, 3.7, 3.8]
python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9]
steps:
- uses: actions/checkout@v2
@ -41,7 +41,7 @@ jobs:
- name: Run Unit Tests
run: |
pip install .
pip install pytest
pip install pytest hypothesis
pytest

4
.gitignore vendored
View File

@ -15,3 +15,7 @@ site/
# benchmark results
bench_results/
# Hypothesis results
.hypothesis/

View File

@ -1 +1 @@
0.12.5
0.13.0

View File

@ -17,7 +17,7 @@ class BuildExt(build_ext):
"""A custom build extension for adding compiler-specific options."""
c_opts = {
'msvc': ['/EHsc', '/O2', '/std:c++14'],
'unix': ['-O3', '-std=c++14'],
'unix': ['-O3', '-std=c++14', '-Wextra', '-Wall'],
}
l_opts = {
'msvc': [],

View File

@ -21,10 +21,14 @@ bool valid_str(PyObject* str, const char* name)
Py_InitModule3(#name, methods, doc); \
}
using python_string =
mpark::variant<std::basic_string<uint8_t>, std::basic_string<Py_UNICODE>,
rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<Py_UNICODE>>;
using python_string_view =
mpark::variant<rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<Py_UNICODE>>;
python_string_view decode_python_string(PyObject* py_str)
python_string decode_python_string(PyObject* py_str)
{
if (PyObject_TypeCheck(py_str, &PyString_Type)) {
Py_ssize_t len = PyString_GET_SIZE(py_str);
@ -38,12 +42,27 @@ python_string_view decode_python_string(PyObject* py_str)
}
}
PyObject* encode_python_string(std::basic_string<uint8_t> str)
/*
 * Python 2: wrap the character buffer of py_str in a string view without copying.
 * Objects that pass the PyString_Type check are viewed as uint8_t data, everything
 * else is read through the PyUnicode accessors as Py_UNICODE data.
 * The view points into py_str, so py_str has to stay alive while the view is used.
 */
python_string_view decode_python_string_view(PyObject* py_str)
{
    const bool is_byte_string = PyObject_TypeCheck(py_str, &PyString_Type);

    if (!is_byte_string) {
        Py_UNICODE* unicode_data = PyUnicode_AS_UNICODE(py_str);
        Py_ssize_t unicode_len = PyUnicode_GET_SIZE(py_str);
        return rapidfuzz::basic_string_view<Py_UNICODE>(unicode_data, unicode_len);
    }

    uint8_t* byte_data = reinterpret_cast<uint8_t*>(PyString_AS_STRING(py_str));
    Py_ssize_t byte_len = PyString_GET_SIZE(py_str);
    return rapidfuzz::basic_string_view<uint8_t>(byte_data, byte_len);
}
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint8_t> str)
{
return PyString_FromStringAndSize(reinterpret_cast<const char*>(str.data()), str.size());
}
PyObject* encode_python_string(std::basic_string<Py_UNICODE> str)
PyObject* encode_python_string(rapidfuzz::basic_string_view<Py_UNICODE> str)
{
return PyUnicode_FromUnicode(str.data(), str.size());
}

View File

@ -6,12 +6,6 @@
#include "details/types.hpp"
#include <variant/variant.hpp>
// PEP 623 deprecates legacy strings and therefor
// deprecates e.g. PyUnicode_READY in Python 3.10
#if PY_VERSION_HEX < 0x030A0000
#define PY_BELOW_3_10
#endif
bool valid_str(PyObject* str, const char* name)
{
if (!PyUnicode_Check(str)) {
@ -19,7 +13,9 @@ bool valid_str(PyObject* str, const char* name)
return false;
}
#ifdef PY_BELOW_3_10
// PEP 623 deprecates legacy strings and therefore
// deprecates e.g. PyUnicode_READY in Python 3.10
#if PY_VERSION_HEX < PYTHON_VERSION(3,10,0)
if (PyUnicode_READY(str)) {
return false;
}
@ -36,11 +32,16 @@ bool valid_str(PyObject* str, const char* name)
return PyModule_Create(&moduledef); \
}
using python_string =
mpark::variant<std::basic_string<uint8_t>, std::basic_string<uint16_t>, std::basic_string<uint32_t>,
rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<uint16_t>,
rapidfuzz::basic_string_view<uint32_t>>;
using python_string_view =
mpark::variant<rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<uint16_t>,
rapidfuzz::basic_string_view<uint32_t>>;
python_string_view decode_python_string(PyObject* py_str)
python_string decode_python_string(PyObject* py_str)
{
Py_ssize_t len = PyUnicode_GET_LENGTH(py_str);
void* str = PyUnicode_DATA(py_str);
@ -55,17 +56,32 @@ python_string_view decode_python_string(PyObject* py_str)
}
}
PyObject* encode_python_string(std::basic_string<uint8_t> str)
/*
 * Python 3: wrap the character buffer of py_str in a string view without copying.
 * The element type of the view (uint8_t, uint16_t or uint32_t) follows PyUnicode_KIND;
 * any kind other than 1 byte or 2 byte is read as 4 byte data.
 * The view points into py_str, so py_str has to stay alive while the view is used.
 */
python_string_view decode_python_string_view(PyObject* py_str)
{
    void* data = PyUnicode_DATA(py_str);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(py_str);
    const auto kind = PyUnicode_KIND(py_str);

    if (kind == PyUnicode_1BYTE_KIND) {
        return rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(data), length);
    }
    if (kind == PyUnicode_2BYTE_KIND) {
        return rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(data), length);
    }
    return rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(data), length);
}
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint8_t> str)
{
return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, str.data(), str.size());
}
PyObject* encode_python_string(std::basic_string<uint16_t> str)
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint16_t> str)
{
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, str.data(), str.size());
}
PyObject* encode_python_string(std::basic_string<uint32_t> str)
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint32_t> str)
{
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str.data(), str.size());
}

View File

@ -22,7 +22,7 @@ static inline bool non_default_process(PyObject* processor)
{
if (processor) {
if (PyCFunction_Check(processor)) {
if (PyCFunction_GetFunction(processor) == (PyCFunction)(void (*)(void))default_process) {
if (PyCFunction_GetFunction(processor) == PY_FUNC_CAST(default_process)) {
return false;
}
}
@ -31,8 +31,21 @@ static inline bool non_default_process(PyObject* processor)
return PyCallable_Check(processor);
}
/* Drop one reference for every object stored in owner_list (the list itself is left untouched). */
static inline void free_owner_list(const std::vector<PyObject*>& owner_list)
{
    for (auto it = owner_list.begin(); it != owner_list.end(); ++it) {
        Py_DecRef(*it);
    }
}
/* Forward str to rutils::default_process and wrap whatever it returns in a python_string. */
template<typename Sentence>
static inline python_string default_process_string(Sentence&& str)
{
    python_string processed = rutils::default_process(std::forward<Sentence>(str));
    return processed;
}
template <typename MatchingFunc>
static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds)
static inline PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds)
{
PyObject* py_s1;
PyObject* py_s2;
@ -50,10 +63,6 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
return PyFloat_FromDouble(0);
}
if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) {
return NULL;
}
if (non_default_process(processor)) {
PyObject* proc_s1 = PyObject_CallFunctionObjArgs(processor, py_s2, NULL);
if (proc_s1 == NULL) {
@ -66,8 +75,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
return NULL;
}
auto s1_view = decode_python_string(proc_s1);
auto s2_view = decode_python_string(proc_s2);
if (!valid_str(proc_s1, "s1") || !valid_str(proc_s2, "s2")) {
return NULL;
}
auto s1_view = decode_python_string_view(proc_s1);
auto s2_view = decode_python_string_view(proc_s2);
double result = mpark::visit(
[score_cutoff](auto&& val1, auto&& val2) {
@ -81,8 +94,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
return PyFloat_FromDouble(result);
}
auto s1_view = decode_python_string(py_s1);
auto s2_view = decode_python_string(py_s2);
if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) {
return NULL;
}
auto s1_view = decode_python_string_view(py_s1);
auto s2_view = decode_python_string_view(py_s2);
double result;
if (use_preprocessing(processor, processor_default)) {
@ -118,7 +135,24 @@ struct name##_func { \
static PyObject* name(PyObject* /*self*/, PyObject* args, PyObject* keywds) \
{ \
return fuzz_call<name##_func>(process_default, args, keywds); \
}
}
/*
 * Base class for scorers that keep both input strings between calls.
 * Derived classes implement call() and may override the setters to
 * transform a string once when it is stored.
 */
struct CachedFuzz {
    /* instances are owned and destroyed through std::unique_ptr<CachedFuzz>,
     * so the destructor has to be virtual */
    virtual ~CachedFuzz() = default;

    /* store the first string */
    virtual void str1_set(python_string str) {
        m_str1 = std::move(str);
    }

    /* store the second string */
    virtual void str2_set(python_string str) {
        m_str2 = std::move(str);
    }

    /* score the two stored strings using the given score_cutoff */
    virtual double call(double score_cutoff) = 0;

protected:
    python_string m_str1;
    python_string m_str2;
};
FUZZ_FUNC(
ratio, false,
@ -140,6 +174,17 @@ FUZZ_FUNC(
" 96.55171966552734"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::ratio. */
struct CachedRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
partial_ratio, false,
"partial_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -160,6 +205,15 @@ FUZZ_FUNC(
" 100"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::partial_ratio. */
struct CachedPartialRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::partial_ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
token_sort_ratio, true,
@ -182,6 +236,26 @@ FUZZ_FUNC(
" 100.0"
)
struct CachedTokenSortRatio : public CachedFuzz {
void str1_set(python_string str) override {
m_str1 = mpark::visit(
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
}
virtual void str2_set(python_string str) override {
m_str2 = mpark::visit(
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
}
double call(double score_cutoff) override {
return mpark::visit(
[score_cutoff](auto&& val1, auto&& val2) {
return rfuzz::ratio(val1, val2, score_cutoff);
},
m_str1, m_str2);
}
};
FUZZ_FUNC(
partial_token_sort_ratio, true,
"partial_token_sort_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -200,6 +274,26 @@ FUZZ_FUNC(
" float: ratio between s1 and s2 as a float between 0 and 100"
)
struct CachedPartialTokenSortRatio : public CachedFuzz {
void str1_set(python_string str) override {
m_str1 = mpark::visit(
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
}
virtual void str2_set(python_string str) override {
m_str2 = mpark::visit(
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
}
double call(double score_cutoff) override {
return mpark::visit(
[score_cutoff](auto&& val1, auto&& val2) {
return rfuzz::partial_ratio(val1, val2, score_cutoff);
},
m_str1, m_str2);
}
};
FUZZ_FUNC(
token_set_ratio, true,
"token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -224,6 +318,16 @@ FUZZ_FUNC(
" 100.0"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::token_set_ratio. */
struct CachedTokenSetRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::token_set_ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
partial_token_set_ratio, true,
"partial_token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -243,6 +347,16 @@ FUZZ_FUNC(
" float: ratio between s1 and s2 as a float between 0 and 100"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::partial_token_set_ratio. */
struct CachedPartialTokenSetRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::partial_token_set_ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
token_ratio, true,
"token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -262,6 +376,16 @@ FUZZ_FUNC(
" float: ratio between s1 and s2 as a float between 0 and 100"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::token_ratio. */
struct CachedTokenRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::token_ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
partial_token_ratio, true,
"partial_token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -282,6 +406,16 @@ FUZZ_FUNC(
" float: ratio between s1 and s2 as a float between 0 and 100"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::partial_token_ratio. */
struct CachedPartialTokenRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::partial_token_ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
WRatio, true,
"WRatio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -300,6 +434,16 @@ FUZZ_FUNC(
" float: ratio between s1 and s2 as a float between 0 and 100"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::WRatio. */
struct CachedWRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::WRatio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
QRatio, true,
"QRatio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -321,6 +465,16 @@ FUZZ_FUNC(
" 96.55171966552734"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::QRatio. */
struct CachedQRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::QRatio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
FUZZ_FUNC(
quick_lev_ratio, true,
"quick_lev_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -343,7 +497,15 @@ FUZZ_FUNC(
" float: ratio between s1 and s2 as a float between 0 and 100"
)
/* CachedFuzz adapter that scores the stored strings with rfuzz::quick_lev_ratio. */
struct CachedQuickLevRatio : public CachedFuzz {
    double call(double score_cutoff) override
    {
        const auto compare = [score_cutoff](auto&& lhs, auto&& rhs) {
            return rfuzz::quick_lev_ratio(lhs, rhs, score_cutoff);
        };
        return mpark::visit(compare, m_str1, m_str2);
    }
};
constexpr const char* default_process_docstring = R"()";
@ -360,13 +522,391 @@ static PyObject* default_process(PyObject* /*self*/, PyObject* args, PyObject* k
return NULL;
}
auto sentence_view = decode_python_string(py_sentence);
PyObject* processed = mpark::visit(
[](auto&& val1) {
return encode_python_string(rutils::default_process(val1));},
sentence_view);
/* this is pretty verbose. However it is faster than std::variant + std::visit */
#ifdef PYTHON_2
if (PyObject_TypeCheck(py_sentence, &PyString_Type)) {
Py_ssize_t len = PyString_GET_SIZE(py_sentence);
char* str = PyString_AS_STRING(py_sentence);
auto proc_str = rutils::default_process(rapidfuzz::basic_string_view<char>(str, len));
return PyString_FromStringAndSize(proc_str.data(), proc_str.size());
}
else {
Py_ssize_t len = PyUnicode_GET_SIZE(py_sentence);
const Py_UNICODE* str = PyUnicode_AS_UNICODE(py_sentence);
auto proc_str = rutils::default_process(rapidfuzz::basic_string_view<Py_UNICODE>(str, len));
return PyUnicode_FromUnicode(proc_str.data(), proc_str.size());
}
#else /* Python 3 */
Py_ssize_t len = PyUnicode_GET_LENGTH(py_sentence);
void* str = PyUnicode_DATA(py_sentence);
switch (PyUnicode_KIND(py_sentence)) {
case PyUnicode_1BYTE_KIND:
{
auto proc_str = rutils::default_process(
rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(str), len));
return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, proc_str.data(), proc_str.size());
}
case PyUnicode_2BYTE_KIND:
{
auto proc_str = rutils::default_process(
rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(str), len));
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, proc_str.data(), proc_str.size());
}
default:
{
auto proc_str = rutils::default_process(
rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(str), len));
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, proc_str.data(), proc_str.size());
}
}
#endif
}
/*
 * Turn py_str into the string that should be scored and store it in proc_str.
 *
 * - With a custom callable processor the processor is called on py_str. Its
 *   result has to pass valid_str(); on success the new object is appended to
 *   owner_list (the caller releases it later) and proc_str is decoded from it.
 * - Otherwise py_str itself has to pass valid_str(). Depending on
 *   use_preprocessing(processor, processor_default) it is either run through
 *   default_process_string or decoded unchanged.
 *
 * name is forwarded to valid_str().
 * Returns true on success and false when the processor call failed or a
 * string was rejected by valid_str().
 */
static inline bool process_string(
    PyObject* py_str, const char* name,
    PyObject* processor, bool processor_default,
    python_string& proc_str, std::vector<PyObject*>& owner_list)
{
    if (non_default_process(processor)) {
        PyObject* proc_py_str = PyObject_CallFunctionObjArgs(processor, py_str, NULL);
        if (proc_py_str == NULL) {
            return false;
        }

        if (!valid_str(proc_py_str, name)) {
            /* the processor result was not handed over to owner_list yet,
             * so it has to be released here to avoid leaking it */
            Py_DecRef(proc_py_str);
            return false;
        }

        owner_list.push_back(proc_py_str);
        proc_str = decode_python_string(proc_py_str);
        return true;
    }

    if (!valid_str(py_str, name)) {
        return false;
    }

    if (use_preprocessing(processor, processor_default)) {
        proc_str = mpark::visit(
            [](auto&& val1) { return default_process_string(val1);},
            decode_python_string(py_str));
    } else {
        proc_str = decode_python_string(py_str);
    }

    return true;
}
/*
 * Map a Python scorer object onto the matching C++ CachedFuzz implementation.
 *
 * Returns
 *   - a CachedWRatio when no scorer was passed (fuzz.WRatio is the default),
 *   - the matching Cached* instance when scorer is one of the scorer functions
 *     implemented in this file,
 *   - nullptr for every other object, in which case the caller has to call the
 *     scorer through the Python API.
 */
std::unique_ptr<CachedFuzz> get_matching_instance(PyObject* scorer)
{
    /* default is fuzz.WRatio */
    if (!scorer) {
        return std::make_unique<CachedWRatio>();
    }

    /* only C functions can be one of our own scorers */
    if (!PyCFunction_Check(scorer)) {
        return nullptr;
    }

    auto scorer_func = PyCFunction_GetFunction(scorer);
    if (scorer_func == PY_FUNC_CAST(ratio)) {
        return std::make_unique<CachedRatio>();
    } else if (scorer_func == PY_FUNC_CAST(partial_ratio)) {
        return std::make_unique<CachedPartialRatio>();
    } else if (scorer_func == PY_FUNC_CAST(token_sort_ratio)) {
        return std::make_unique<CachedTokenSortRatio>();
    } else if (scorer_func == PY_FUNC_CAST(token_set_ratio)) {
        return std::make_unique<CachedTokenSetRatio>();
    } else if (scorer_func == PY_FUNC_CAST(partial_token_sort_ratio)) {
        return std::make_unique<CachedPartialTokenSortRatio>();
    } else if (scorer_func == PY_FUNC_CAST(partial_token_set_ratio)) {
        return std::make_unique<CachedPartialTokenSetRatio>();
    } else if (scorer_func == PY_FUNC_CAST(token_ratio)) {
        return std::make_unique<CachedTokenRatio>();
    } else if (scorer_func == PY_FUNC_CAST(partial_token_ratio)) {
        return std::make_unique<CachedPartialTokenRatio>();
    } else if (scorer_func == PY_FUNC_CAST(WRatio)) {
        return std::make_unique<CachedWRatio>();
    } else if (scorer_func == PY_FUNC_CAST(QRatio)) {
        return std::make_unique<CachedQRatio>();
    } else if (scorer_func == PY_FUNC_CAST(quick_lev_ratio)) {
        /* CachedQuickLevRatio exists for this scorer but was never dispatched */
        return std::make_unique<CachedQuickLevRatio>();
    }

    /* unknown C function: call it as a python function */
    return nullptr;
}
/*
 * Generic extractOne implementation used when the scorer has no C++ counterpart
 * and therefore has to be called through the Python API.
 *
 * py_query     query object, processed with process_string()
 * py_choices   sequence of choices, or an object with an items() method
 *              (treated as mapping {<key>: <string to compare>})
 * scorer       Python callable, called as
 *              scorer(query, choice, processor=None, score_cutoff=<current cutoff>)
 * processor    forwarded to process_string()
 * score_cutoff initial threshold; it is replaced by the best score seen so far
 *
 * Returns a new reference to (choice, score) or (choice, score, key), None when
 * no choice reached the cutoff, or NULL with a Python exception set on error.
 */
static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
    PyObject* scorer, PyObject* processor, double score_cutoff)
{
    bool match_found = false;
    bool is_dict = false;
    /* both are borrowed from `choices` and stay valid only as long as it is alive */
    PyObject* result_choice = NULL;
    PyObject* choice_key = NULL;
    std::vector<PyObject*> outer_owner_list;

    PyObject* py_score_cutoff = PyFloat_FromDouble(score_cutoff);
    if (!py_score_cutoff) {
        return NULL;
    }

    /* common error exit: releases the current cutoff object and all outer references */
    const auto fail = [&]() -> PyObject* {
        Py_DecRef(py_score_cutoff);
        free_owner_list(outer_owner_list);
        return NULL;
    };

    python_string query;
    if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) {
        return fail();
    }

    py_query = mpark::visit(
        [](auto&& val) {return encode_python_string(val);},
        query);
    if (!py_query) {
        return fail();
    }
    outer_owner_list.push_back(py_query);

    /* dict like container */
    if (PyObject_HasAttrString(py_choices, "items")) {
        is_dict = true;
        py_choices = PyObject_CallMethod(py_choices, "items", NULL);
        if (!py_choices) {
            return fail();
        }
        outer_owner_list.push_back(py_choices);
    }

    PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
    if (!choices) {
        return fail();
    }
    outer_owner_list.push_back(choices);

    std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);

    for (std::size_t i = 0; i < choice_count; ++i) {
        PyObject* py_choice = NULL;
        PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);

        if (is_dict) {
            if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice))
            {
                return fail();
            }
        }

        if (py_match_choice == Py_None) {
            continue;
        }

        std::vector<PyObject*> inner_owner_list;
        python_string choice;

        if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) {
            free_owner_list(inner_owner_list);
            return fail();
        }

        PyObject* py_proc_choice = mpark::visit(
            [](auto&& val) {return encode_python_string(val);},
            choice);
        if (!py_proc_choice) {
            free_owner_list(inner_owner_list);
            return fail();
        }
        inner_owner_list.push_back(py_proc_choice);

        /* The scorers take (s1, s2, processor, score_cutoff). The strings are
         * already processed, so pass processor=None and hand over the cutoff
         * by keyword instead of letting it bind to `processor` positionally. */
        PyObject* call_args = PyTuple_Pack(2, py_query, py_proc_choice);
        PyObject* call_kwargs = Py_BuildValue("{s:O,s:O}",
            "processor", Py_None, "score_cutoff", py_score_cutoff);
        PyObject* score = NULL;
        if (call_args && call_kwargs) {
            score = PyObject_Call(scorer, call_args, call_kwargs);
        }
        Py_DecRef(call_args);
        Py_DecRef(call_kwargs);

        if (!score) {
            free_owner_list(inner_owner_list);
            return fail();
        }

        int comp = PyObject_RichCompareBool(score, py_score_cutoff, Py_GE);

        if (comp == 1) {
            /* new best match: its score becomes the cutoff for the remaining choices */
            Py_DecRef(py_score_cutoff);
            py_score_cutoff = score;
            match_found = true;
            result_choice = py_match_choice;
            choice_key = py_choice;
        } else if (comp == 0) {
            Py_DecRef(score);
        } else if (comp == -1) {
            /* the comparison itself raised */
            Py_DecRef(score);
            free_owner_list(inner_owner_list);
            return fail();
        }
        free_owner_list(inner_owner_list);
    }

    /* Build the result while `choices` still owns result_choice / choice_key.
     * Py_BuildValue takes its own references for "O" arguments. */
    PyObject* result = NULL;
    if (match_found) {
        result = is_dict
            ? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key)
            : Py_BuildValue("(OO)", result_choice, py_score_cutoff);
    }

    Py_DecRef(py_score_cutoff);
    free_owner_list(outer_owner_list);

    if (!match_found) {
        Py_RETURN_NONE;
    }

    return result;
}
/* Help text of extractOne; the first line up to "--" is the signature shown by help(). */
constexpr const char* extractOne_docstring =
"extractOne($module, query, choices, scorer = 'fuzz.WRatio', processor = 'utils.default_process', score_cutoff = 0)\n"
"--\n\n"
"Find the best match in a list of choices\n\n"
"Args:\n"
" query (str): string we want to find\n"
" choices (Iterable): list of all strings the query should be compared with or dict with a mapping\n"
" {<result>: <string to compare>}\n"
" scorer (Callable): optional callable that is used to calculate the matching score between\n"
" the query and each choice. WRatio is used by default\n"
" processor (Callable): optional callable that reformats the strings. utils.default_process\n"
" is used by default, which lowercases the strings and trims whitespace\n"
" score_cutoff (float): Optional argument for a score threshold. Matches with\n"
" a lower score than this number will not be returned. Defaults to 0\n\n"
"Returns:\n"
" Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match\n"
" in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n"
" be in the form `(<choice>, <ratio>)` when `choices` is a list of strings\n"
" or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.";
/*
 * Python entry point of extractOne(query, choices, scorer, processor, score_cutoff).
 *
 * When get_matching_instance() knows the scorer, all comparisons run in C++
 * through the returned CachedFuzz object. Otherwise the call is forwarded to
 * py_extractOne(), which calls the scorer through the Python API.
 *
 * Returns a new reference to (choice, score) or (choice, score, key), None when
 * the query is None or no choice reached score_cutoff, or NULL with a Python
 * exception set on error.
 */
static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
{
    bool match_found = false;
    bool is_dict = false;
    /* both are borrowed from `choices` and stay valid only as long as it is alive */
    PyObject* result_choice = NULL;
    PyObject* choice_key = NULL;
    double result_score = 0;
    std::vector<PyObject*> outer_owner_list;
    python_string query;

    PyObject* py_query;
    PyObject* py_choices;
    PyObject* processor = NULL;
    PyObject* py_scorer = NULL;
    double score_cutoff = 0;
    static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast<char**>(kwlist), &py_query,
                                     &py_choices, &py_scorer, &processor, &score_cutoff))
    {
        return NULL;
    }

    /* without a query there is no match; the documented result for that is None */
    if (py_query == Py_None) {
        Py_RETURN_NONE;
    }

    auto scorer = get_matching_instance(py_scorer);
    if (!scorer) {
        // todo this is mostly code duplication
        return py_extractOne(py_query, py_choices, py_scorer, processor, score_cutoff);
    }

    if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) {
        free_owner_list(outer_owner_list);
        return NULL;
    }

    /* query is not read again below, so it can be moved into the scorer */
    scorer->str1_set(std::move(query));

    /* dict like container */
    if (PyObject_HasAttrString(py_choices, "items")) {
        is_dict = true;
        py_choices = PyObject_CallMethod(py_choices, "items", NULL);
        if (!py_choices) {
            free_owner_list(outer_owner_list);
            return NULL;
        }
        outer_owner_list.push_back(py_choices);
    }

    PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
    if (!choices) {
        free_owner_list(outer_owner_list);
        return NULL;
    }
    outer_owner_list.push_back(choices);

    std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);

    for (std::size_t i = 0; i < choice_count; ++i) {
        PyObject* py_choice = NULL;
        PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);

        if (is_dict) {
            if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice))
            {
                free_owner_list(outer_owner_list);
                return NULL;
            }
        }

        if (py_match_choice == Py_None) {
            continue;
        }

        std::vector<PyObject*> inner_owner_list;
        python_string choice;

        if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) {
            free_owner_list(inner_owner_list);
            free_owner_list(outer_owner_list);
            return NULL;
        }

        /* choice is not read again in this iteration */
        scorer->str2_set(std::move(choice));
        double score = scorer->call(score_cutoff);

        if (score >= score_cutoff) {
            // increase the value by a small step so it might be able to exit early
            score_cutoff = score + (float)0.00001;
            result_score = score;
            match_found = true;
            result_choice = py_match_choice;
            choice_key = py_choice;
        }
        free_owner_list(inner_owner_list);
    }

    /* Build the result while `choices` still owns result_choice / choice_key.
     * Py_BuildValue takes its own references for "O" arguments. */
    PyObject* result = NULL;
    if (match_found) {
        result = is_dict
            ? Py_BuildValue("(OdO)", result_choice, result_score, choice_key)
            : Py_BuildValue("(Od)", result_choice, result_score);
    }

    free_owner_list(outer_owner_list);

    if (!match_found) {
        Py_RETURN_NONE;
    }

    return result;
}
static PyMethodDef methods[] = {
@ -386,6 +926,7 @@ static PyMethodDef methods[] = {
PY_METHOD(QRatio),
PY_METHOD(quick_lev_ratio),
/* process */
PY_METHOD(extractOne),
/* sentinel */
{NULL, NULL, 0, NULL}
};

View File

@ -1,106 +0,0 @@
#include "fuzz.hpp"
#include "py_utils.hpp"
#include "utils.hpp"
#include <string>
namespace rfuzz = rapidfuzz::fuzz;
namespace utils = rapidfuzz::utils;
PyObject* extractOne(PyObject* self, PyObject* args, PyObject* keywds)
{
PyObject* py_query;
PyObject* py_choices;
PyObject* processor = NULL;
PyObject* scorer = NULL;
double score_cutoff = 0;
static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL};
if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast<char**>(kwlist), &py_query,
&py_choices, &scorer, &processor, &score_cutoff))
{
return NULL;
}
if (py_query == Py_None) {
return PyFloat_FromDouble(0);
}
if (PyObject_HasAttrString(py_choices, "items")) {
}
else {
}
if (PySequence_Check(processor)) {
}
if (!valid_str(py_query, "query")) {
return NULL;
}
// if is list
PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
if (!choices) {
return NULL;
}
std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
bool match_found;
// PyObject*
// processing missing
auto query_view = decode_python_string(py_query);
for (std::size_t i = 0; i < choice_count; ++i) {
PyObject* py_choice = PySequence_Fast_GET_ITEM(choices, i);
if (py_choice == Py_None) {
continue;
}
if (!valid_str(py_choice, "choice")) {
Py_DECREF(choices);
return NULL;
}
auto choice_view = decode_python_string(py_choice);
double score = mpark::visit(
[score_cutoff](auto&& val1, auto&& val2) {
return rfuzz::WRatio(val1, val2, score_cutoff);
},
query_view, choice_view);
/*
float score;
if (preprocess) {
score = fuzz::WRatio(
cleaned_query,
utils::default_process(choice),
score_cutoff);
} else {
score = fuzz::WRatio(
cleaned_query,
std::wstring_view(choice, wcslen(choice)),
score_cutoff);
}*/
if (score >= score_cutoff) {
// increase the value by a small step so it might be able to exit early
score_cutoff = score + (float)0.00001;
match_found = true;
result_choice = choice;
}
}
Py_DECREF(choices);
if (!match_found) {
Py_RETURN_NONE;
}
if (score_cutoff > 100) {
score_cutoff = 100;
}
return Py_BuildValue("(ud)", result_choice, score_cutoff);
}

View File

@ -1,21 +1,26 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2020 Max Bachmann */
#pragma once
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <vector>
#include "utils.hpp"
#define PY_FUNC_CAST(func) ((PyCFunction)(void (*)(void))func)
/* Pack major/minor/micro into one integer that can be compared against PY_VERSION_HEX.
 * Every argument is parenthesized so that expressions can be passed safely. */
#define PYTHON_VERSION(major, minor, micro) (((major) << 24) | ((minor) << 16) | ((micro) << 8))
/* The cast of the function is necessary since PyCFunction values
* only take two PyObject* parameters, and these functions take three.
*/
#define PY_METHOD(x) \
{ \
#x, (PyCFunction)(void (*)(void))x, METH_VARARGS | METH_KEYWORDS, x##_docstring \
}
#define PY_METHOD(x) \
{ #x, PY_FUNC_CAST(x), METH_VARARGS | METH_KEYWORDS, x##_docstring }
#if PY_MAJOR_VERSION == 2
#if PY_VERSION_HEX < PYTHON_VERSION(3,0,0)
#define PYTHON_2
#include "py2_utils.hpp"
#else
#define PYTHON_3
#include "py3_utils.hpp"
#endif
#endif

@ -1 +1 @@
Subproject commit aa743d18e39a1b19f83fb745e580ab311487b727
Subproject commit 0cbbee61bd9a2401e45c96a3d3d6ab640317ccce

View File

@ -3,6 +3,6 @@ rapid string matching library
"""
__author__ = "Max Bachmann"
__license__ = "MIT"
__version__ = "0.12.5"
__version__ = "0.13.0"
from rapidfuzz import process, fuzz, levenshtein, utils
from rapidfuzz import process, fuzz, utils# levenshtein

View File

@ -3,6 +3,7 @@
# Copyright © 2011 Adam Cohen
from rapidfuzz import fuzz, utils
from rapidfuzz.cpp_impl import extractOne
import heapq
import numbers
@ -117,86 +118,3 @@ def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.defau
def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
return extract(query, choices, scorer, processor, limit, score_cutoff)
def extractOne(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0):
"""
Find the best match in a list of choices
Args:
query (str): string we want to find
choices (Iterable): list of all strings the query should be compared with or dict with a mapping
{<result>: <string to compare>}
scorer (Callable): optional callable that is used to calculate the matching score between
the query and each choice. WRatio is used by default
processor (Callable): optional callable that reformats the strings. utils.default_process
is used by default, which lowercases the strings and trims whitespace
score_cutoff (float): Optional argument for a score threshold. Matches with
a lower score than this number will not be returned. Defaults to 0
Returns:
Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is
no match with a score >= score_cutoff
Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match
in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will
be in the form`(<choice>, <ratio>)` when `choices` is a list of strings
or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
"""
if query is None:
return None
a = processor(query) if processor else query
result_score = None
result_choice = ""
if hasattr(choices, "items"):
choice_key = ""
for choice, match_choice in choices.items():
if match_choice is None:
continue
b = processor(match_choice) if processor else match_choice
score = scorer(
a, b,
processor=None,
score_cutoff=score_cutoff)
if score >= score_cutoff:
# very small increment for the score_cutoff, so when multiple
# elements have the same score the first one is used
# only done when the score is a number
if isinstance(score, numbers.Number):
score_cutoff = score + 0.00001
if score_cutoff > 100:
return (match_choice, score, choice)
else:
score_cutoff = score
result_score = score
result_choice = match_choice
choice_key = choice
return (result_choice, result_score, choice_key) if not result_score is None else None
for choice in choices:
if choice is None:
continue
b = processor(choice) if processor else choice
score = scorer(
a, b,
processor=None,
score_cutoff=score_cutoff)
if score >= score_cutoff:
if isinstance(score, numbers.Number):
score_cutoff = score + 0.00001
if score_cutoff > 100:
return (choice, score)
else:
score_cutoff = score
result_score = score
result_choice = choice
return (result_choice, result_score) if not result_score is None else None

View File

@ -5,6 +5,19 @@ import unittest
from rapidfuzz import process, fuzz, utils
scorers = [
fuzz.ratio,
fuzz.partial_ratio,
fuzz.token_sort_ratio,
fuzz.token_set_ratio,
fuzz.token_ratio,
fuzz.partial_token_sort_ratio,
fuzz.partial_token_set_ratio,
fuzz.partial_token_ratio,
fuzz.WRatio,
fuzz.QRatio
]
class RatioTest(unittest.TestCase):
def setUp(self):
self.s1 = "new york mets"
@ -87,5 +100,27 @@ class RatioTest(unittest.TestCase):
score = fuzz.QRatio(s1, s2)
self.assertEqual(0, score)
def testWithProcessor(self):
    """
    Scorers have to accept arbitrary objects for s1 and s2, provided the
    processor turns them into strings.
    """
    first_event = ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"]
    second_event = ["chicago cubs vs new york mets", "CitiFields", "2012-05-11", "9pm"]
    # only the first field of each event is compared
    pick_first = lambda event: event[0]

    for scorer in scorers:
        self.assertEqual(scorer(first_event, second_event, processor=pick_first), 100)

def testHelp(self):
    """
    Printing the help text of every scorer must not raise an exception,
    since the scorers are implemented in C++ as well.
    """
    for ratio_func in scorers:
        help(ratio_func)
# Run the unittest runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()

138
tests/test_hypothesis.py Normal file
View File

@ -0,0 +1,138 @@
from itertools import product
from functools import partial
from string import ascii_letters, digits, punctuation
from hypothesis import given, assume, settings
import hypothesis.strategies as st
import pytest
from rapidfuzz import fuzz, process, utils
import random
# ASCII letters, digits and punctuation concatenated into one alphabet string.
# NOTE(review): in the visible part of this file it is only referenced from
# inside the disabled (string-wrapped) test code.
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
# List of fuzz scorers.
# NOTE(review): no active test visible in this section of the file reads it.
SCORERS = [
fuzz.ratio,
fuzz.partial_ratio,
fuzz.token_set_ratio,
fuzz.token_sort_ratio,
fuzz.token_ratio,
fuzz.partial_token_set_ratio,
fuzz.partial_token_sort_ratio,
fuzz.partial_token_ratio,
fuzz.WRatio,
fuzz.QRatio
]
# Scorers parametrized into test_only_identical_strings_extracted, whose
# docstring describes them as full comparison methods.
FULL_SCORERS = [
fuzz.ratio,
fuzz.WRatio,
fuzz.QRatio
]
# Processors combined with FULL_SCORERS in the parametrized test:
# the identity function and utils.default_process.
PROCESSORS = [
lambda x: x,
utils.default_process
]
@given(sentence=st.text())
@settings(max_examples=200)
def test_multiple_processor_runs(sentence):
    """
    Processing an already processed sentence must give back the
    same string as processing it once.
    """
    processed_once = utils.default_process(sentence)
    processed_twice = utils.default_process(processed_once)
    assert processed_once == processed_twice
# The code below is disabled: it is wrapped in a string literal, so none of it runs.
# NOTE(review): the parametrize call refers to scorers_processors() while the
# helper inside the string is named full_scorers_processors(); it also relies on
# utils.full_process, fuzz.UWRatio/UQRatio and process.extractBests -- confirm
# that these names exist before re-enabling it.
'''
def full_scorers_processors():
"""
Generate a list of (scorer, processor) pairs for testing for scorers that use the full string only
:return: [(scorer, processor), ...]
"""
scorers = [fuzz.ratio]
processors = [lambda x: x,
partial(utils.full_process, force_ascii=False),
partial(utils.full_process, force_ascii=True)]
splist = list(product(scorers, processors))
splist.extend(
[(fuzz.WRatio, partial(utils.full_process, force_ascii=True)),
(fuzz.QRatio, partial(utils.full_process, force_ascii=True)),
(fuzz.UWRatio, partial(utils.full_process, force_ascii=False)),
(fuzz.UQRatio, partial(utils.full_process, force_ascii=False))]
)
return splist
@pytest.mark.parametrize('scorer,processor',
scorers_processors())
@given(data=st.data())
@settings(max_examples=20, deadline=5000)
def test_identical_strings_extracted(scorer, processor, data):
"""
Test that identical strings will always return a perfect match.
:param scorer:
:param processor:
:param data:
:return:
"""
# Draw a list of random strings
strings = data.draw(
st.lists(
st.text(min_size=10, max_size=100, alphabet=HYPOTHESIS_ALPHABET),
min_size=1,
max_size=10
)
)
# Draw a random integer for the index in that list
choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))
# Extract our choice from the list
choice = strings[choiceidx]
# Check process doesn't make our choice the empty string
assume(processor(choice) != '')
# Extract all perfect matches
result = process.extractBests(choice,
strings,
scorer=scorer,
processor=processor,
score_cutoff=100,
limit=None)
# Check we get a result
assert result != []
# Check the original is in the list
assert (choice, 100) in result
'''
@pytest.mark.parametrize('scorer,processor', list(product(FULL_SCORERS, PROCESSORS)))
@given(choices=st.lists(st.text(), min_size=1))
@settings(max_examples=20, deadline=5000)
def test_only_identical_strings_extracted(scorer, processor, choices):
    """
    Check that a perfect score is only reported for identical strings.

    One of the generated choices is picked as the query. Every match that
    process.extract returns with score_cutoff=100 must be equal to the
    query after both have been run through the processor.

    :param scorer: scorer under test (one of FULL_SCORERS)
    :param processor: processor under test (one of PROCESSORS)
    :param choices: non-empty list of generated strings to search in
    :return:
    """
    query = random.choice(choices)
    processed_query = processor(query)
    # discard examples whose query is empty after processing
    assume(processed_query != '')

    perfect_matches = process.extract(
        query, choices,
        scorer=scorer, processor=processor,
        score_cutoff=100, limit=None)

    assert perfect_matches != []

    for found in perfect_matches:
        assert processed_query == processor(found[0])

View File

@ -14,6 +14,20 @@ class ProcessTest(unittest.TestCase):
"braves vs mets",
]
def testExtractOneExceptions(self):
    """
    Each of the listed argument combinations must make
    process.extractOne raise a TypeError.
    """
    invalid_calls = [
        (),
        (1,),
        (1, []),
        ('', [1]),
        ('', {1: 1}),
    ]
    for call_args in invalid_calls:
        self.assertRaises(TypeError, process.extractOne, *call_args)
def testExtractExceptions(self):
    """
    Each of the listed argument combinations must make
    process.extract raise a TypeError.
    """
    invalid_calls = [
        (),
        (1,),
        (1, []),
        ('', [1]),
        ('', {1: 1}),
    ]
    for call_args in invalid_calls:
        self.assertRaises(TypeError, process.extract, *call_args)
def testGetBestChoice1(self):
query = "new york mets at atlanta braves"
best = process.extractOne(query, self.baseball_strings)
@ -35,12 +49,16 @@ class ProcessTest(unittest.TestCase):
self.assertEqual(best[0], self.baseball_strings[0])
def testWithProcessor(self):
    """
    extractOne should accept any type as long as it is a string
    after preprocessing
    """
    events = [
        ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
        ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
        ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
    ]
    # non-string query: an event list, just like the choices
    query = events[0]

    best = process.extractOne(query, events, processor=lambda event: event[0])
    # the first element of the result is the matched choice
    self.assertEqual(best[0], events[0])