implement process.extractOne in C++ (#53)
* start to simplify complexion * start implementation * add extractOne to C++ * fix a couple of bugs in the implementation * start addressing performance issues
This commit is contained in:
parent
eee513f2c5
commit
426fbb24e9
|
@ -9,11 +9,11 @@ on:
|
|||
|
||||
jobs:
|
||||
test_python:
|
||||
name: run linting, tests and benchmarks for the python module
|
||||
runs-on: ubuntu-latest
|
||||
name: linting and tests on Python ${{ matrix.python-version }}
|
||||
runs-on: ubuntu-18.04
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [2.7, 3.5, 3.6, 3.7, 3.8]
|
||||
python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
@ -41,7 +41,7 @@ jobs:
|
|||
- name: Run Unit Tests
|
||||
run: |
|
||||
pip install .
|
||||
pip install pytest
|
||||
pip install pytest hypothesis
|
||||
pytest
|
||||
|
||||
|
||||
|
|
|
@ -15,3 +15,7 @@ site/
|
|||
|
||||
# benchmark results
|
||||
bench_results/
|
||||
|
||||
# Hypothesis results
|
||||
.hypothesis/
|
||||
|
||||
|
|
2
setup.py
2
setup.py
|
@ -17,7 +17,7 @@ class BuildExt(build_ext):
|
|||
"""A custom build extension for adding compiler-specific options."""
|
||||
c_opts = {
|
||||
'msvc': ['/EHsc', '/O2', '/std:c++14'],
|
||||
'unix': ['-O3', '-std=c++14'],
|
||||
'unix': ['-O3', '-std=c++14', '-Wextra', '-Wall'],
|
||||
}
|
||||
l_opts = {
|
||||
'msvc': [],
|
||||
|
|
|
@ -21,10 +21,14 @@ bool valid_str(PyObject* str, const char* name)
|
|||
Py_InitModule3(#name, methods, doc); \
|
||||
}
|
||||
|
||||
using python_string =
|
||||
mpark::variant<std::basic_string<uint8_t>, std::basic_string<Py_UNICODE>,
|
||||
rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<Py_UNICODE>>;
|
||||
|
||||
using python_string_view =
|
||||
mpark::variant<rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<Py_UNICODE>>;
|
||||
|
||||
python_string_view decode_python_string(PyObject* py_str)
|
||||
python_string decode_python_string(PyObject* py_str)
|
||||
{
|
||||
if (PyObject_TypeCheck(py_str, &PyString_Type)) {
|
||||
Py_ssize_t len = PyString_GET_SIZE(py_str);
|
||||
|
@ -38,12 +42,27 @@ python_string_view decode_python_string(PyObject* py_str)
|
|||
}
|
||||
}
|
||||
|
||||
PyObject* encode_python_string(std::basic_string<uint8_t> str)
|
||||
/*
 * Wraps the character buffer of a Python 2 string object in a non-owning view.
 *
 * Objects that pass the PyString_Type check are exposed as uint8_t data, every
 * other object is read through the PyUnicode accessors as Py_UNICODE data.
 * The view borrows the object's buffer and does not copy it.
 */
python_string_view decode_python_string_view(PyObject* py_str)
{
    if (!PyObject_TypeCheck(py_str, &PyString_Type)) {
        Py_ssize_t unicode_len = PyUnicode_GET_SIZE(py_str);
        Py_UNICODE* unicode_data = PyUnicode_AS_UNICODE(py_str);
        return rapidfuzz::basic_string_view<Py_UNICODE>(unicode_data, unicode_len);
    }

    Py_ssize_t byte_len = PyString_GET_SIZE(py_str);
    uint8_t* byte_data = reinterpret_cast<uint8_t*>(PyString_AS_STRING(py_str));
    return rapidfuzz::basic_string_view<uint8_t>(byte_data, byte_len);
}
|
||||
|
||||
|
||||
/* Creates a new Python 2 `str` object from the bytes referenced by `str`. */
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint8_t> str)
{
    const char* bytes = reinterpret_cast<const char*>(str.data());
    return PyString_FromStringAndSize(bytes, str.size());
}
|
||||
|
||||
PyObject* encode_python_string(std::basic_string<Py_UNICODE> str)
|
||||
PyObject* encode_python_string(rapidfuzz::basic_string_view<Py_UNICODE> str)
|
||||
{
|
||||
return PyUnicode_FromUnicode(str.data(), str.size());
|
||||
}
|
|
@ -6,12 +6,6 @@
|
|||
#include "details/types.hpp"
|
||||
#include <variant/variant.hpp>
|
||||
|
||||
// PEP 623 deprecates legacy strings and therefore
|
||||
// deprecates e.g. PyUnicode_READY in Python 3.10
|
||||
#if PY_VERSION_HEX < 0x030A0000
|
||||
#define PY_BELOW_3_10
|
||||
#endif
|
||||
|
||||
bool valid_str(PyObject* str, const char* name)
|
||||
{
|
||||
if (!PyUnicode_Check(str)) {
|
||||
|
@ -19,7 +13,9 @@ bool valid_str(PyObject* str, const char* name)
|
|||
return false;
|
||||
}
|
||||
|
||||
#ifdef PY_BELOW_3_10
|
||||
// PEP 623 deprecates legacy strings and therefore
|
||||
// deprecates e.g. PyUnicode_READY in Python 3.10
|
||||
#if PY_VERSION_HEX < PYTHON_VERSION(3,10,0)
|
||||
if (PyUnicode_READY(str)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -36,11 +32,16 @@ bool valid_str(PyObject* str, const char* name)
|
|||
return PyModule_Create(&moduledef); \
|
||||
}
|
||||
|
||||
using python_string =
|
||||
mpark::variant<std::basic_string<uint8_t>, std::basic_string<uint16_t>, std::basic_string<uint32_t>,
|
||||
rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<uint16_t>,
|
||||
rapidfuzz::basic_string_view<uint32_t>>;
|
||||
|
||||
using python_string_view =
|
||||
mpark::variant<rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<uint16_t>,
|
||||
rapidfuzz::basic_string_view<uint32_t>>;
|
||||
|
||||
python_string_view decode_python_string(PyObject* py_str)
|
||||
python_string decode_python_string(PyObject* py_str)
|
||||
{
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(py_str);
|
||||
void* str = PyUnicode_DATA(py_str);
|
||||
|
@ -55,17 +56,32 @@ python_string_view decode_python_string(PyObject* py_str)
|
|||
}
|
||||
}
|
||||
|
||||
PyObject* encode_python_string(std::basic_string<uint8_t> str)
|
||||
/*
 * Wraps the character buffer of a Python 3 unicode object in a non-owning view.
 *
 * The element type of the view follows PyUnicode_KIND: 1-byte kind -> uint8_t,
 * 2-byte kind -> uint16_t, anything else -> uint32_t. The view borrows the
 * object's buffer and does not copy it.
 */
python_string_view decode_python_string_view(PyObject* py_str)
{
    Py_ssize_t length = PyUnicode_GET_LENGTH(py_str);
    void* data = PyUnicode_DATA(py_str);
    const auto kind = PyUnicode_KIND(py_str);

    if (kind == PyUnicode_1BYTE_KIND) {
        return rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(data), length);
    }

    if (kind == PyUnicode_2BYTE_KIND) {
        return rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(data), length);
    }

    return rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(data), length);
}
|
||||
|
||||
/* Creates a new Python unicode object from 1-byte-kind character data. */
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint8_t> str)
{
    const auto* chars = str.data();
    const auto char_count = str.size();
    return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, chars, char_count);
}
|
||||
|
||||
PyObject* encode_python_string(std::basic_string<uint16_t> str)
|
||||
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint16_t> str)
|
||||
{
|
||||
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, str.data(), str.size());
|
||||
}
|
||||
|
||||
PyObject* encode_python_string(std::basic_string<uint32_t> str)
|
||||
PyObject* encode_python_string(rapidfuzz::basic_string_view<uint32_t> str)
|
||||
{
|
||||
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str.data(), str.size());
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ static inline bool non_default_process(PyObject* processor)
|
|||
{
|
||||
if (processor) {
|
||||
if (PyCFunction_Check(processor)) {
|
||||
if (PyCFunction_GetFunction(processor) == (PyCFunction)(void (*)(void))default_process) {
|
||||
if (PyCFunction_GetFunction(processor) == PY_FUNC_CAST(default_process)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -31,8 +31,21 @@ static inline bool non_default_process(PyObject* processor)
|
|||
return PyCallable_Check(processor);
|
||||
}
|
||||
|
||||
/* Drops one reference on every object stored in `owner_list`. */
static inline void free_owner_list(const std::vector<PyObject*>& owner_list)
{
    for (auto it = owner_list.begin(); it != owner_list.end(); ++it) {
        Py_DecRef(*it);
    }
}
|
||||
|
||||
template<typename Sentence>
|
||||
static inline python_string default_process_string(Sentence&& str)
|
||||
{
|
||||
return rutils::default_process(std::forward<Sentence>(str));
|
||||
}
|
||||
|
||||
template <typename MatchingFunc>
|
||||
static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds)
|
||||
static inline PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds)
|
||||
{
|
||||
PyObject* py_s1;
|
||||
PyObject* py_s2;
|
||||
|
@ -50,10 +63,6 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
|
|||
return PyFloat_FromDouble(0);
|
||||
}
|
||||
|
||||
if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (non_default_process(processor)) {
|
||||
PyObject* proc_s1 = PyObject_CallFunctionObjArgs(processor, py_s2, NULL);
|
||||
if (proc_s1 == NULL) {
|
||||
|
@ -66,8 +75,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
|
|||
return NULL;
|
||||
}
|
||||
|
||||
auto s1_view = decode_python_string(proc_s1);
|
||||
auto s2_view = decode_python_string(proc_s2);
|
||||
if (!valid_str(proc_s1, "s1") || !valid_str(proc_s2, "s2")) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto s1_view = decode_python_string_view(proc_s1);
|
||||
auto s2_view = decode_python_string_view(proc_s2);
|
||||
|
||||
double result = mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
|
@ -81,8 +94,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
|
|||
return PyFloat_FromDouble(result);
|
||||
}
|
||||
|
||||
auto s1_view = decode_python_string(py_s1);
|
||||
auto s2_view = decode_python_string(py_s2);
|
||||
if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto s1_view = decode_python_string_view(py_s1);
|
||||
auto s2_view = decode_python_string_view(py_s2);
|
||||
|
||||
double result;
|
||||
if (use_preprocessing(processor, processor_default)) {
|
||||
|
@ -118,7 +135,24 @@ struct name##_func { \
|
|||
static PyObject* name(PyObject* /*self*/, PyObject* args, PyObject* keywds) \
|
||||
{ \
|
||||
return fuzz_call<name##_func>(process_default, args, keywds); \
|
||||
}
|
||||
}
|
||||
|
||||
struct CachedFuzz {
|
||||
virtual void str1_set(python_string str) {
|
||||
m_str1 = std::move(str);
|
||||
}
|
||||
|
||||
virtual void str2_set(python_string str) {
|
||||
m_str2 = std::move(str);
|
||||
}
|
||||
|
||||
virtual double call(double score_cutoff) = 0;
|
||||
|
||||
protected:
|
||||
python_string m_str1;
|
||||
python_string m_str2;
|
||||
};
|
||||
|
||||
|
||||
FUZZ_FUNC(
|
||||
ratio, false,
|
||||
|
@ -140,6 +174,17 @@ FUZZ_FUNC(
|
|||
" 96.55171966552734"
|
||||
)
|
||||
|
||||
struct CachedRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
FUZZ_FUNC(
|
||||
partial_ratio, false,
|
||||
"partial_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -160,6 +205,15 @@ FUZZ_FUNC(
|
|||
" 100"
|
||||
)
|
||||
|
||||
struct CachedPartialRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::partial_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
token_sort_ratio, true,
|
||||
|
@ -182,6 +236,26 @@ FUZZ_FUNC(
|
|||
" 100.0"
|
||||
)
|
||||
|
||||
struct CachedTokenSortRatio : public CachedFuzz {
|
||||
void str1_set(python_string str) override {
|
||||
m_str1 = mpark::visit(
|
||||
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
|
||||
}
|
||||
|
||||
virtual void str2_set(python_string str) override {
|
||||
m_str2 = mpark::visit(
|
||||
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
|
||||
}
|
||||
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
partial_token_sort_ratio, true,
|
||||
"partial_token_sort_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -200,6 +274,26 @@ FUZZ_FUNC(
|
|||
" float: ratio between s1 and s2 as a float between 0 and 100"
|
||||
)
|
||||
|
||||
struct CachedPartialTokenSortRatio : public CachedFuzz {
|
||||
void str1_set(python_string str) override {
|
||||
m_str1 = mpark::visit(
|
||||
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
|
||||
}
|
||||
|
||||
virtual void str2_set(python_string str) override {
|
||||
m_str2 = mpark::visit(
|
||||
[](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
|
||||
}
|
||||
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::partial_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
token_set_ratio, true,
|
||||
"token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -224,6 +318,16 @@ FUZZ_FUNC(
|
|||
" 100.0"
|
||||
)
|
||||
|
||||
struct CachedTokenSetRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::token_set_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
partial_token_set_ratio, true,
|
||||
"partial_token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -243,6 +347,16 @@ FUZZ_FUNC(
|
|||
" float: ratio between s1 and s2 as a float between 0 and 100"
|
||||
)
|
||||
|
||||
struct CachedPartialTokenSetRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::partial_token_set_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
token_ratio, true,
|
||||
"token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -262,6 +376,16 @@ FUZZ_FUNC(
|
|||
" float: ratio between s1 and s2 as a float between 0 and 100"
|
||||
)
|
||||
|
||||
struct CachedTokenRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::token_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
partial_token_ratio, true,
|
||||
"partial_token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -282,6 +406,16 @@ FUZZ_FUNC(
|
|||
" float: ratio between s1 and s2 as a float between 0 and 100"
|
||||
)
|
||||
|
||||
struct CachedPartialTokenRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::partial_token_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
WRatio, true,
|
||||
"WRatio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -300,6 +434,16 @@ FUZZ_FUNC(
|
|||
" float: ratio between s1 and s2 as a float between 0 and 100"
|
||||
)
|
||||
|
||||
struct CachedWRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::WRatio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
QRatio, true,
|
||||
"QRatio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -321,6 +465,16 @@ FUZZ_FUNC(
|
|||
" 96.55171966552734"
|
||||
)
|
||||
|
||||
struct CachedQRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::QRatio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
FUZZ_FUNC(
|
||||
quick_lev_ratio, true,
|
||||
"quick_lev_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
|
||||
|
@ -343,7 +497,15 @@ FUZZ_FUNC(
|
|||
" float: ratio between s1 and s2 as a float between 0 and 100"
|
||||
)
|
||||
|
||||
|
||||
struct CachedQuickLevRatio : public CachedFuzz {
|
||||
double call(double score_cutoff) override {
|
||||
return mpark::visit(
|
||||
[score_cutoff](auto&& val1, auto&& val2) {
|
||||
return rfuzz::quick_lev_ratio(val1, val2, score_cutoff);
|
||||
},
|
||||
m_str1, m_str2);
|
||||
}
|
||||
};
|
||||
|
||||
constexpr const char* default_process_docstring = R"()";
|
||||
|
||||
|
@ -360,13 +522,391 @@ static PyObject* default_process(PyObject* /*self*/, PyObject* args, PyObject* k
|
|||
return NULL;
|
||||
}
|
||||
|
||||
auto sentence_view = decode_python_string(py_sentence);
|
||||
PyObject* processed = mpark::visit(
|
||||
[](auto&& val1) {
|
||||
return encode_python_string(rutils::default_process(val1));},
|
||||
sentence_view);
|
||||
/* this is pretty verbose. However it is faster than std::variant + std::visit */
|
||||
#ifdef PYTHON_2
|
||||
if (PyObject_TypeCheck(py_sentence, &PyString_Type)) {
|
||||
Py_ssize_t len = PyString_GET_SIZE(py_sentence);
|
||||
char* str = PyString_AS_STRING(py_sentence);
|
||||
|
||||
auto proc_str = rutils::default_process(rapidfuzz::basic_string_view<char>(str, len));
|
||||
return PyString_FromStringAndSize(proc_str.data(), proc_str.size());
|
||||
}
|
||||
else {
|
||||
Py_ssize_t len = PyUnicode_GET_SIZE(py_sentence);
|
||||
const Py_UNICODE* str = PyUnicode_AS_UNICODE(py_sentence);
|
||||
|
||||
auto proc_str = rutils::default_process(rapidfuzz::basic_string_view<Py_UNICODE>(str, len));
|
||||
return PyUnicode_FromUnicode(proc_str.data(), proc_str.size());
|
||||
}
|
||||
#else /* Python 3 */
|
||||
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(py_sentence);
|
||||
void* str = PyUnicode_DATA(py_sentence);
|
||||
|
||||
switch (PyUnicode_KIND(py_sentence)) {
|
||||
case PyUnicode_1BYTE_KIND:
|
||||
{
|
||||
auto proc_str = rutils::default_process(
|
||||
rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(str), len));
|
||||
return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, proc_str.data(), proc_str.size());
|
||||
}
|
||||
case PyUnicode_2BYTE_KIND:
|
||||
{
|
||||
auto proc_str = rutils::default_process(
|
||||
rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(str), len));
|
||||
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, proc_str.data(), proc_str.size());
|
||||
}
|
||||
default:
|
||||
{
|
||||
auto proc_str = rutils::default_process(
|
||||
rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(str), len));
|
||||
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, proc_str.data(), proc_str.size());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Validates `py_str` and converts it into the string the scorers work on.
 *
 * py_str             object to convert
 * name               argument name handed to valid_str for its error reporting
 * processor          optional processor passed by the caller (may be NULL)
 * processor_default  forwarded to use_preprocessing together with `processor`
 * proc_str           output: the decoded (and possibly preprocessed) string
 * owner_list         output: receives the object returned by a custom
 *                    processor; `proc_str` may reference that object's buffer,
 *                    so the caller has to keep it alive while `proc_str` is
 *                    used and release it afterwards (see free_owner_list)
 *
 * Returns false when the processor call failed or a value was rejected by
 * valid_str. Nothing is added to `owner_list` in that case.
 */
static inline bool process_string(
    PyObject* py_str, const char* name,
    PyObject* processor, bool processor_default,
    python_string& proc_str, std::vector<PyObject*>& owner_list)
{
    if (non_default_process(processor)) {
        PyObject* proc_py_str = PyObject_CallFunctionObjArgs(processor, py_str, NULL);
        if (proc_py_str == NULL) {
            return false;
        }

        if (!valid_str(proc_py_str, name)) {
            /* the processor returned a new reference - release it instead of
             * leaking it when the value is rejected */
            Py_DecRef(proc_py_str);
            return false;
        }

        owner_list.push_back(proc_py_str);
        proc_str = decode_python_string(proc_py_str);
        return true;
    }

    if (!valid_str(py_str, name)) {
        return false;
    }

    if (use_preprocessing(processor, processor_default)) {
        proc_str = mpark::visit(
            [](auto&& val1) { return default_process_string(val1); },
            decode_python_string(py_str));
    } else {
        proc_str = decode_python_string(py_str);
    }

    return true;
}
|
||||
|
||||
|
||||
|
||||
/*
 * Maps the `scorer` argument of extractOne to a C++ scorer object.
 *
 * - scorer == NULL (argument not passed): fuzz.WRatio is used
 * - scorer is one of the C functions of this module: the matching Cached*
 *   object is returned, so scoring does not go through a Python call
 * - anything else: nullptr, the caller has to call `scorer` as a Python object
 */
std::unique_ptr<CachedFuzz> get_matching_instance(PyObject* scorer)
{
    /* default is fuzz.WRatio */
    if (!scorer) {
        return std::make_unique<CachedWRatio>();
    }

    /* call python function */
    if (!PyCFunction_Check(scorer)) {
        return nullptr;
    }

    const auto scorer_func = PyCFunction_GetFunction(scorer);

    if (scorer_func == PY_FUNC_CAST(ratio)) {
        return std::make_unique<CachedRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(partial_ratio)) {
        return std::make_unique<CachedPartialRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(token_sort_ratio)) {
        return std::make_unique<CachedTokenSortRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(token_set_ratio)) {
        return std::make_unique<CachedTokenSetRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(partial_token_sort_ratio)) {
        return std::make_unique<CachedPartialTokenSortRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(partial_token_set_ratio)) {
        return std::make_unique<CachedPartialTokenSetRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(token_ratio)) {
        return std::make_unique<CachedTokenRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(partial_token_ratio)) {
        return std::make_unique<CachedPartialTokenRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(WRatio)) {
        return std::make_unique<CachedWRatio>();
    }
    if (scorer_func == PY_FUNC_CAST(QRatio)) {
        return std::make_unique<CachedQRatio>();
    }
    /* CachedQuickLevRatio exists for this scorer but was never selected, which
     * sent quick_lev_ratio through the generic Python-call path */
    if (scorer_func == PY_FUNC_CAST(quick_lev_ratio)) {
        return std::make_unique<CachedQuickLevRatio>();
    }

    /* unknown C function: call it like any other python callable */
    return nullptr;
}
|
||||
|
||||
|
||||
/*
 * extractOne implementation for scorers that have to be called as Python
 * objects (everything get_matching_instance does not know).
 *
 * py_query      query object, processed with process_string before scoring
 * py_choices    sequence of choices, or an object with an `items` method
 *               yielding (key, choice) pairs
 * scorer        callable invoked as scorer(query, choice, score_cutoff)
 * processor     optional processor forwarded to process_string
 * score_cutoff  minimum score a choice needs to be returned
 *
 * Returns a new reference to (choice, score) - or (choice, score, key) for
 * dict like choices - for the best scoring choice, None when no choice reached
 * score_cutoff, or NULL with a Python exception set on failure.
 */
static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
    PyObject* scorer, PyObject* processor, double score_cutoff)
{
    bool match_found = false;
    bool is_dict = false;
    /* Owned references to the best match so far. The items of `choices` are
     * only borrowed, and the sequence can be a temporary list that is released
     * before the result is built, so the best match keeps its own reference. */
    PyObject* result_choice = NULL;
    PyObject* choice_key = NULL;
    /* objects that have to stay alive until the search is finished */
    std::vector<PyObject*> outer_owner_list;

    /* holds the cutoff first and afterwards the best score found so far */
    PyObject* py_score_cutoff = PyFloat_FromDouble(score_cutoff);
    if (!py_score_cutoff) {
        return NULL;
    }

    /* single place that releases everything owned by this function */
    const auto fail = [&]() -> PyObject* {
        Py_DecRef(result_choice);
        Py_DecRef(choice_key);
        Py_DecRef(py_score_cutoff);
        free_owner_list(outer_owner_list);
        return NULL;
    };

    python_string query;
    if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) {
        return fail();
    }

    /* the scorer is a python callable, so it needs the processed query as object */
    py_query = mpark::visit(
        [](auto&& val) { return encode_python_string(val); },
        query);
    if (!py_query) {
        return fail();
    }
    outer_owner_list.push_back(py_query);

    /* dict like container */
    if (PyObject_HasAttrString(py_choices, "items")) {
        is_dict = true;
        py_choices = PyObject_CallMethod(py_choices, "items", NULL);
        if (!py_choices) {
            return fail();
        }
        outer_owner_list.push_back(py_choices);
    }

    PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
    if (!choices) {
        return fail();
    }
    outer_owner_list.push_back(choices);

    const std::size_t choice_count = static_cast<std::size_t>(PySequence_Fast_GET_SIZE(choices));

    for (std::size_t i = 0; i < choice_count; ++i) {
        PyObject* py_choice = NULL;
        PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, static_cast<Py_ssize_t>(i));

        if (is_dict) {
            if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice)) {
                return fail();
            }
        }

        if (py_match_choice == Py_None) {
            continue;
        }

        /* objects that only have to live for this iteration */
        std::vector<PyObject*> inner_owner_list;
        python_string choice;

        if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) {
            free_owner_list(inner_owner_list);
            return fail();
        }

        PyObject* py_proc_choice = mpark::visit(
            [](auto&& val) { return encode_python_string(val); },
            choice);
        if (!py_proc_choice) {
            free_owner_list(inner_owner_list);
            return fail();
        }
        inner_owner_list.push_back(py_proc_choice);

        PyObject* score = PyObject_CallFunction(scorer, "OOO",
            py_query, py_proc_choice, py_score_cutoff);
        if (!score) {
            free_owner_list(inner_owner_list);
            return fail();
        }

        const int comp = PyObject_RichCompareBool(score, py_score_cutoff, Py_GE);
        if (comp == 1) {
            /* new best match: the score becomes the cutoff for the next choices */
            Py_DecRef(py_score_cutoff);
            py_score_cutoff = score;
            match_found = true;

            Py_IncRef(py_match_choice);
            Py_DecRef(result_choice);
            result_choice = py_match_choice;

            /* py_choice is NULL for plain sequences; Py_IncRef/Py_DecRef accept NULL */
            Py_IncRef(py_choice);
            Py_DecRef(choice_key);
            choice_key = py_choice;
        } else {
            Py_DecRef(score);
            if (comp == -1) {
                /* the comparison raised an exception */
                free_owner_list(inner_owner_list);
                return fail();
            }
        }

        free_owner_list(inner_owner_list);
    }

    free_owner_list(outer_owner_list);

    if (!match_found) {
        Py_DecRef(py_score_cutoff);
        Py_RETURN_NONE;
    }

    /* "O" adds its own reference, so the local references are released afterwards */
    PyObject* result = is_dict
        ? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key)
        : Py_BuildValue("(OO)", result_choice, py_score_cutoff);

    Py_DecRef(result_choice);
    Py_DecRef(choice_key);
    Py_DecRef(py_score_cutoff);
    return result;
}
|
||||
|
||||
|
||||
/* Docstring of process.extractOne. The first line followed by "--" is the
 * text signature of the function. */
constexpr const char* extractOne_docstring =
    "extractOne($module, query, choices, scorer = 'fuzz.WRatio', processor = 'utils.default_process', score_cutoff = 0)\n"
    "--\n\n"
    "Find the best match in a list of choices\n\n"
    "Args:\n"
    " query (str): string we want to find\n"
    " choices (Iterable): list of all strings the query should be compared with or dict with a mapping\n"
    " {<result>: <string to compare>}\n"
    " scorer (Callable): optional callable that is used to calculate the matching score between\n"
    " the query and each choice. WRatio is used by default\n"
    " processor (Callable): optional callable that reformats the strings. utils.default_process\n"
    " is used by default, which lowercases the strings and trims whitespace\n"
    " score_cutoff (float): Optional argument for a score threshold. Matches with\n"
    " a lower score than this number will not be returned. Defaults to 0\n\n"
    "Returns:\n"
    " Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match\n"
    " in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n"
    " be in the form `(<choice>, <ratio>)` when `choices` is a list of strings\n"
    " or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.";
|
||||
|
||||
/*
 * process.extractOne(query, choices, scorer, processor, score_cutoff)
 *
 * Finds the choice with the highest score. Scorers known to
 * get_matching_instance are evaluated directly in C++, every other scorer is
 * handled by py_extractOne.
 *
 * Returns a new reference to (choice, score) - or (choice, score, key) for
 * dict like choices -, None when the query is None or no choice reached
 * score_cutoff, or NULL with a Python exception set on failure.
 */
static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
{
    PyObject* py_query;
    PyObject* py_choices;
    PyObject* processor = NULL;
    PyObject* py_scorer = NULL;
    double score_cutoff = 0;
    static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast<char**>(kwlist), &py_query,
                                     &py_choices, &py_scorer, &processor, &score_cutoff))
    {
        return NULL;
    }

    /* The documented result is a tuple or None. The previous float 0 matched
     * neither, so a None query now reports "no match". */
    if (py_query == Py_None) {
        Py_RETURN_NONE;
    }

    auto scorer = get_matching_instance(py_scorer);
    if (!scorer) {
        // todo this is mostly code duplication
        return py_extractOne(py_query, py_choices, py_scorer, processor, score_cutoff);
    }

    bool is_dict = false;
    double result_score = 0;
    /* Owned references to the best match so far. The items of `choices` are
     * only borrowed, and the sequence can be a temporary list that is released
     * before the result is built, so the best match keeps its own reference.
     * result_choice doubles as the "match found" flag. */
    PyObject* result_choice = NULL;
    PyObject* choice_key = NULL;
    /* objects that have to stay alive until the search is finished */
    std::vector<PyObject*> outer_owner_list;

    /* single place that releases everything owned by this function */
    const auto fail = [&]() -> PyObject* {
        Py_DecRef(result_choice);
        Py_DecRef(choice_key);
        free_owner_list(outer_owner_list);
        return NULL;
    };

    python_string query;
    if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) {
        return fail();
    }
    scorer->str1_set(std::move(query));

    /* dict like container */
    if (PyObject_HasAttrString(py_choices, "items")) {
        is_dict = true;
        py_choices = PyObject_CallMethod(py_choices, "items", NULL);
        if (!py_choices) {
            return fail();
        }
        outer_owner_list.push_back(py_choices);
    }

    PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
    if (!choices) {
        return fail();
    }
    outer_owner_list.push_back(choices);

    const std::size_t choice_count = static_cast<std::size_t>(PySequence_Fast_GET_SIZE(choices));

    for (std::size_t i = 0; i < choice_count; ++i) {
        PyObject* py_choice = NULL;
        PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, static_cast<Py_ssize_t>(i));

        if (is_dict) {
            if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice)) {
                return fail();
            }
        }

        if (py_match_choice == Py_None) {
            continue;
        }

        /* keeps the processor result alive while `choice` may reference it */
        std::vector<PyObject*> inner_owner_list;
        python_string choice;

        if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) {
            free_owner_list(inner_owner_list);
            return fail();
        }

        scorer->str2_set(std::move(choice));
        const double score = scorer->call(score_cutoff);

        if (score >= score_cutoff) {
            // increase the value by a small step so it might be able to exit early
            score_cutoff = score + 0.00001f;
            result_score = score;

            Py_IncRef(py_match_choice);
            Py_DecRef(result_choice);
            result_choice = py_match_choice;

            /* py_choice is NULL for plain sequences; Py_IncRef/Py_DecRef accept NULL */
            Py_IncRef(py_choice);
            Py_DecRef(choice_key);
            choice_key = py_choice;
        }

        free_owner_list(inner_owner_list);
    }

    free_owner_list(outer_owner_list);

    if (!result_choice) {
        Py_RETURN_NONE;
    }

    /* "O" adds its own reference, so the local references are released afterwards */
    PyObject* result = is_dict
        ? Py_BuildValue("(OdO)", result_choice, result_score, choice_key)
        : Py_BuildValue("(Od)", result_choice, result_score);

    Py_DecRef(result_choice);
    Py_DecRef(choice_key);
    return result;
}
|
||||
|
||||
static PyMethodDef methods[] = {
|
||||
|
@ -386,6 +926,7 @@ static PyMethodDef methods[] = {
|
|||
PY_METHOD(QRatio),
|
||||
PY_METHOD(quick_lev_ratio),
|
||||
/* process */
|
||||
PY_METHOD(extractOne),
|
||||
/* sentinel */
|
||||
{NULL, NULL, 0, NULL}
|
||||
};
|
||||
|
|
|
@ -1,106 +0,0 @@
|
|||
#include "fuzz.hpp"
|
||||
#include "py_utils.hpp"
|
||||
#include "utils.hpp"
|
||||
#include <string>
|
||||
|
||||
namespace rfuzz = rapidfuzz::fuzz;
|
||||
namespace utils = rapidfuzz::utils;
|
||||
|
||||
/**
 * Find the choice that scores best against `query` using WRatio.
 *
 * Python signature: extractOne(query, choices, scorer=None, processor=None, score_cutoff=0)
 *
 * `scorer` and `processor` are parsed so that the keyword interface stays stable,
 * but this implementation does not apply them yet: every comparison uses
 * rfuzz::WRatio on the unprocessed strings.
 *
 * Returns a new reference to a `(choice, score)` tuple, `None` when the query is
 * `None` or no choice reaches `score_cutoff`, or NULL after a failed argument
 * parse / string validation (valid_str is presumed to set the Python exception
 * -- TODO confirm against its definition).
 */
PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
{
    PyObject* py_query;
    PyObject* py_choices;
    PyObject* processor = NULL;
    PyObject* scorer = NULL;
    double score_cutoff = 0;
    static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast<char**>(kwlist), &py_query,
                                     &py_choices, &scorer, &processor, &score_cutoff))
    {
        return NULL;
    }

    // Accepted for interface compatibility only; see the function comment.
    (void)scorer;
    (void)processor;

    // A missing query cannot match anything: report "no match" the same way the
    // pure-Python implementation does instead of returning a bare float.
    if (py_query == Py_None) {
        Py_RETURN_NONE;
    }

    if (!valid_str(py_query, "query")) {
        return NULL;
    }

    PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
    if (!choices) {
        return NULL;
    }

    const Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);

    bool match_found = false;
    PyObject* result_choice = NULL; // borrowed from `choices`, valid until it is released
    double result_score = 0;

    auto query_view = decode_python_string(py_query);

    for (Py_ssize_t i = 0; i < choice_count; ++i) {
        PyObject* py_choice = PySequence_Fast_GET_ITEM(choices, i);

        if (py_choice == Py_None) {
            continue;
        }

        if (!valid_str(py_choice, "choice")) {
            Py_DECREF(choices);
            return NULL;
        }

        auto choice_view = decode_python_string(py_choice);

        double score = mpark::visit(
            [score_cutoff](auto&& val1, auto&& val2) {
                return rfuzz::WRatio(val1, val2, score_cutoff);
            },
            query_view, choice_view);

        if (score >= score_cutoff) {
            match_found = true;
            result_choice = py_choice;
            result_score = score;

            // Raise the bar by a small step so that, among equally good choices,
            // the first one wins and later comparisons may exit early.
            score_cutoff = score + 0.00001;
            if (score_cutoff > 100) {
                // Nothing can beat a perfect score (scores are capped at 100).
                break;
            }
        }
    }

    // Build the result while `choices` still keeps `result_choice` alive;
    // the "O" format takes its own reference.
    PyObject* result;
    if (match_found) {
        result = Py_BuildValue("(Od)", result_choice, result_score);
    }
    else {
        Py_INCREF(Py_None);
        result = Py_None;
    }

    Py_DECREF(choices);
    return result;
}
|
|
@ -1,21 +1,26 @@
|
|||
/* SPDX-License-Identifier: MIT */
|
||||
/* Copyright © 2020 Max Bachmann */
|
||||
|
||||
#pragma once
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include <vector>
|
||||
#include "utils.hpp"
|
||||
|
||||
#define PY_FUNC_CAST(func) ((PyCFunction)(void (*)(void))func)
|
||||
|
||||
/* Packs a (major, minor, micro) triple into one integer so it can be compared
 * against PY_VERSION_HEX. Every argument is parenthesised so that expression
 * arguments (e.g. `a | b`) are shifted as a whole.
 */
#define PYTHON_VERSION(major, minor, micro) (((major) << 24) | ((minor) << 16) | ((micro) << 8))
|
||||
|
||||
/* The cast of the function is necessary since PyCFunction values
 * only take two PyObject* parameters, and these functions take three.
 * PY_FUNC_CAST performs that cast.
 *
 * Expands to one PyMethodDef initializer for the keyword-accepting function `x`;
 * a symbol named `x_docstring` must be visible where the macro is used.
 */
#define PY_METHOD(x)                                                                               \
    { #x, PY_FUNC_CAST(x), METH_VARARGS | METH_KEYWORDS, x##_docstring }
|
||||
|
||||
#if PY_MAJOR_VERSION == 2
|
||||
#if PY_VERSION_HEX < PYTHON_VERSION(3,0,0)
|
||||
#define PYTHON_2
|
||||
#include "py2_utils.hpp"
|
||||
#else
|
||||
#define PYTHON_3
|
||||
#include "py3_utils.hpp"
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit aa743d18e39a1b19f83fb745e580ab311487b727
|
||||
Subproject commit 0cbbee61bd9a2401e45c96a3d3d6ab640317ccce
|
|
@ -3,6 +3,6 @@ rapid string matching library
|
|||
"""
|
||||
__author__ = "Max Bachmann"
|
||||
__license__ = "MIT"
|
||||
__version__ = "0.12.5"
|
||||
__version__ = "0.13.0"
|
||||
|
||||
from rapidfuzz import process, fuzz, levenshtein, utils
|
||||
from rapidfuzz import process, fuzz, utils# levenshtein
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
# Copyright © 2011 Adam Cohen
|
||||
|
||||
from rapidfuzz import fuzz, utils
|
||||
from rapidfuzz.cpp_impl import extractOne
|
||||
import heapq
|
||||
import numbers
|
||||
|
||||
|
@ -117,86 +118,3 @@ def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.defau
|
|||
|
||||
def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
    """
    Thin alias for `extract`: forwards every argument unchanged, in the same
    order, and returns whatever `extract` returns.
    """
    best_matches = extract(query, choices, scorer, processor, limit, score_cutoff)
    return best_matches
|
||||
|
||||
|
||||
def extractOne(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0):
    """
    Find the best match in a list of choices

    Args:
        query (str): string we want to find
        choices (Iterable): list of all strings the query should be compared with or dict with a mapping
            {<result>: <string to compare>}
        scorer (Callable): optional callable that is used to calculate the matching score between
            the query and each choice. WRatio is used by default
        processor (Callable): optional callable that reformats the strings. utils.default_process
            is used by default, which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold. Matches with
            a lower score than this number will not be returned. Defaults to 0

    Returns:
        Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match
            in form of a tuple or None when there is no match with a score >= score_cutoff.
            The tuple will be in the form `(<choice>, <ratio>)` when `choices` is a list of strings
            or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
    """
    if query is None:
        return None

    a = processor(query) if processor else query

    # Anything exposing `.items()` is treated as a mapping {key: string to compare};
    # plain iterables are walked with a dummy key so both shapes share one loop.
    is_mapping = hasattr(choices, "items")
    if is_mapping:
        candidates = choices.items()
    else:
        candidates = ((None, choice) for choice in choices)

    result_score = None
    result_choice = ""
    choice_key = ""

    for key, match_choice in candidates:
        if match_choice is None:
            continue
        b = processor(match_choice) if processor else match_choice

        score = scorer(
            a, b,
            processor=None,
            score_cutoff=score_cutoff)

        if score >= score_cutoff:
            result_score = score
            result_choice = match_choice
            choice_key = key

            # very small increment for the score_cutoff, so when multiple
            # elements have the same score the first one is used
            # only done when the score is a number
            if isinstance(score, numbers.Number):
                score_cutoff = score + 0.00001
                if score_cutoff > 100:
                    # a perfect score cannot be beaten, stop searching
                    break
            else:
                score_cutoff = score

    if result_score is None:
        return None

    if is_mapping:
        return (result_choice, result_score, choice_key)
    return (result_choice, result_score)
|
||||
|
|
|
@ -5,6 +5,19 @@ import unittest
|
|||
|
||||
from rapidfuzz import process, fuzz, utils
|
||||
|
||||
scorers = [
|
||||
fuzz.ratio,
|
||||
fuzz.partial_ratio,
|
||||
fuzz.token_sort_ratio,
|
||||
fuzz.token_set_ratio,
|
||||
fuzz.token_ratio,
|
||||
fuzz.partial_token_sort_ratio,
|
||||
fuzz.partial_token_set_ratio,
|
||||
fuzz.partial_token_ratio,
|
||||
fuzz.WRatio,
|
||||
fuzz.QRatio
|
||||
]
|
||||
|
||||
class RatioTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.s1 = "new york mets"
|
||||
|
@ -87,5 +100,27 @@ class RatioTest(unittest.TestCase):
|
|||
score = fuzz.QRatio(s1, s2)
|
||||
self.assertEqual(0, score)
|
||||
|
||||
def testWithProcessor(self):
    """
    Every scorer must cope with non-string inputs provided the processor
    turns them into strings: here each event is a list and the processor
    picks its first element, which is identical for both events.
    """
    def first_field(event):
        return event[0]

    left_event = ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"]
    right_event = ["chicago cubs vs new york mets", "CitiFields", "2012-05-11", "9pm"]

    for score_fn in scorers:
        result = score_fn(left_event, right_event, processor=first_field)
        self.assertEqual(result, 100)
|
||||
|
||||
def testHelp(self):
    """
    The scorers are implemented in C++ as well, so make sure the help text
    of each one can be printed without raising an exception.
    """
    for score_fn in scorers:
        help(score_fn)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -0,0 +1,138 @@
|
|||
from itertools import product
|
||||
from functools import partial
|
||||
from string import ascii_letters, digits, punctuation
|
||||
|
||||
from hypothesis import given, assume, settings
|
||||
import hypothesis.strategies as st
|
||||
import pytest
|
||||
|
||||
from rapidfuzz import fuzz, process, utils
|
||||
import random
|
||||
|
||||
|
||||
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
|
||||
|
||||
SCORERS = [
|
||||
fuzz.ratio,
|
||||
fuzz.partial_ratio,
|
||||
fuzz.token_set_ratio,
|
||||
fuzz.token_sort_ratio,
|
||||
fuzz.token_ratio,
|
||||
fuzz.partial_token_set_ratio,
|
||||
fuzz.partial_token_sort_ratio,
|
||||
fuzz.partial_token_ratio,
|
||||
fuzz.WRatio,
|
||||
fuzz.QRatio
|
||||
]
|
||||
|
||||
FULL_SCORERS = [
|
||||
fuzz.ratio,
|
||||
fuzz.WRatio,
|
||||
fuzz.QRatio
|
||||
]
|
||||
|
||||
PROCESSORS = [
|
||||
lambda x: x,
|
||||
utils.default_process
|
||||
]
|
||||
|
||||
@given(sentence=st.text())
@settings(max_examples=200)
def test_multiple_processor_runs(sentence):
    """
    The default preprocessor must be idempotent: applying it to its own
    output yields the same string as applying it once.
    """
    processed_once = utils.default_process(sentence)
    processed_twice = utils.default_process(utils.default_process(sentence))
    assert processed_once == processed_twice
|
||||
|
||||
'''
|
||||
|
||||
def full_scorers_processors():
|
||||
"""
|
||||
Generate a list of (scorer, processor) pairs for testing for scorers that use the full string only
|
||||
:return: [(scorer, processor), ...]
|
||||
"""
|
||||
scorers = [fuzz.ratio]
|
||||
processors = [lambda x: x,
|
||||
partial(utils.full_process, force_ascii=False),
|
||||
partial(utils.full_process, force_ascii=True)]
|
||||
splist = list(product(scorers, processors))
|
||||
splist.extend(
|
||||
[(fuzz.WRatio, partial(utils.full_process, force_ascii=True)),
|
||||
(fuzz.QRatio, partial(utils.full_process, force_ascii=True)),
|
||||
(fuzz.UWRatio, partial(utils.full_process, force_ascii=False)),
|
||||
(fuzz.UQRatio, partial(utils.full_process, force_ascii=False))]
|
||||
)
|
||||
|
||||
return splist
|
||||
|
||||
|
||||
@pytest.mark.parametrize('scorer,processor',
|
||||
scorers_processors())
|
||||
@given(data=st.data())
|
||||
@settings(max_examples=20, deadline=5000)
|
||||
def test_identical_strings_extracted(scorer, processor, data):
|
||||
"""
|
||||
Test that identical strings will always return a perfect match.
|
||||
:param scorer:
|
||||
:param processor:
|
||||
:param data:
|
||||
:return:
|
||||
"""
|
||||
# Draw a list of random strings
|
||||
strings = data.draw(
|
||||
st.lists(
|
||||
st.text(min_size=10, max_size=100, alphabet=HYPOTHESIS_ALPHABET),
|
||||
min_size=1,
|
||||
max_size=10
|
||||
)
|
||||
)
|
||||
# Draw a random integer for the index in that list
|
||||
choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))
|
||||
|
||||
# Extract our choice from the list
|
||||
choice = strings[choiceidx]
|
||||
|
||||
# Check process doesn't make our choice the empty string
|
||||
assume(processor(choice) != '')
|
||||
|
||||
# Extract all perfect matches
|
||||
result = process.extractBests(choice,
|
||||
strings,
|
||||
scorer=scorer,
|
||||
processor=processor,
|
||||
score_cutoff=100,
|
||||
limit=None)
|
||||
|
||||
# Check we get a result
|
||||
assert result != []
|
||||
|
||||
# Check the original is in the list
|
||||
assert (choice, 100) in result
|
||||
'''
|
||||
|
||||
@pytest.mark.parametrize('scorer,processor', list(product(FULL_SCORERS, PROCESSORS)))
@given(choices=st.lists(st.text(), min_size=1))
@settings(max_examples=20, deadline=5000)
def test_only_identical_strings_extracted(scorer, processor, choices):
    """
    Test that only identical (post processing) strings score 100 on the test.
    With a full-string comparison method, two strings that differ after
    processing must never be reported as a perfect (100) match.

    :param scorer: full-string scorer under test
    :param processor: processor applied to the query and to every choice
    :param choices: non-empty list of generated strings; the query is picked from it
    :return:
    """
    query = random.choice(choices)
    # an empty processed query is not a meaningful search term
    assume(processor(query) != '')

    perfect_matches = process.extract(
        query,
        choices,
        scorer=scorer,
        processor=processor,
        score_cutoff=100,
        limit=None)

    # the query itself is one of the choices, so at least one hit is expected
    assert perfect_matches != []

    for hit in perfect_matches:
        assert processor(query) == processor(hit[0])
|
|
@ -14,6 +14,20 @@ class ProcessTest(unittest.TestCase):
|
|||
"braves vs mets",
|
||||
]
|
||||
|
||||
def testExtractOneExceptions(self):
    """extractOne raises TypeError for missing arguments and non-string inputs."""
    with self.assertRaises(TypeError):
        process.extractOne()
    with self.assertRaises(TypeError):
        process.extractOne(1)
    with self.assertRaises(TypeError):
        process.extractOne(1, [])
    with self.assertRaises(TypeError):
        process.extractOne('', [1])
    with self.assertRaises(TypeError):
        process.extractOne('', {1:1})
|
||||
|
||||
def testExtractExceptions(self):
    """extract raises TypeError for missing arguments and non-string inputs."""
    with self.assertRaises(TypeError):
        process.extract()
    with self.assertRaises(TypeError):
        process.extract(1)
    with self.assertRaises(TypeError):
        process.extract(1, [])
    with self.assertRaises(TypeError):
        process.extract('', [1])
    with self.assertRaises(TypeError):
        process.extract('', {1:1})
|
||||
|
||||
def testGetBestChoice1(self):
|
||||
query = "new york mets at atlanta braves"
|
||||
best = process.extractOne(query, self.baseball_strings)
|
||||
|
@ -35,12 +49,16 @@ class ProcessTest(unittest.TestCase):
|
|||
self.assertEqual(best[0], self.baseball_strings[0])
|
||||
|
||||
def testWithProcessor(self):
    """
    extractOne should accept any type as long as it is a string
    after preprocessing
    """
    events = [
        ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
        ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
        ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
    ]
    # The query is itself an event (a list); the processor extracts the
    # string to compare from the query and from every choice alike.
    query = events[0]

    best = process.extractOne(query, events, processor=lambda event: event[0])
    self.assertEqual(best[0], events[0])
|
||||
|
|
Loading…
Reference in New Issue