add distance support to process.*

## Changed
- added processor support to `levenshtein` and `hamming`
- added distance support to extract/extractOne/extract_iter

## Fixes
- incorrect results of `normalized_hamming` and `normalized_levenshtein` when used with `utils.default_process` as processor
This commit is contained in:
Max Bachmann 2021-03-29 19:09:22 +02:00 committed by GitHub
parent 3e1776ccd4
commit 05f907bf2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 14069 additions and 5596 deletions

View File

@ -1 +1 @@
1.3.3
1.4.0

17307
src/cpp_process.cpp vendored

File diff suppressed because one or more lines are too long

View File

@ -1,18 +1,31 @@
#include "cpp_common.hpp"
struct ListMatchElem {
struct ListMatchScorerElem {
double score;
size_t index;
};
struct DictMatchElem {
struct DictMatchScorerElem {
double score;
size_t index;
PyObject* choice;
PyObject* key;
};
struct ExtractComp
/*
 * Pairs a distance value with an index.
 * ExtractDistanceComp orders elements of this shape by ascending distance and
 * breaks ties by ascending index.
 * NOTE(review): presumably `index` is the position of the choice in the input
 * list - confirm against the code that fills this struct.
 */
struct ListMatchDistanceElem {
std::size_t distance;
size_t index;
};
/*
 * Same distance/index pair as ListMatchDistanceElem, plus the two Python
 * objects belonging to the match.
 * NOTE(review): presumably `choice` and `key` are the value and key of a
 * mapping entry; whether these pointers are owned or borrowed references is
 * not visible here - confirm against the code that fills this struct.
 */
struct DictMatchDistanceElem {
std::size_t distance;
size_t index;
PyObject* choice;
PyObject* key;
};
struct ExtractScorerComp
{
template<class T>
bool operator()(T const &a, T const &b) const {
@ -26,13 +39,34 @@ struct ExtractComp
}
};
/*
 * Ordering for distance results: an element with the smaller distance comes
 * first; when the distances are equal the element with the smaller index wins.
 * Works for any type exposing `distance` and `index` members.
 */
struct ExtractDistanceComp
{
    template<class T>
    bool operator()(T const &lhs, T const &rhs) const {
        /* the distance decides whenever the two values differ */
        if (lhs.distance < rhs.distance) return true;
        if (lhs.distance > rhs.distance) return false;

        /* equal distances: fall back to the index */
        return lhs.index < rhs.index;
    }
};
typedef double (*scorer_func) (void* context, PyObject* str, double score_cutoff);
typedef void (*scorer_context_deinit) (void* context);
typedef std::size_t (*distance_func) (void* context, PyObject* str, std::size_t max);
typedef void (*context_deinit) (void* context);
struct scorer_context {
void* context;
scorer_func scorer;
scorer_context_deinit deinit;
context_deinit deinit;
};
/*
 * Type-erased handle for a cached distance computation.
 * get_distance_context() fills all three members: `context` receives a heap
 * allocated CachedDistance object cast to void*, `scorer` one of the
 * cached_distance_func* callbacks and `deinit` the matching cached_deinit
 * instantiation.
 */
struct distance_context {
/* opaque object handed back to `scorer` and `deinit` as their first argument */
void* context;
/* distance callback (the member is named `scorer`, but it has the distance_func signature) */
distance_func scorer;
/* NOTE(review): presumably destroys `context`; cached_deinit's body is not visible here - confirm */
context_deinit deinit;
};
template <typename CachedScorer>
@ -42,7 +76,7 @@ static void cached_deinit(void* context)
}
template<typename CachedScorer>
static inline double cached_func_default_process(
static inline double cached_scorer_func_default_process(
void* context, PyObject* py_str, double score_cutoff)
{
proc_string str = convert_string(py_str, "choice must be a String or None");
@ -68,12 +102,12 @@ static inline double cached_func_default_process(
score_cutoff
);
default:
throw std::logic_error("Reached end of control flow in cached_func_default_process");
throw std::logic_error("Reached end of control flow in cached_scorer_func_default_process");
}
}
template<typename CachedScorer>
static inline double cached_func(void* context, PyObject* py_str, double score_cutoff)
static inline double cached_scorer_func(void* context, PyObject* py_str, double score_cutoff)
{
proc_string str = convert_string(py_str, "choice must be a String or None");
CachedScorer* ratio = (CachedScorer*)context;
@ -95,7 +129,7 @@ static inline double cached_func(void* context, PyObject* py_str, double score_c
score_cutoff
);
default:
throw std::logic_error("Reached end of control flow in cached_func");
throw std::logic_error("Reached end of control flow in cached_scorer_func");
}
}
@ -107,16 +141,16 @@ static inline scorer_context get_scorer_context(const proc_string& str, int def_
context.context = (void*) new CachedScorer<Sentence>(Sentence((CharT*)str.data, str.length), args...);
if (def_process) {
context.scorer = cached_func_default_process<CachedScorer<Sentence>>;
context.scorer = cached_scorer_func_default_process<CachedScorer<Sentence>>;
} else {
context.scorer = cached_func<CachedScorer<Sentence>>;
context.scorer = cached_scorer_func<CachedScorer<Sentence>>;
}
context.deinit = cached_deinit<CachedScorer<Sentence>>;
return context;
}
template<template <typename> class CachedScorer, typename ...Args>
static inline scorer_context cached_init(PyObject* py_str, int def_process, Args... args)
static inline scorer_context cached_scorer_init(PyObject* py_str, int def_process, Args... args)
{
proc_string str = convert_string(py_str, "query must be a String");
@ -128,59 +162,59 @@ static inline scorer_context cached_init(PyObject* py_str, int def_process, Args
case PyUnicode_4BYTE_KIND:
return get_scorer_context<CachedScorer, uint32_t>(str, def_process, args...);
default:
throw std::logic_error("Reached end of control flow in cached_init");
throw std::logic_error("Reached end of control flow in cached_scorer_init");
}
}
/* fuzz */
static scorer_context cached_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedRatio>(py_str, def_process);
}
static scorer_context cached_partial_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedPartialRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedPartialRatio>(py_str, def_process);
}
static scorer_context cached_token_sort_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedTokenSortRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedTokenSortRatio>(py_str, def_process);
}
static scorer_context cached_token_set_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedTokenSetRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedTokenSetRatio>(py_str, def_process);
}
static scorer_context cached_token_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedTokenRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedTokenRatio>(py_str, def_process);
}
static scorer_context cached_partial_token_sort_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedPartialTokenSortRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedPartialTokenSortRatio>(py_str, def_process);
}
static scorer_context cached_partial_token_set_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedPartialTokenSetRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedPartialTokenSetRatio>(py_str, def_process);
}
static scorer_context cached_partial_token_ratio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedPartialTokenRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedPartialTokenRatio>(py_str, def_process);
}
static scorer_context cached_WRatio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedWRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedWRatio>(py_str, def_process);
}
static scorer_context cached_QRatio_init(PyObject* py_str, int def_process)
{
return cached_init<fuzz::CachedQRatio>(py_str, def_process);
return cached_scorer_init<fuzz::CachedQRatio>(py_str, def_process);
}
/* string_metric */
@ -189,11 +223,123 @@ static scorer_context cached_normalized_levenshtein_init(PyObject* py_str, int d
size_t insertion, size_t deletion, size_t substitution)
{
rapidfuzz::LevenshteinWeightTable weights = {insertion, deletion, substitution};
return cached_init<string_metric::CachedNormalizedLevenshtein>(
return cached_scorer_init<string_metric::CachedNormalizedLevenshtein>(
py_str, def_process, weights);
}
static scorer_context cached_normalized_hamming_init(PyObject* py_str, int def_process)
{
return cached_init<string_metric::CachedNormalizedHamming>(py_str, def_process);
return cached_scorer_init<string_metric::CachedNormalizedHamming>(py_str, def_process);
}
/*************************************************
* cached distances
*************************************************/
template<typename CachedDistance>
static inline std::size_t cached_distance_func_default_process(
void* context, PyObject* py_str, std::size_t max)
{
proc_string str = convert_string(py_str, "choice must be a String or None");
CachedDistance* distance = (CachedDistance*)context;
switch(str.kind){
case PyUnicode_1BYTE_KIND:
return distance->distance(
utils::default_process(
rapidfuzz::basic_string_view<uint8_t>((uint8_t*)str.data, str.length)),
max
);
case PyUnicode_2BYTE_KIND:
return distance->distance(
utils::default_process(
rapidfuzz::basic_string_view<uint16_t>((uint16_t*)str.data, str.length)),
max
);
case PyUnicode_4BYTE_KIND:
return distance->distance(
utils::default_process(
rapidfuzz::basic_string_view<uint32_t>((uint32_t*)str.data, str.length)),
max
);
default:
throw std::logic_error("Reached end of control flow in cached_distance_func_default_process");
}
}
template<typename CachedDistance>
static inline std::size_t cached_distance_func(void* context, PyObject* py_str, std::size_t max)
{
proc_string str = convert_string(py_str, "choice must be a String or None");
CachedDistance* distance = (CachedDistance*)context;
switch(str.kind){
case PyUnicode_1BYTE_KIND:
return distance->distance(
rapidfuzz::basic_string_view<uint8_t>((uint8_t*)str.data, str.length),
max
);
case PyUnicode_2BYTE_KIND:
return distance->distance(
rapidfuzz::basic_string_view<uint16_t>((uint16_t*)str.data, str.length),
max
);
case PyUnicode_4BYTE_KIND:
return distance->distance(
rapidfuzz::basic_string_view<uint32_t>((uint32_t*)str.data, str.length),
max
);
default:
throw std::logic_error("Reached end of control flow in cached_distance_func");
}
}
/*
 * Builds the distance_context for a query whose characters have type CharT.
 *
 * A CachedDistance object is allocated with `new` from a string view over
 * `str` plus the extra `args`; the context stores it type-erased together
 * with the callback matching `def_process` and the cached_deinit
 * instantiation for the same type.
 *
 * str:         converted query string
 * def_process: non-zero selects the callback that applies
 *              utils::default_process to every choice
 * args:        additional constructor arguments for CachedDistance
 */
template<template <typename> class CachedDistance, typename CharT, typename ...Args>
static inline distance_context get_distance_context(const proc_string& str, int def_process, Args... args)
{
    using Sentence = rapidfuzz::basic_string_view<CharT>;
    using Cached   = CachedDistance<Sentence>;

    distance_context result;
    result.context = static_cast<void*>(new Cached(Sentence((CharT*)str.data, str.length), args...));

    /* plain callback by default, replaced when preprocessing is requested */
    result.scorer = cached_distance_func<Cached>;
    if (def_process) {
        result.scorer = cached_distance_func_default_process<Cached>;
    }

    result.deinit = cached_deinit<Cached>;
    return result;
}
/*
 * Entry point for creating a distance_context from a Python query object.
 *
 * The query is converted with convert_string() and get_distance_context() is
 * instantiated for the character width that matches the reported `kind`.
 *
 * py_str:      the query
 * def_process: forwarded to get_distance_context()
 * args:        forwarded to get_distance_context()
 *
 * Throws std::logic_error when `kind` is none of the three PyUnicode kinds.
 */
template<template <typename> class CachedDistance, typename ...Args>
static inline distance_context cached_distance_init(PyObject* py_str, int def_process, Args... args)
{
    proc_string query = convert_string(py_str, "query must be a String");

    switch (query.kind) {
    case PyUnicode_1BYTE_KIND:
        return get_distance_context<CachedDistance, uint8_t>(query, def_process, args...);
    case PyUnicode_2BYTE_KIND:
        return get_distance_context<CachedDistance, uint16_t>(query, def_process, args...);
    case PyUnicode_4BYTE_KIND:
        return get_distance_context<CachedDistance, uint32_t>(query, def_process, args...);
    default:
        break;
    }

    throw std::logic_error("Reached end of control flow in cached_distance_init");
}
/* string_metric */
/*
 * Creates the distance_context for string_metric::CachedLevenshtein.
 * The three operation costs are packed into a LevenshteinWeightTable in the
 * order (insertion, deletion, substitution) and passed on as the extra
 * constructor argument.
 */
static distance_context cached_levenshtein_init(PyObject* py_str, int def_process,
    size_t insertion, size_t deletion, size_t substitution)
{
    const rapidfuzz::LevenshteinWeightTable costs = {insertion, deletion, substitution};

    return cached_distance_init<string_metric::CachedLevenshtein>(py_str, def_process, costs);
}
/*
 * Creates the distance_context for string_metric::CachedHamming.
 * No extra constructor arguments are needed besides the query itself.
 */
static distance_context cached_hamming_init(PyObject* py_str, int def_process)
{
    distance_context hamming_context =
        cached_distance_init<string_metric::CachedHamming>(py_str, def_process);
    return hamming_context;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -58,6 +58,73 @@ PyObject* levenshtein_impl(PyObject* s1, PyObject* s2,
return PyLong_FromSize_t(result);
}
/*
 * Second dispatch stage of levenshtein_impl_default_process.
 *
 * CharT is the character type already chosen for s1; this function selects
 * the character type of s2 from its `kind`. Both strings go through
 * utils::default_process before string_metric::levenshtein is called with the
 * weights (insertion, deletion, substitution) and `max`.
 * Any kind other than 1BYTE/2BYTE is handled as 4 byte characters.
 */
template<typename CharT>
size_t levenshtein_impl_inner_default_process(proc_string s1, proc_string s2,
    size_t insertion, size_t deletion, size_t substitution, size_t max)
{
    using FirstView = rapidfuzz::basic_string_view<CharT>;
    using View8     = rapidfuzz::basic_string_view<uint8_t>;
    using View16    = rapidfuzz::basic_string_view<uint16_t>;
    using View32    = rapidfuzz::basic_string_view<uint32_t>;

    switch (s2.kind) {
    case PyUnicode_1BYTE_KIND:
        return string_metric::levenshtein(
            utils::default_process(FirstView((CharT*)s1.data, s1.length)),
            utils::default_process(View8((uint8_t*)s2.data, s2.length)),
            {insertion, deletion, substitution}, max);
    case PyUnicode_2BYTE_KIND:
        return string_metric::levenshtein(
            utils::default_process(FirstView((CharT*)s1.data, s1.length)),
            utils::default_process(View16((uint16_t*)s2.data, s2.length)),
            {insertion, deletion, substitution}, max);
    default:
        return string_metric::levenshtein(
            utils::default_process(FirstView((CharT*)s1.data, s1.length)),
            utils::default_process(View32((uint32_t*)s2.data, s2.length)),
            {insertion, deletion, substitution}, max);
    }
}
/*
 * Weighted Levenshtein distance of s1 and s2 after both strings were run
 * through utils::default_process.
 *
 * s1, s2:     Python strings; converted with convert_string()
 * insertion, deletion, substitution: operation weights
 * max:        forwarded to string_metric::levenshtein
 *
 * Returns a new Python int holding the distance, or -1 when the computed
 * result equals (std::size_t)-1.
 * NOTE(review): (std::size_t)-1 is presumably the "distance exceeds max"
 * marker of string_metric::levenshtein - confirm against the library.
 *
 * Marked `inline` because the Cython module declares this function as coming
 * from the header cpp_string_metric.hpp; a non-inline definition in a header
 * produces multiple-definition errors as soon as the header is included from
 * a second translation unit. The sibling *_inner_default_process helpers of
 * the normalized variants are already `inline`.
 */
inline PyObject* levenshtein_impl_default_process(PyObject* s1, PyObject* s2,
    size_t insertion, size_t deletion, size_t substitution, size_t max)
{
    size_t result = 0;
    proc_string c_s1 = convert_string(s1, "s1 must be a String");
    proc_string c_s2 = convert_string(s2, "s2 must be a String");

    /* first dispatch stage: pick the character type of s1 */
    switch (c_s1.kind) {
    case PyUnicode_1BYTE_KIND:
        result = levenshtein_impl_inner_default_process<uint8_t>(
            c_s1, c_s2, insertion, deletion, substitution, max);
        break;
    case PyUnicode_2BYTE_KIND:
        result = levenshtein_impl_inner_default_process<uint16_t>(
            c_s1, c_s2, insertion, deletion, substitution, max);
        break;
    default:
        result = levenshtein_impl_inner_default_process<uint32_t>(
            c_s1, c_s2, insertion, deletion, substitution, max);
        break;
    }

    if (result == (std::size_t)-1) {
        return PyLong_FromLong(-1);
    }
    return PyLong_FromSize_t(result);
}
/*
* Normalized Levenshtein
@ -119,7 +186,9 @@ inline double normalized_levenshtein_impl_inner_default_process(proc_string s1,
utils::default_process(
rapidfuzz::basic_string_view<CharT>((CharT*)s1.data, s1.length)
),
rapidfuzz::basic_string_view<uint8_t>((uint8_t*)s2.data, s2.length),
utils::default_process(
rapidfuzz::basic_string_view<uint8_t>((uint8_t*)s2.data, s2.length)
),
{insertion, deletion, substitution}, score_cutoff
);
case PyUnicode_2BYTE_KIND:
@ -128,7 +197,7 @@ inline double normalized_levenshtein_impl_inner_default_process(proc_string s1,
rapidfuzz::basic_string_view<CharT>((CharT*)s1.data, s1.length)
),
utils::default_process(
rapidfuzz::basic_string_view<uint16_t>((uint16_t*)s1.data, s1.length)
rapidfuzz::basic_string_view<uint16_t>((uint16_t*)s2.data, s2.length)
),
{insertion, deletion, substitution}, score_cutoff
);
@ -138,7 +207,7 @@ inline double normalized_levenshtein_impl_inner_default_process(proc_string s1,
rapidfuzz::basic_string_view<CharT>((CharT*)s1.data, s1.length)
),
utils::default_process(
rapidfuzz::basic_string_view<uint32_t>((uint32_t*)s1.data, s1.length)
rapidfuzz::basic_string_view<uint32_t>((uint32_t*)s2.data, s2.length)
),
{insertion, deletion, substitution}, score_cutoff
);
@ -217,6 +286,67 @@ PyObject* hamming_impl(PyObject* s1, PyObject* s2, size_t max)
return PyLong_FromSize_t(result);
}
/*
 * Second dispatch stage of hamming_impl_default_process.
 *
 * CharT is the character type already chosen for s1; this function selects
 * the character type of s2 from its `kind`. Both strings go through
 * utils::default_process before string_metric::hamming is called with `max`.
 * Any kind other than 1BYTE/2BYTE is handled as 4 byte characters.
 */
template<typename CharT>
size_t hamming_impl_inner_default_process(proc_string s1, proc_string s2, size_t max)
{
    using FirstView = rapidfuzz::basic_string_view<CharT>;
    using View8     = rapidfuzz::basic_string_view<uint8_t>;
    using View16    = rapidfuzz::basic_string_view<uint16_t>;
    using View32    = rapidfuzz::basic_string_view<uint32_t>;

    switch (s2.kind) {
    case PyUnicode_1BYTE_KIND:
        return string_metric::hamming(
            utils::default_process(FirstView((CharT*)s1.data, s1.length)),
            utils::default_process(View8((uint8_t*)s2.data, s2.length)),
            max);
    case PyUnicode_2BYTE_KIND:
        return string_metric::hamming(
            utils::default_process(FirstView((CharT*)s1.data, s1.length)),
            utils::default_process(View16((uint16_t*)s2.data, s2.length)),
            max);
    default:
        return string_metric::hamming(
            utils::default_process(FirstView((CharT*)s1.data, s1.length)),
            utils::default_process(View32((uint32_t*)s2.data, s2.length)),
            max);
    }
}
/*
 * Hamming distance of s1 and s2 after both strings were run through
 * utils::default_process.
 *
 * s1, s2: Python strings; converted with convert_string()
 * max:    forwarded to string_metric::hamming
 *
 * Returns a new Python int holding the distance, or -1 when the computed
 * result equals (std::size_t)-1.
 * NOTE(review): (std::size_t)-1 is presumably the "distance exceeds max"
 * marker of string_metric::hamming - confirm against the library.
 *
 * Marked `inline` because the Cython module declares this function as coming
 * from the header cpp_string_metric.hpp; a non-inline definition in a header
 * produces multiple-definition errors as soon as the header is included from
 * a second translation unit. The sibling *_inner_default_process helpers of
 * the normalized variants are already `inline`.
 */
inline PyObject* hamming_impl_default_process(PyObject* s1, PyObject* s2, size_t max)
{
    size_t result = 0;
    proc_string c_s1 = convert_string(s1, "s1 must be a String");
    proc_string c_s2 = convert_string(s2, "s2 must be a String");

    /* first dispatch stage: pick the character type of s1 */
    switch (c_s1.kind) {
    case PyUnicode_1BYTE_KIND:
        result = hamming_impl_inner_default_process<uint8_t>(c_s1, c_s2, max);
        break;
    case PyUnicode_2BYTE_KIND:
        result = hamming_impl_inner_default_process<uint16_t>(c_s1, c_s2, max);
        break;
    default:
        result = hamming_impl_inner_default_process<uint32_t>(c_s1, c_s2, max);
        break;
    }

    if (result == (std::size_t)-1) {
        return PyLong_FromLong(-1);
    }
    return PyLong_FromSize_t(result);
}
/*
* Normalized Hamming
@ -272,7 +402,9 @@ inline double normalized_hamming_impl_inner_default_process(
utils::default_process(
rapidfuzz::basic_string_view<CharT>((CharT*)s1.data, s1.length)
),
rapidfuzz::basic_string_view<uint8_t>((uint8_t*)s2.data, s2.length),
utils::default_process(
rapidfuzz::basic_string_view<uint8_t>((uint8_t*)s2.data, s2.length)
),
score_cutoff
);
case PyUnicode_2BYTE_KIND:
@ -281,7 +413,7 @@ inline double normalized_hamming_impl_inner_default_process(
rapidfuzz::basic_string_view<CharT>((CharT*)s1.data, s1.length)
),
utils::default_process(
rapidfuzz::basic_string_view<uint16_t>((uint16_t*)s1.data, s1.length)
rapidfuzz::basic_string_view<uint16_t>((uint16_t*)s2.data, s2.length)
),
score_cutoff
);
@ -291,7 +423,7 @@ inline double normalized_hamming_impl_inner_default_process(
rapidfuzz::basic_string_view<CharT>((CharT*)s1.data, s1.length)
),
utils::default_process(
rapidfuzz::basic_string_view<uint32_t>((uint32_t*)s1.data, s1.length)
rapidfuzz::basic_string_view<uint32_t>((uint32_t*)s2.data, s2.length)
),
score_cutoff
);

View File

@ -6,15 +6,17 @@ from rapidfuzz.utils import default_process
# C++ entry points declared in cpp_string_metric.hpp.
# `except +` makes Cython convert a C++ exception raised by the call into a
# Python exception instead of letting it escape.
# The *_default_process variants apply utils::default_process to both strings
# on the C++ side before the metric is computed.
cdef extern from "cpp_string_metric.hpp":
object levenshtein_impl(object, object, size_t, size_t, size_t, size_t) except +
object levenshtein_impl_default_process(object, object, size_t, size_t, size_t, size_t) except +
double normalized_levenshtein_impl(object, object, size_t, size_t, size_t, double) except +
double normalized_levenshtein_impl_default_process(object, object, size_t, size_t, size_t, double) except +
object hamming_impl(object, object, size_t) except +
object hamming_impl_default_process(object, object, size_t) except +
double normalized_hamming_impl(object, object, double) except +
double normalized_hamming_impl_default_process(object, object, double) except +
def levenshtein(s1, s2, weights=(1,1,1), max=None):
def levenshtein(s1, s2, weights=(1,1,1), processor=None, max=None):
"""
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein with custom
@ -30,6 +32,10 @@ def levenshtein(s1, s2, weights=(1,1,1), max=None):
The weights for the three operations in the form
(insertion, deletion, substitution). Default is (1, 1, 1),
which gives all three operations a weight of 1.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
comparing them. When processor is True ``utils.default_process``
is used. Default is None, which deactivates this behaviour.
max : int or None, optional
Maximum Levenshtein distance between s1 and s2, that is
considered as a result. If the distance is bigger than max,
@ -183,6 +189,12 @@ def levenshtein(s1, s2, weights=(1,1,1), max=None):
if max is not None:
max_ = max
if processor is True or processor == default_process:
return levenshtein_impl_default_process(s1, s2, insertion, deletion, substitution, max_)
elif callable(processor):
s1 = processor(s1)
s2 = processor(s2)
return levenshtein_impl(s1, s2, insertion, deletion, substitution, max_)
@ -292,7 +304,7 @@ def normalized_levenshtein(s1, s2, weights=(1,1,1), processor=None, double score
return normalized_levenshtein_impl(s1, s2, insertion, deletion, substitution, score_cutoff)
def hamming(s1, s2, max=None):
def hamming(s1, s2, processor=None, max=None):
"""
Calculates the Hamming distance between two strings.
The hamming distance is defined as the number of positions
@ -305,6 +317,10 @@ def hamming(s1, s2, max=None):
First string to compare.
s2 : str
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
comparing them. When processor is True ``utils.default_process``
is used. Default is None, which deactivates this behaviour.
max : int or None, optional
Maximum Hamming distance between s1 and s2, that is
considered as a result. If the distance is bigger than max,
@ -326,6 +342,12 @@ def hamming(s1, s2, max=None):
if max is not None:
max_ = max
if processor is True or processor == default_process:
return hamming_impl_default_process(s1, s2, max_)
elif callable(processor):
s1 = processor(s1)
s2 = processor(s2)
return hamming_impl(s1, s2, max_)

29
src/cpp_utils.cpp vendored
View File

@ -306,6 +306,8 @@
# endif
# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
# define CYTHON_UNUSED __attribute__ ((__unused__))
# elif defined(_MSC_VER)
# define CYTHON_UNUSED __pragma(warning(suppress:4100))
# else
# define CYTHON_UNUSED
# endif
@ -479,7 +481,7 @@ class __Pyx_FakeReference {
typedef PyObject *(*__pyx_vectorcallfunc)(PyObject *callable, PyObject *const *args,
size_t nargsf, PyObject *kwnames);
#define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET ((size_t)1 << (8 * sizeof(size_t) - 1))
#define __Pyx_PyVectorcall_NARGS(n) ((Py_ssize_t)((n) & ~__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET))
#define __Pyx_PyVectorcall_NARGS(n) ((Py_ssize_t)(((size_t)(n)) & ~__Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET))
#else
#define __Pyx_PY_VECTORCALL_ARGUMENTS_OFFSET 0
#define __Pyx_PyVectorcall_NARGS(n) ((Py_ssize_t)(n))
@ -1194,7 +1196,7 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject
/* PyVectorcallFastCallDict.proto */
#if CYTHON_METH_FASTCALL
static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, Py_ssize_t nargs, PyObject *kw);
static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw);
#endif
/* CythonFunctionShared.proto */
@ -1676,7 +1678,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_9cpp_utils_default_process, "\n This function preprocesses a string by:\n - removing all non alphanumeric characters\n - trimming whitespaces\n - converting all characters to lower case\n \n Right now this only affects characters lower than 256\n (extended Ascii), while all other characters are not modified.\n This should be enough for most western languages. Full Unicode\n support will be added in a later release.\n \n Parameters\n ----------\n sentence : str\n String to preprocess\n \n Returns\n -------\n processed_string : str\n processed string\n ");
PyDoc_STRVAR(__pyx_doc_9cpp_utils_default_process, "\n This function preprocesses a string by:\n - removing all non alphanumeric characters\n - trimming whitespaces\n - converting all characters to lower case\n \n Parameters\n ----------\n sentence : str\n String to preprocess\n \n Returns\n -------\n processed_string : str\n processed string\n ");
static PyMethodDef __pyx_mdef_9cpp_utils_1default_process = {"default_process", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_9cpp_utils_1default_process, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_9cpp_utils_default_process};
static PyObject *__pyx_pw_9cpp_utils_1default_process(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -1753,7 +1755,7 @@ static PyObject *__pyx_pf_9cpp_utils_default_process(CYTHON_UNUSED PyObject *__p
int __pyx_clineno = 0;
__Pyx_RefNannySetupContext("default_process", 0);
/* "cpp_utils.pyx":30
/* "cpp_utils.pyx":25
* processed string
* """
* return default_process_impl(sentence) # <<<<<<<<<<<<<<
@ -1761,10 +1763,10 @@ static PyObject *__pyx_pf_9cpp_utils_default_process(CYTHON_UNUSED PyObject *__p
__Pyx_XDECREF(__pyx_r);
try {
__pyx_t_1 = default_process_impl(__pyx_v_sentence);
if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 30, __pyx_L1_error)
if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error)
} catch(...) {
__Pyx_CppExn2PyErr();
__PYX_ERR(0, 30, __pyx_L1_error)
__PYX_ERR(0, 25, __pyx_L1_error)
}
__Pyx_GOTREF(__pyx_t_1);
__pyx_r = __pyx_t_1;
@ -2759,22 +2761,23 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject
/* PyVectorcallFastCallDict */
#if CYTHON_METH_FASTCALL
static PyObject *__Pyx_PyVectorcall_FastCallDict_kw(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, Py_ssize_t nargs, PyObject *kw)
static PyObject *__Pyx_PyVectorcall_FastCallDict_kw(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw)
{
PyObject *res = NULL;
PyObject *kwnames;
PyObject **newargs;
PyObject **kwvalues;
Py_ssize_t i, pos;
size_t j;
PyObject *key, *value;
unsigned long keys_are_strings;
Py_ssize_t nkw = PyDict_GET_SIZE(kw);
newargs = (PyObject **)PyMem_Malloc((size_t)(nargs + nkw) * sizeof(args[0]));
newargs = (PyObject **)PyMem_Malloc((nargs + (size_t)nkw) * sizeof(args[0]));
if (unlikely(newargs == NULL)) {
PyErr_NoMemory();
return NULL;
}
for (i = 0; i < nargs; i++) newargs[i] = args[i];
for (j = 0; j < nargs; j++) newargs[j] = args[j];
kwnames = PyTuple_New(nkw);
if (unlikely(kwnames == NULL)) {
PyMem_Free(newargs);
@ -2795,7 +2798,7 @@ static PyObject *__Pyx_PyVectorcall_FastCallDict_kw(PyObject *func, __pyx_vector
PyErr_SetString(PyExc_TypeError, "keywords must be strings");
goto cleanup;
}
res = vc(func, newargs, (size_t)nargs, kwnames);
res = vc(func, newargs, nargs, kwnames);
cleanup:
Py_DECREF(kwnames);
for (i = 0; i < nkw; i++)
@ -2803,10 +2806,10 @@ cleanup:
PyMem_Free(newargs);
return res;
}
static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, Py_ssize_t nargs, PyObject *kw)
static CYTHON_INLINE PyObject *__Pyx_PyVectorcall_FastCallDict(PyObject *func, __pyx_vectorcallfunc vc, PyObject *const *args, size_t nargs, PyObject *kw)
{
if (likely(kw == NULL) || PyDict_GET_SIZE(kw) == 0) {
return vc(func, args, (size_t)nargs, NULL);
return vc(func, args, nargs, NULL);
}
return __Pyx_PyVectorcall_FastCallDict_kw(func, vc, args, nargs, kw);
}
@ -3320,7 +3323,7 @@ static PyObject *__Pyx_CyFunction_CallAsMethod(PyObject *func, PyObject *args, P
__pyx_vectorcallfunc vc = __Pyx_CyFunction_func_vectorcall(cyfunc);
if (vc) {
#if CYTHON_ASSUME_SAFE_MACROS
return __Pyx_PyVectorcall_FastCallDict(func, vc, &PyTuple_GET_ITEM(args, 0), PyTuple_GET_SIZE(args), kw);
return __Pyx_PyVectorcall_FastCallDict(func, vc, &PyTuple_GET_ITEM(args, 0), (size_t)PyTuple_GET_SIZE(args), kw);
#else
(void) &__Pyx_PyVectorcall_FastCallDict;
return PyVectorcall_Call(func, args, kw);

View File

@ -12,11 +12,6 @@ def default_process(sentence):
- trimming whitespaces
- converting all characters to lower case
Right now this only affects characters lower than 256
(extended Ascii), while all other characters are not modified.
This should be enough for most western languages. Full Unicode
support will be added in a later release.
Parameters
----------
sentence : str

@ -1 +1 @@
Subproject commit 54c3a3f8fea959f3797cee45fb91420b926ac02b
Subproject commit 836f6a5a646df2d1b8d02bd18da37a8e121250ce

View File

@ -3,6 +3,6 @@ rapid string matching library
"""
__author__ = "Max Bachmann"
__license__ = "MIT"
__version__ = "1.3.3"
__version__ = "1.4.0"
from rapidfuzz import process, fuzz, utils, levenshtein, string_metric

View File

@ -70,6 +70,10 @@ def extractOne_scorer(s1, s2, scorer, processor=None, **kwargs):
def extract_scorer(s1, s2, scorer, processor=None, **kwargs):
    """
    Run ``process.extract`` with ``s2`` as the only choice and return the
    element at position 1 of the first result.
    """
    results = process.extract(s1, [s2], processor=processor, scorer=scorer, **kwargs)
    first_result = results[0]
    return first_result[1]
def extract_iter_scorer(s1, s2, scorer, processor=None, **kwargs):
    """
    Run ``process.extract_iter`` with ``s2`` as the only choice, collect
    everything it yields and return the element at position 1 of the first
    result.
    """
    results = list(process.extract_iter(s1, [s2], processor=processor, scorer=scorer, **kwargs))
    first_result = results[0]
    return first_result[1]
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
SCORERS = [
@ -101,8 +105,8 @@ PROCESSORS = [
def test_partial_ratio(s1, s2):
"""
test partial_ratio. Currently this only tests, so there are no exceptions
In the future this should validate the implementation. However this requires
a correct implementation to be found.
In the future this should validate the implementation. However the current implementation
is not completely optimal in some edge cases
"""
fuzz.partial_ratio(s1, s2)
@ -130,19 +134,33 @@ def test_levenshtein_word(s1, s2):
"""
Test short Levenshtein implementation against simple implementation
"""
# uniform Levenshtein
# distance
reference_dist = levenshtein(s1, s2)
reference_sim = normalize_distance(reference_dist, s1, s2)
assert string_metric.levenshtein(s1, s2) == reference_dist
assert extractOne_scorer( s1, s2, string_metric.levenshtein) == reference_dist
assert extract_scorer( s1, s2, string_metric.levenshtein) == reference_dist
assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist
# normalized distance
reference_sim = normalize_distance(reference_dist, s1, s2)
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
# InDel-Distance
# distance
reference_dist = levenshtein(s1, s2, (1,1,2))
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
assert extractOne_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
assert extract_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
# normalized distance
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
@given(s1=st.text(min_size=65), s2=st.text(min_size=65))
@ -151,20 +169,33 @@ def test_levenshtein_block(s1, s2):
"""
Test blockwise Levenshtein implementation against simple implementation
"""
# uniform Levenshtein
# distance
reference_dist = levenshtein(s1, s2)
reference_sim = normalize_distance(reference_dist, s1, s2)
assert string_metric.levenshtein(s1, s2) == reference_dist
assert extractOne_scorer( s1, s2, string_metric.levenshtein) == reference_dist
assert extract_scorer( s1, s2, string_metric.levenshtein) == reference_dist
assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist
# normalized distance
reference_sim = normalize_distance(reference_dist, s1, s2)
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
# InDel-Distance
# distance
reference_dist = levenshtein(s1, s2, (1,1,2))
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert extractOne_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
assert extract_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
# normalized distance
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert isclose(string_metric.normalized_levenshtein(s1, s2, weights=(1,1,2)), reference_sim)
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
# Property-style test driven by the @given/@settings decorators below: s1 and s2
# are drawn from st.text(), with max_examples=500 and no per-example deadline.
# (presumably hypothesis; the imports are outside this view - confirm)
@given(s1=st.text(), s2=st.text())
@settings(max_examples=500, deadline=None)
# NOTE(review): the next line is a diff hunk header that happens to carry the
# function signature; it is not Python source as written.
@ -172,20 +203,33 @@ def test_levenshtein_random(s1, s2):
"""
Test mixed strings to test through all implementations of Levenshtein
"""
# Every assertion below compares a string_metric result with the reference
# helpers levenshtein() / normalize_distance(), which are defined outside this view.
# uniform Levenshtein
# distance
reference_dist = levenshtein(s1, s2)
# NOTE(review): reference_sim is assigned here and again, identically, under
# "normalized distance" below - looks like a removed and an added diff line
# shown together; confirm which one belongs to the file.
reference_sim = normalize_distance(reference_dist, s1, s2)
assert string_metric.levenshtein(s1, s2) == reference_dist
# The same scorer is also passed through extractOne_scorer / extract_scorer /
# extract_iter_scorer (helpers defined elsewhere; presumably thin wrappers around
# process.extractOne / extract / extract_iter - verify) and must give the same value.
assert extractOne_scorer( s1, s2, string_metric.levenshtein) == reference_dist
assert extract_scorer( s1, s2, string_metric.levenshtein) == reference_dist
assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist
# normalized distance
reference_sim = normalize_distance(reference_dist, s1, s2)
# Normalized results are floats, so they are compared with isclose() rather than ==.
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
# NOTE(review): the next two assertions repeat the two after them except for
# spacing inside the call - likely old/new diff lines interleaved; confirm.
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
# InDel-Distance
# distance
# Same checks repeated with weights (1,1,2); the reference helper takes the
# weights positionally, the wrapper helpers take them as weights=...
reference_dist = levenshtein(s1, s2, (1,1,2))
# NOTE(review): duplicated reference_sim assignment, see the identical line
# under "normalized distance" below.
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
assert extractOne_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
assert extract_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
# normalized distance
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
# NOTE(review): the next two assertions repeat the two after them except for
# spacing inside the call - likely old/new diff lines interleaved; confirm.
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
@given(sentence=st.text())
@settings(max_examples=200)

View File

@ -160,6 +160,26 @@ class ProcessTest(unittest.TestCase):
self.assertIsNotNone(best)
self.assertEqual(best[1], 0)
def testNoneElements(self):
    """
    A None entry among the choices must be skipped without shifting the
    index reported for the entries that follow it.
    """
    choices_with_gap = [None, "tes"]

    # Single best match: field 2 of the result must point at position 1 ("tes").
    top_match = process.extractOne("test", choices_with_gap)
    self.assertEqual(top_match[2], 1)

    # Same expectation for the first entry of the list returned by extract().
    ranked = process.extract("test", choices_with_gap, limit=1)
    self.assertEqual(ranked[0][2], 1)
def testResultOrder(self):
    """
    Equal scores are resolved by position: of several identically scored
    choices, the earliest one is the one reported.
    """
    tied_choices = ["tes", "tes"]

    # Both entries score the same, so field 2 of the result must be position 0.
    winner = process.extractOne("test", tied_choices)
    self.assertEqual(winner[2], 0)

    # extract() limited to one result must pick the same, first entry.
    leaders = process.extract("test", tied_choices, limit=1)
    self.assertEqual(leaders[0][2], 0)
def testEmptyStrings(self):
choices = [
"",