add cdist implementation

This commit is contained in:
Max Bachmann 2021-09-10 12:44:54 +02:00
parent 1aed654d4f
commit 56f062b063
15 changed files with 11706 additions and 3583 deletions

View File

@ -101,7 +101,8 @@ jobs:
fail-fast: false
matrix:
python_tag: [ "pp36-*", "pp37-*"]
os: [ubuntu-latest, windows-latest, macos-latest]
# numpy ships no wheels for PyPy on macOS
os: [ubuntu-latest, windows-latest]
env:
CIBW_BUILD: ${{matrix.python_tag}}
# activate tests when the fix for

View File

@ -1,6 +1,10 @@
process module
==============
cdist
----------
.. autofunction:: rapidfuzz.process.cdist
extract
-------
.. autofunction:: rapidfuzz.process.extract

View File

@ -2,5 +2,6 @@
requires = [
"setuptools",
"wheel",
"oldest-supported-numpy"
]
build-backend = "setuptools.build_meta"

View File

@ -26,6 +26,8 @@ package_dir=
=src
packages = find:
python_requires = >=3.5
install_requires =
numpy
[options.packages.find]
where=src

View File

@ -2,6 +2,7 @@ from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext
import sys
import os
import numpy as np
# use with export RAPIDFUZZ_TRACE=1
RAPIDFUZZ_TRACE = os.environ.get("RAPIDFUZZ_TRACE", False)
@ -46,7 +47,7 @@ ext_modules = [
'src/cpp_process.cpp',
'src/rapidfuzz-cpp/rapidfuzz/details/unicode.cpp'
],
include_dirs=["src/rapidfuzz-cpp/"],
include_dirs=["src/rapidfuzz-cpp/", np.get_include()],
language='c++',
),
Extension(

View File

@ -204,8 +204,9 @@ double RATIO##_impl_inner_##PROCESSOR(const proc_string& s1, const Sentence& s2,
{ \
switch(s1.kind){ \
LIST_OF_CASES(RATIO_FUNC, PROCESSOR) \
default: \
throw std::logic_error("Reached end of control flow in " #RATIO "_impl_inner_" #PROCESSOR); \
} \
assert(false); /* silence any warnings about missing return value */ \
}
/* generate <ratio_name>_impl_<processor> functions which are used internally
@ -217,8 +218,9 @@ double RATIO##_impl_##PROCESSOR(const proc_string& s1, const proc_string& s2, Ar
{ \
switch(s1.kind){ \
LIST_OF_CASES(RATIO##_impl_inner_##PROCESSOR, PROCESSOR) \
default: \
throw std::logic_error("Reached end of control flow in " #RATIO "_impl_" #PROCESSOR); \
} \
assert(false); /* silence any warnings about missing return value */ \
}
#define RATIO_IMPL_DEF(RATIO, RATIO_FUNC) \
@ -236,8 +238,9 @@ size_t RATIO##_impl_inner_##PROCESSOR(const proc_string& s1, const Sentence& s2,
{ \
switch(s1.kind){ \
LIST_OF_CASES(RATIO_FUNC, PROCESSOR) \
default: \
throw std::logic_error("Reached end of control flow in " #RATIO "_impl_inner_" #PROCESSOR); \
} \
assert(false); /* silence any warnings about missing return value */ \
}
/* generate <ratio_name>_impl_<processor> functions which are used internally
@ -249,8 +252,9 @@ size_t RATIO##_impl_##PROCESSOR(const proc_string& s1, const proc_string& s2, Ar
{ \
switch(s1.kind){ \
LIST_OF_CASES(RATIO##_impl_inner_##PROCESSOR, PROCESSOR) \
default: \
throw std::logic_error("Reached end of control flow in " #RATIO "_impl_" #PROCESSOR); \
} \
assert(false); /* silence any warnings about missing return value */ \
}
#define DISTANCE_IMPL_DEF(RATIO, RATIO_FUNC) \
@ -310,3 +314,36 @@ PyObject* RATIO##_default_process(const proc_string& s1, const proc_string& s2,
size_t result = RATIO##_impl_default_process(s1, s2, max); \
return dist_to_long(result); \
}
/* Runs utils::default_process over the characters of `sentence`, treating
 * its buffer as an array of CharT.
 *
 * When `sentence.allocated` is false the characters are first copied into a
 * newly malloc'ed buffer and the processing runs on that copy; otherwise it
 * runs directly on `sentence.data`.
 *
 * The returned proc_string has `allocated` set to true, `data` pointing at
 * the processed buffer and `length` set to the value returned by
 * utils::default_process. `kind` is left untouched.
 * NOTE(review): presumably `allocated == true` tells the owner to free()
 * `data` - confirm against the code that releases proc_string.
 *
 * Throws std::bad_alloc when the copy cannot be allocated.
 */
template <typename CharT>
proc_string default_process_func_impl(proc_string sentence) {
    CharT* str = static_cast<CharT*>(sentence.data);

    if (!sentence.allocated) {
        /* malloc(0) is allowed to return a null pointer, which must not be
         * mistaken for an out-of-memory condition on empty input. Always
         * request room for at least one element. */
        const size_t alloc_count = (sentence.length != 0) ? sentence.length : 1;
        CharT* copy = static_cast<CharT*>(malloc(alloc_count * sizeof(CharT)));
        if (copy == nullptr) {
            throw std::bad_alloc();
        }
        std::copy(str, str + sentence.length, copy);
        str = copy;
    }

    sentence.allocated = true;
    sentence.data = str;
    sentence.length = utils::default_process(str, sentence.length);
    return sentence;
}
/* Dispatches on `sentence.kind` to the default_process_func_impl<TYPE>
 * instantiation for the matching character type and returns its result.
 *
 * LIST_OF_CASES() expands to one X_ENUM(...) entry per supported kind; X_ENUM
 * is defined only for the duration of this function and turns every entry
 * into a `case` label that forwards the sentence.
 *
 * Throws std::logic_error when `sentence.kind` matches none of the cases.
 */
proc_string default_process_func(proc_string sentence) {
#define X_ENUM(KIND, TYPE, MSVC_TUPLE) \
    case KIND:                         \
        return default_process_func_impl<TYPE>(std::move(sentence));

    switch (sentence.kind) {
    LIST_OF_CASES()
    default:
        throw std::logic_error("Reached end of control flow in default_process_func");
    }

#undef X_ENUM
}

View File

@ -23,6 +23,7 @@ cdef extern from "cpp_common.hpp":
int is_valid_string(object py_str) except +
proc_string convert_string(object py_str)
void validate_string(object py_str, const char* err) except +
proc_string default_process_func(proc_string sentence) except +
cdef inline proc_string hash_array(arr) except *:
# TODO on Cpython this does not require any copies

244
src/cpp_fuzz.cpp vendored
View File

@ -1976,11 +1976,11 @@ static const char __pyx_k_partial_token_set_ratio[] = "partial_token_set_ratio";
static const char __pyx_k_partial_token_sort_ratio[] = "partial_token_sort_ratio";
static const char __pyx_k_token_set_ratio_line_217[] = "token_set_ratio (line 217)";
static const char __pyx_k_token_sort_ratio_line_170[] = "token_sort_ratio (line 170)";
static const char __pyx_k_Sorts_the_words_in_the_strings[] = "\n Sorts the words in the strings and calculates the fuzz.ratio between them\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_sort_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy wuzzy was a bear\", \"wuzzy fuzzy was a bear\")\n 100.0\n ";
static const char __pyx_k_Calculates_a_quick_ratio_betwee[] = "\n Calculates a quick ratio between two strings using fuzz.ratio.\n The only difference to fuzz.ratio is, that this preprocesses\n the strings by default.\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Examples\n --------\n >>> fuzz.QRatio(\"this is a test\", \"THIS is a test!\")\n 100.0\n ";
static const char __pyx_k_Calculates_the_normalized_InDel[] = "\n Calculates the normalized InDel distance.\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n See Also\n --------\n rapidfuzz.string_metric.normalized_levenshtein : Normalized levenshtein distance\n\n Notes\n -----\n .. image:: img/ratio.svg\n\n Examples\n --------\n >>> fuzz.ratio(\"this is a test\", \"this is a test!\")\n 96.55171966552734\n ";
static const char __pyx_k_Compares_the_words_in_the_strin[] = "\n Compares the words in the strings based on unique and common words between them\n using fuzz.ratio\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_set_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 83.8709716796875\n >>> fuzz.token_set_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 100.0\n ";
static const char __pyx_k_Searches_for_the_optimal_alignm[] = "\n Searches for the optimal alignment of the shorter string in the\n longer string and returns the fuzz.ratio for this alignment.\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n Depending on the length of the needle (shorter string) different\n implementations are used to improve the performance.\n\n short needle (length \342\211\244 64):\n When using a short needle length the fuzz.ratio is calculated for all\n alignments that could result in an optimal alignment. It is\n guaranteed to find the optimal alignment. For short needles this is very\n fast, since for them fuzz.ratio runs in ``O(N)`` time. This results in a worst\n case performance of ``O(NM)``.\n \n .. image:: img/partial_ratio_short_needle.svg\n\n long needle (length > 64):\n For long needles a similar implementation to FuzzyWuzzy is used.\n This implementation only considers alignments which start at one\n of the longest common substrings. This results in a worst case performance\n of ``O(N[N/64]M)``. However usually most of the alignments can be skipped.\n The following Python code shows the concept:\n\n .. 
code-block:: python\n\n blocks = SequenceMatcher(None, needle, longer, False).get_ma""tching_blocks()\n score = 0\n for block in blocks:\n long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0\n long_end = long_start + len(shorter)\n long_substr = longer[long_start:long_end]\n score = max(score, fuzz.ratio(needle, long_substr))\n\n This is a lot faster than checking all possible alignments. However it\n only finds one of the best alignments and not necessarily the optimal one.\n\n .. image:: img/partial_ratio_long_needle.svg\n\n Examples\n --------\n >>> fuzz.partial_ratio(\"this is a test\", \"this is a test!\")\n 100.0\n ";
static const char __pyx_k_Sorts_the_words_in_the_strings[] = "\n Sorts the words in the strings and calculates the fuzz.ratio between them\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_sort_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy wuzzy was a bear\", \"wuzzy fuzzy was a bear\")\n 100.0\n ";
static const char __pyx_k_Calculates_a_quick_ratio_betwee[] = "\n Calculates a quick ratio between two strings using fuzz.ratio.\n The only difference to fuzz.ratio is, that this preprocesses\n the strings by default.\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Examples\n --------\n >>> fuzz.QRatio(\"this is a test\", \"THIS is a test!\")\n 100.0\n ";
static const char __pyx_k_Calculates_the_normalized_InDel[] = "\n Calculates the normalized InDel distance.\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n See Also\n --------\n rapidfuzz.string_metric.normalized_levenshtein : Normalized levenshtein distance\n\n Notes\n -----\n .. image:: img/ratio.svg\n\n Examples\n --------\n >>> fuzz.ratio(\"this is a test\", \"this is a test!\")\n 96.55171966552734\n ";
static const char __pyx_k_Compares_the_words_in_the_strin[] = "\n Compares the words in the strings based on unique and common words between them\n using fuzz.ratio\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_set_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 83.8709716796875\n >>> fuzz.token_set_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 100.0\n ";
static const char __pyx_k_Searches_for_the_optimal_alignm[] = "\n Searches for the optimal alignment of the shorter string in the\n longer string and returns the fuzz.ratio for this alignment.\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n Depending on the length of the needle (shorter string) different\n implementations are used to improve the performance.\n\n short needle (length \342\211\244 64):\n When using a short needle length the fuzz.ratio is calculated for all\n alignments that could result in an optimal alignment. It is\n guaranteed to find the optimal alignment. For short needles this is very\n fast, since for them fuzz.ratio runs in ``O(N)`` time. This results in a worst\n case performance of ``O(NM)``.\n \n .. image:: img/partial_ratio_short_needle.svg\n\n long needle (length > 64):\n For long needles a similar implementation to FuzzyWuzzy is used.\n This implementation only considers alignments which start at one\n of the longest common substrings. This results in a worst case performance\n of ``O(N[N/64]M)``. However usually most of the alignments can be skipped.\n The following Python code shows the concept:\n\n .. 
code-block:: python\n\n blocks = SequenceMatcher(None,"" needle, longer, False).get_matching_blocks()\n score = 0\n for block in blocks:\n long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0\n long_end = long_start + len(shorter)\n long_substr = longer[long_start:long_end]\n score = max(score, fuzz.ratio(needle, long_substr))\n\n This is a lot faster than checking all possible alignments. However it\n only finds one of the best alignments and not necessarily the optimal one.\n\n .. image:: img/partial_ratio_long_needle.svg\n\n Examples\n --------\n >>> fuzz.partial_ratio(\"this is a test\", \"this is a test!\")\n 100.0\n ";
#if !CYTHON_USE_MODULE_STATE
static PyObject *__pyx_kp_u_Calculates_a_quick_ratio_betwee;
static PyObject *__pyx_kp_u_Calculates_the_normalized_InDel;
@ -2542,7 +2542,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_ratio, "\n Calculates the normalized InDel distance.\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n See Also\n --------\n rapidfuzz.string_metric.normalized_levenshtein : Normalized levenshtein distance\n\n Notes\n -----\n .. image:: img/ratio.svg\n\n Examples\n --------\n >>> fuzz.ratio(\"this is a test\", \"this is a test!\")\n 96.55171966552734\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_ratio, "\n Calculates the normalized InDel distance.\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n See Also\n --------\n rapidfuzz.string_metric.normalized_levenshtein : Normalized levenshtein distance\n\n Notes\n -----\n .. image:: img/ratio.svg\n\n Examples\n --------\n >>> fuzz.ratio(\"this is a test\", \"this is a test!\")\n 96.55171966552734\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_1ratio = {"ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_1ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_1ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -2926,7 +2926,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_2partial_ratio, "\n Searches for the optimal alignment of the shorter string in the\n longer string and returns the fuzz.ratio for this alignment.\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n Depending on the length of the needle (shorter string) different\n implementations are used to improve the performance.\n\n short needle (length \342\211\244 64):\n When using a short needle length the fuzz.ratio is calculated for all\n alignments that could result in an optimal alignment. It is\n guaranteed to find the optimal alignment. For short needles this is very\n fast, since for them fuzz.ratio runs in ``O(N)`` time. This results in a worst\n case performance of ``O(NM)``.\n \n .. image:: img/partial_ratio_short_needle.svg\n\n long needle (length > 64):\n For long needles a similar implementation to FuzzyWuzzy is used.\n This implementation only considers alignments which start at one\n of the longest common substrings. This results in a worst case performance\n of ``O(N[N/64]M)``. However usually most of the alignments can be skipped.\n The following Python code shows the concept:\n\n .. 
code-block:: python\n\n blocks = SequenceMatcher(None, needle, longer, False).get_ma""tching_blocks()\n score = 0\n for block in blocks:\n long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0\n long_end = long_start + len(shorter)\n long_substr = longer[long_start:long_end]\n score = max(score, fuzz.ratio(needle, long_substr))\n\n This is a lot faster than checking all possible alignments. However it\n only finds one of the best alignments and not necessarily the optimal one.\n\n .. image:: img/partial_ratio_long_needle.svg\n\n Examples\n --------\n >>> fuzz.partial_ratio(\"this is a test\", \"this is a test!\")\n 100.0\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_2partial_ratio, "\n Searches for the optimal alignment of the shorter string in the\n longer string and returns the fuzz.ratio for this alignment.\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is None, which deactivates this behaviour.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n Depending on the length of the needle (shorter string) different\n implementations are used to improve the performance.\n\n short needle (length \342\211\244 64):\n When using a short needle length the fuzz.ratio is calculated for all\n alignments that could result in an optimal alignment. It is\n guaranteed to find the optimal alignment. For short needles this is very\n fast, since for them fuzz.ratio runs in ``O(N)`` time. This results in a worst\n case performance of ``O(NM)``.\n \n .. image:: img/partial_ratio_short_needle.svg\n\n long needle (length > 64):\n For long needles a similar implementation to FuzzyWuzzy is used.\n This implementation only considers alignments which start at one\n of the longest common substrings. This results in a worst case performance\n of ``O(N[N/64]M)``. However usually most of the alignments can be skipped.\n The following Python code shows the concept:\n\n .. 
code-block:: python\n\n blocks = SequenceMatcher(None,"" needle, longer, False).get_matching_blocks()\n score = 0\n for block in blocks:\n long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0\n long_end = long_start + len(shorter)\n long_substr = longer[long_start:long_end]\n score = max(score, fuzz.ratio(needle, long_substr))\n\n This is a lot faster than checking all possible alignments. However it\n only finds one of the best alignments and not necessarily the optimal one.\n\n .. image:: img/partial_ratio_long_needle.svg\n\n Examples\n --------\n >>> fuzz.partial_ratio(\"this is a test\", \"this is a test!\")\n 100.0\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_3partial_ratio = {"partial_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_3partial_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_2partial_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_3partial_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -3310,7 +3310,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_4token_sort_ratio, "\n Sorts the words in the strings and calculates the fuzz.ratio between them\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_sort_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy wuzzy was a bear\", \"wuzzy fuzzy was a bear\")\n 100.0\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_4token_sort_ratio, "\n Sorts the words in the strings and calculates the fuzz.ratio between them\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_sort_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy wuzzy was a bear\", \"wuzzy fuzzy was a bear\")\n 100.0\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_5token_sort_ratio = {"token_sort_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_5token_sort_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_4token_sort_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_5token_sort_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -3694,7 +3694,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_6token_set_ratio, "\n Compares the words in the strings based on unique and common words between them\n using fuzz.ratio\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_set_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 83.8709716796875\n >>> fuzz.token_set_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 100.0\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_6token_set_ratio, "\n Compares the words in the strings based on unique and common words between them\n using fuzz.ratio\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_set_ratio.svg\n\n Examples\n --------\n >>> fuzz.token_sort_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 83.8709716796875\n >>> fuzz.token_set_ratio(\"fuzzy was a bear\", \"fuzzy fuzzy was a bear\")\n 100.0\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_7token_set_ratio = {"token_set_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_7token_set_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_6token_set_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_7token_set_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -4078,7 +4078,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_8token_ratio, "\n Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio\n (faster than manually executing the two functions)\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_ratio.svg\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_8token_ratio, "\n Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio\n (faster than manually executing the two functions)\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/token_ratio.svg\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_9token_ratio = {"token_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_9token_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_8token_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_9token_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -4462,7 +4462,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_10partial_token_sort_ratio, "\n sorts the words in the strings and calculates the fuzz.partial_ratio between them\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/partial_token_sort_ratio.svg\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_10partial_token_sort_ratio, "\n sorts the words in the strings and calculates the fuzz.partial_ratio between them\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/partial_token_sort_ratio.svg\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_11partial_token_sort_ratio = {"partial_token_sort_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_11partial_token_sort_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_10partial_token_sort_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_11partial_token_sort_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -4846,7 +4846,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_12partial_token_set_ratio, "\n Compares the words in the strings based on unique and common words between them\n using fuzz.partial_ratio\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/partial_token_set_ratio.svg\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_12partial_token_set_ratio, "\n Compares the words in the strings based on unique and common words between them\n using fuzz.partial_ratio\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/partial_token_set_ratio.svg\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_13partial_token_set_ratio = {"partial_token_set_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_13partial_token_set_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_12partial_token_set_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_13partial_token_set_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -5230,7 +5230,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_14partial_token_ratio, "\n Helper method that returns the maximum of fuzz.partial_token_set_ratio and\n fuzz.partial_token_sort_ratio (faster than manually executing the two functions)\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/partial_token_ratio.svg\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_14partial_token_ratio, "\n Helper method that returns the maximum of fuzz.partial_token_set_ratio and\n fuzz.partial_token_sort_ratio (faster than manually executing the two functions)\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/partial_token_ratio.svg\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_15partial_token_ratio = {"partial_token_ratio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_15partial_token_ratio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_14partial_token_ratio};
static PyObject *__pyx_pw_8cpp_fuzz_15partial_token_ratio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -5614,7 +5614,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_16WRatio, "\n Calculates a weighted ratio based on the other ratio algorithms\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/WRatio.svg\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_16WRatio, "\n Calculates a weighted ratio based on the other ratio algorithms\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Notes\n -----\n .. image:: img/WRatio.svg\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_17WRatio = {"WRatio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_17WRatio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_16WRatio};
static PyObject *__pyx_pw_8cpp_fuzz_17WRatio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -5998,7 +5998,7 @@ PyObject *const *__pyx_args, Py_ssize_t __pyx_nargs, PyObject *__pyx_kwds
PyObject *__pyx_args, PyObject *__pyx_kwds
#endif
); /*proto*/
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_18QRatio, "\n Calculates a quick ratio between two strings using fuzz.ratio.\n The only difference to fuzz.ratio is, that this preprocesses\n the strings by default.\n\n Parameters\n ----------\n s1 : str\n First string to compare.\n s2 : str\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Examples\n --------\n >>> fuzz.QRatio(\"this is a test\", \"THIS is a test!\")\n 100.0\n ");
PyDoc_STRVAR(__pyx_doc_8cpp_fuzz_18QRatio, "\n Calculates a quick ratio between two strings using fuzz.ratio.\n The only difference to fuzz.ratio is, that this preprocesses\n the strings by default.\n\n Parameters\n ----------\n s1 : Sequence[Hashable]\n First string to compare.\n s2 : Sequence[Hashable]\n Second string to compare.\n processor: bool or callable, optional\n Optional callable that is used to preprocess the strings before\n comparing them. When processor is True ``utils.default_process``\n is used. Default is True.\n score_cutoff : float, optional\n Optional argument for a score threshold as a float between 0 and 100.\n For ratio < score_cutoff 0 is returned instead. Default is 0,\n which deactivates this behaviour.\n\n Returns\n -------\n similarity : float\n similarity between s1 and s2 as a float between 0 and 100\n\n Examples\n --------\n >>> fuzz.QRatio(\"this is a test\", \"THIS is a test!\")\n 100.0\n ");
static PyMethodDef __pyx_mdef_8cpp_fuzz_19QRatio = {"QRatio", (PyCFunction)(void*)(__Pyx_PyCFunction_FastCallWithKeywords)__pyx_pw_8cpp_fuzz_19QRatio, __Pyx_METH_FASTCALL|METH_KEYWORDS, __pyx_doc_8cpp_fuzz_18QRatio};
static PyObject *__pyx_pw_8cpp_fuzz_19QRatio(PyObject *__pyx_self,
#if CYTHON_METH_FASTCALL
@ -6364,8 +6364,8 @@ static PyObject *__pyx_pf_8cpp_fuzz_18QRatio(CYTHON_UNUSED PyObject *__pyx_self,
return __pyx_r;
}
/* "cpp_common.pxd":27
* void validate_string(object py_str, const char* err) except +
/* "cpp_common.pxd":28
* proc_string default_process_func(proc_string sentence) except +
*
* cdef inline proc_string hash_array(arr) except *: # <<<<<<<<<<<<<<
* # TODO on Cpython this does not require any copies
@ -6407,30 +6407,30 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
int __pyx_clineno = 0;
__Pyx_RefNannySetupContext("hash_array", 0);
/* "cpp_common.pxd":30
/* "cpp_common.pxd":31
* # TODO on Cpython this does not require any copies
* cdef proc_string s_proc
* cdef Py_UCS4 typecode = <Py_UCS4>arr.typecode # <<<<<<<<<<<<<<
* s_proc.length = <size_t>len(arr)
*
*/
__pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_arr, __pyx_n_s_typecode); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 30, __pyx_L1_error)
__pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_arr, __pyx_n_s_typecode); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 31, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_2 = __Pyx_PyObject_AsPy_UCS4(__pyx_t_1); if (unlikely((__pyx_t_2 == (Py_UCS4)-1) && PyErr_Occurred())) __PYX_ERR(1, 30, __pyx_L1_error)
__pyx_t_2 = __Pyx_PyObject_AsPy_UCS4(__pyx_t_1); if (unlikely((__pyx_t_2 == (Py_UCS4)-1) && PyErr_Occurred())) __PYX_ERR(1, 31, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_v_typecode = ((Py_UCS4)__pyx_t_2);
/* "cpp_common.pxd":31
/* "cpp_common.pxd":32
* cdef proc_string s_proc
* cdef Py_UCS4 typecode = <Py_UCS4>arr.typecode
* s_proc.length = <size_t>len(arr) # <<<<<<<<<<<<<<
*
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t))
*/
__pyx_t_3 = PyObject_Length(__pyx_v_arr); if (unlikely(__pyx_t_3 == ((Py_ssize_t)-1))) __PYX_ERR(1, 31, __pyx_L1_error)
__pyx_t_3 = PyObject_Length(__pyx_v_arr); if (unlikely(__pyx_t_3 == ((Py_ssize_t)-1))) __PYX_ERR(1, 32, __pyx_L1_error)
__pyx_v_s_proc.length = ((size_t)__pyx_t_3);
/* "cpp_common.pxd":33
/* "cpp_common.pxd":34
* s_proc.length = <size_t>len(arr)
*
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t)) # <<<<<<<<<<<<<<
@ -6439,7 +6439,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.data = malloc((__pyx_v_s_proc.length * (sizeof(uint64_t))));
/* "cpp_common.pxd":35
/* "cpp_common.pxd":36
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t))
*
* if s_proc.data == NULL: # <<<<<<<<<<<<<<
@ -6449,16 +6449,16 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__pyx_t_4 = ((__pyx_v_s_proc.data == NULL) != 0);
if (unlikely(__pyx_t_4)) {
/* "cpp_common.pxd":36
/* "cpp_common.pxd":37
*
* if s_proc.data == NULL:
* raise MemoryError # <<<<<<<<<<<<<<
*
* try:
*/
PyErr_NoMemory(); __PYX_ERR(1, 36, __pyx_L1_error)
PyErr_NoMemory(); __PYX_ERR(1, 37, __pyx_L1_error)
/* "cpp_common.pxd":35
/* "cpp_common.pxd":36
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t))
*
* if s_proc.data == NULL: # <<<<<<<<<<<<<<
@ -6467,7 +6467,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
}
/* "cpp_common.pxd":38
/* "cpp_common.pxd":39
* raise MemoryError
*
* try: # <<<<<<<<<<<<<<
@ -6483,7 +6483,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__Pyx_XGOTREF(__pyx_t_7);
/*try:*/ {
/* "cpp_common.pxd":40
/* "cpp_common.pxd":41
* try:
* # ignore signed/unsigned, since it is not relevant in any of the algorithms
* if typecode in {'b', 'B'}: # signed/unsigned char # <<<<<<<<<<<<<<
@ -6494,7 +6494,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
case 98:
case 66:
/* "cpp_common.pxd":41
/* "cpp_common.pxd":42
* # ignore signed/unsigned, since it is not relevant in any of the algorithms
* if typecode in {'b', 'B'}: # signed/unsigned char
* s_proc.kind = RAPIDFUZZ_UINT64 # <<<<<<<<<<<<<<
@ -6503,7 +6503,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_UINT64;
/* "cpp_common.pxd":42
/* "cpp_common.pxd":43
* if typecode in {'b', 'B'}: # signed/unsigned char
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6515,21 +6515,21 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":43
/* "cpp_common.pxd":44
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i] # <<<<<<<<<<<<<<
* elif typecode == 'u': # 'u' wchar_t
* s_proc.kind = RAPIDFUZZ_UINT64
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 43, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 44, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 43, __pyx_L4_error)
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 44, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_11);
}
/* "cpp_common.pxd":40
/* "cpp_common.pxd":41
* try:
* # ignore signed/unsigned, since it is not relevant in any of the algorithms
* if typecode in {'b', 'B'}: # signed/unsigned char # <<<<<<<<<<<<<<
@ -6539,7 +6539,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
break;
case 0x75:
/* "cpp_common.pxd":45
/* "cpp_common.pxd":46
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode == 'u': # 'u' wchar_t
* s_proc.kind = RAPIDFUZZ_UINT64 # <<<<<<<<<<<<<<
@ -6548,7 +6548,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_UINT64;
/* "cpp_common.pxd":46
/* "cpp_common.pxd":47
* elif typecode == 'u': # 'u' wchar_t
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6560,21 +6560,21 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":47
/* "cpp_common.pxd":48
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>arr[i] # <<<<<<<<<<<<<<
* elif typecode in {'h', 'H'}: # signed/unsigned short
* s_proc.kind = RAPIDFUZZ_UINT64
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 47, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 48, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_2 = __Pyx_PyObject_AsPy_UCS4(__pyx_t_1); if (unlikely((__pyx_t_2 == (Py_UCS4)-1) && PyErr_Occurred())) __PYX_ERR(1, 47, __pyx_L4_error)
__pyx_t_2 = __Pyx_PyObject_AsPy_UCS4(__pyx_t_1); if (unlikely((__pyx_t_2 == (Py_UCS4)-1) && PyErr_Occurred())) __PYX_ERR(1, 48, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)((Py_UCS4)__pyx_t_2));
}
/* "cpp_common.pxd":44
/* "cpp_common.pxd":45
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode == 'u': # 'u' wchar_t # <<<<<<<<<<<<<<
@ -6584,7 +6584,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
break;
case 0x68:
/* "cpp_common.pxd":48
/* "cpp_common.pxd":49
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>arr[i]
* elif typecode in {'h', 'H'}: # signed/unsigned short # <<<<<<<<<<<<<<
@ -6593,7 +6593,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
case 72:
/* "cpp_common.pxd":49
/* "cpp_common.pxd":50
* (<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>arr[i]
* elif typecode in {'h', 'H'}: # signed/unsigned short
* s_proc.kind = RAPIDFUZZ_UINT64 # <<<<<<<<<<<<<<
@ -6602,7 +6602,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_UINT64;
/* "cpp_common.pxd":50
/* "cpp_common.pxd":51
* elif typecode in {'h', 'H'}: # signed/unsigned short
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6614,21 +6614,21 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":51
/* "cpp_common.pxd":52
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i] # <<<<<<<<<<<<<<
* elif typecode in {'i', 'I'}: # signed/unsigned int
* s_proc.kind = RAPIDFUZZ_UINT64
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 51, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 52, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 51, __pyx_L4_error)
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 52, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_11);
}
/* "cpp_common.pxd":48
/* "cpp_common.pxd":49
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>arr[i]
* elif typecode in {'h', 'H'}: # signed/unsigned short # <<<<<<<<<<<<<<
@ -6638,7 +6638,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
break;
case 0x69:
/* "cpp_common.pxd":52
/* "cpp_common.pxd":53
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'i', 'I'}: # signed/unsigned int # <<<<<<<<<<<<<<
@ -6647,7 +6647,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
case 73:
/* "cpp_common.pxd":53
/* "cpp_common.pxd":54
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'i', 'I'}: # signed/unsigned int
* s_proc.kind = RAPIDFUZZ_UINT64 # <<<<<<<<<<<<<<
@ -6656,7 +6656,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_UINT64;
/* "cpp_common.pxd":54
/* "cpp_common.pxd":55
* elif typecode in {'i', 'I'}: # signed/unsigned int
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6668,21 +6668,21 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":55
/* "cpp_common.pxd":56
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i] # <<<<<<<<<<<<<<
* elif typecode in {'l', 'L'}: # signed/unsigned long
* s_proc.kind = RAPIDFUZZ_UINT64
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 55, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 56, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 55, __pyx_L4_error)
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 56, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_11);
}
/* "cpp_common.pxd":52
/* "cpp_common.pxd":53
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'i', 'I'}: # signed/unsigned int # <<<<<<<<<<<<<<
@ -6692,7 +6692,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
break;
case 0x6C:
/* "cpp_common.pxd":56
/* "cpp_common.pxd":57
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'l', 'L'}: # signed/unsigned long # <<<<<<<<<<<<<<
@ -6701,7 +6701,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
case 76:
/* "cpp_common.pxd":57
/* "cpp_common.pxd":58
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'l', 'L'}: # signed/unsigned long
* s_proc.kind = RAPIDFUZZ_UINT64 # <<<<<<<<<<<<<<
@ -6710,7 +6710,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_UINT64;
/* "cpp_common.pxd":58
/* "cpp_common.pxd":59
* elif typecode in {'l', 'L'}: # signed/unsigned long
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6722,21 +6722,21 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":59
/* "cpp_common.pxd":60
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i] # <<<<<<<<<<<<<<
* elif typecode in {'q', 'Q'}: # signed/unsigned long long
* s_proc.kind = RAPIDFUZZ_UINT64
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 59, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 60, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 59, __pyx_L4_error)
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 60, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_11);
}
/* "cpp_common.pxd":56
/* "cpp_common.pxd":57
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'l', 'L'}: # signed/unsigned long # <<<<<<<<<<<<<<
@ -6746,7 +6746,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
break;
case 0x71:
/* "cpp_common.pxd":60
/* "cpp_common.pxd":61
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'q', 'Q'}: # signed/unsigned long long # <<<<<<<<<<<<<<
@ -6755,7 +6755,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
case 81:
/* "cpp_common.pxd":61
/* "cpp_common.pxd":62
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'q', 'Q'}: # signed/unsigned long long
* s_proc.kind = RAPIDFUZZ_UINT64 # <<<<<<<<<<<<<<
@ -6764,7 +6764,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_UINT64;
/* "cpp_common.pxd":62
/* "cpp_common.pxd":63
* elif typecode in {'q', 'Q'}: # signed/unsigned long long
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6776,21 +6776,21 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":63
/* "cpp_common.pxd":64
* s_proc.kind = RAPIDFUZZ_UINT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i] # <<<<<<<<<<<<<<
* else: # float/double are hashed
* s_proc.kind = RAPIDFUZZ_INT64
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 63, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 64, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 63, __pyx_L4_error)
__pyx_t_11 = __Pyx_PyInt_As_uint64_t(__pyx_t_1); if (unlikely((__pyx_t_11 == ((uint64_t)-1)) && PyErr_Occurred())) __PYX_ERR(1, 64, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_11);
}
/* "cpp_common.pxd":60
/* "cpp_common.pxd":61
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* elif typecode in {'q', 'Q'}: # signed/unsigned long long # <<<<<<<<<<<<<<
@ -6800,7 +6800,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
break;
default:
/* "cpp_common.pxd":65
/* "cpp_common.pxd":66
* (<uint64_t*>s_proc.data)[i] = <uint64_t>arr[i]
* else: # float/double are hashed
* s_proc.kind = RAPIDFUZZ_INT64 # <<<<<<<<<<<<<<
@ -6809,7 +6809,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_INT64;
/* "cpp_common.pxd":66
/* "cpp_common.pxd":67
* else: # float/double are hashed
* s_proc.kind = RAPIDFUZZ_INT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -6821,23 +6821,23 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
for (__pyx_t_10 = 0; __pyx_t_10 < __pyx_t_9; __pyx_t_10+=1) {
__pyx_v_i = __pyx_t_10;
/* "cpp_common.pxd":67
/* "cpp_common.pxd":68
* s_proc.kind = RAPIDFUZZ_INT64
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(arr[i]) # <<<<<<<<<<<<<<
* except Exception as e:
* free(s_proc.data)
*/
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 67, __pyx_L4_error)
__pyx_t_1 = __Pyx_GetItemInt(__pyx_v_arr, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 68, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_12 = PyObject_Hash(__pyx_t_1); if (unlikely(__pyx_t_12 == ((Py_hash_t)-1))) __PYX_ERR(1, 67, __pyx_L4_error)
__pyx_t_12 = PyObject_Hash(__pyx_t_1); if (unlikely(__pyx_t_12 == ((Py_hash_t)-1))) __PYX_ERR(1, 68, __pyx_L4_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_12);
}
break;
}
/* "cpp_common.pxd":38
/* "cpp_common.pxd":39
* raise MemoryError
*
* try: # <<<<<<<<<<<<<<
@ -6852,7 +6852,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__pyx_L4_error:;
__Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
/* "cpp_common.pxd":68
/* "cpp_common.pxd":69
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(arr[i])
* except Exception as e: # <<<<<<<<<<<<<<
@ -6862,7 +6862,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__pyx_t_13 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
if (__pyx_t_13) {
__Pyx_AddTraceback("cpp_common.hash_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
if (__Pyx_GetException(&__pyx_t_1, &__pyx_t_14, &__pyx_t_15) < 0) __PYX_ERR(1, 68, __pyx_L6_except_error)
if (__Pyx_GetException(&__pyx_t_1, &__pyx_t_14, &__pyx_t_15) < 0) __PYX_ERR(1, 69, __pyx_L6_except_error)
__Pyx_GOTREF(__pyx_t_1);
__Pyx_GOTREF(__pyx_t_14);
__Pyx_GOTREF(__pyx_t_15);
@ -6870,7 +6870,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__pyx_v_e = __pyx_t_14;
/*try:*/ {
/* "cpp_common.pxd":69
/* "cpp_common.pxd":70
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(arr[i])
* except Exception as e:
* free(s_proc.data) # <<<<<<<<<<<<<<
@ -6879,7 +6879,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
free(__pyx_v_s_proc.data);
/* "cpp_common.pxd":70
/* "cpp_common.pxd":71
* except Exception as e:
* free(s_proc.data)
* s_proc.data = NULL # <<<<<<<<<<<<<<
@ -6888,7 +6888,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.data = NULL;
/* "cpp_common.pxd":71
/* "cpp_common.pxd":72
* free(s_proc.data)
* s_proc.data = NULL
* raise # <<<<<<<<<<<<<<
@ -6900,10 +6900,10 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__Pyx_XGIVEREF(__pyx_t_15);
__Pyx_ErrRestoreWithState(__pyx_t_1, __pyx_t_14, __pyx_t_15);
__pyx_t_1 = 0; __pyx_t_14 = 0; __pyx_t_15 = 0;
__PYX_ERR(1, 71, __pyx_L29_error)
__PYX_ERR(1, 72, __pyx_L29_error)
}
/* "cpp_common.pxd":68
/* "cpp_common.pxd":69
* for i in range(s_proc.length):
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(arr[i])
* except Exception as e: # <<<<<<<<<<<<<<
@ -6947,7 +6947,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
goto __pyx_L6_except_error;
__pyx_L6_except_error:;
/* "cpp_common.pxd":38
/* "cpp_common.pxd":39
* raise MemoryError
*
* try: # <<<<<<<<<<<<<<
@ -6962,7 +6962,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__pyx_L9_try_end:;
}
/* "cpp_common.pxd":73
/* "cpp_common.pxd":74
* raise
*
* s_proc.allocated = True # <<<<<<<<<<<<<<
@ -6971,7 +6971,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
*/
__pyx_v_s_proc.allocated = 1;
/* "cpp_common.pxd":74
/* "cpp_common.pxd":75
*
* s_proc.allocated = True
* return move(s_proc) # <<<<<<<<<<<<<<
@ -6981,8 +6981,8 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
__pyx_r = cython_std::move<proc_string>(__pyx_v_s_proc);
goto __pyx_L0;
/* "cpp_common.pxd":27
* void validate_string(object py_str, const char* err) except +
/* "cpp_common.pxd":28
* proc_string default_process_func(proc_string sentence) except +
*
* cdef inline proc_string hash_array(arr) except *: # <<<<<<<<<<<<<<
* # TODO on Cpython this does not require any copies
@ -7002,7 +7002,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_array(PyObject *__pyx
return __pyx_r;
}
/* "cpp_common.pxd":77
/* "cpp_common.pxd":78
*
*
* cdef inline proc_string hash_sequence(seq) except *: # <<<<<<<<<<<<<<
@ -7046,17 +7046,17 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
int __pyx_clineno = 0;
__Pyx_RefNannySetupContext("hash_sequence", 0);
/* "cpp_common.pxd":79
/* "cpp_common.pxd":80
* cdef inline proc_string hash_sequence(seq) except *:
* cdef proc_string s_proc
* s_proc.length = <size_t>len(seq) # <<<<<<<<<<<<<<
*
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t))
*/
__pyx_t_1 = PyObject_Length(__pyx_v_seq); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(1, 79, __pyx_L1_error)
__pyx_t_1 = PyObject_Length(__pyx_v_seq); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(1, 80, __pyx_L1_error)
__pyx_v_s_proc.length = ((size_t)__pyx_t_1);
/* "cpp_common.pxd":81
/* "cpp_common.pxd":82
* s_proc.length = <size_t>len(seq)
*
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t)) # <<<<<<<<<<<<<<
@ -7065,7 +7065,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
*/
__pyx_v_s_proc.data = malloc((__pyx_v_s_proc.length * (sizeof(uint64_t))));
/* "cpp_common.pxd":83
/* "cpp_common.pxd":84
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t))
*
* if s_proc.data == NULL: # <<<<<<<<<<<<<<
@ -7075,16 +7075,16 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_t_2 = ((__pyx_v_s_proc.data == NULL) != 0);
if (unlikely(__pyx_t_2)) {
/* "cpp_common.pxd":84
/* "cpp_common.pxd":85
*
* if s_proc.data == NULL:
* raise MemoryError # <<<<<<<<<<<<<<
*
* try:
*/
PyErr_NoMemory(); __PYX_ERR(1, 84, __pyx_L1_error)
PyErr_NoMemory(); __PYX_ERR(1, 85, __pyx_L1_error)
/* "cpp_common.pxd":83
/* "cpp_common.pxd":84
* s_proc.data = malloc(s_proc.length * sizeof(uint64_t))
*
* if s_proc.data == NULL: # <<<<<<<<<<<<<<
@ -7093,7 +7093,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
*/
}
/* "cpp_common.pxd":86
/* "cpp_common.pxd":87
* raise MemoryError
*
* try: # <<<<<<<<<<<<<<
@ -7109,7 +7109,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__Pyx_XGOTREF(__pyx_t_5);
/*try:*/ {
/* "cpp_common.pxd":87
/* "cpp_common.pxd":88
*
* try:
* s_proc.kind = RAPIDFUZZ_INT64 # <<<<<<<<<<<<<<
@ -7118,7 +7118,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
*/
__pyx_v_s_proc.kind = RAPIDFUZZ_INT64;
/* "cpp_common.pxd":88
/* "cpp_common.pxd":89
* try:
* s_proc.kind = RAPIDFUZZ_INT64
* for i in range(s_proc.length): # <<<<<<<<<<<<<<
@ -7130,19 +7130,19 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) {
__pyx_v_i = __pyx_t_8;
/* "cpp_common.pxd":89
/* "cpp_common.pxd":90
* s_proc.kind = RAPIDFUZZ_INT64
* for i in range(s_proc.length):
* elem = seq[i] # <<<<<<<<<<<<<<
* # this is required so e.g. a list of char can be compared to a string
* if isinstance(elem, str) and len(elem) == 1:
*/
__pyx_t_9 = __Pyx_GetItemInt(__pyx_v_seq, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_9)) __PYX_ERR(1, 89, __pyx_L4_error)
__pyx_t_9 = __Pyx_GetItemInt(__pyx_v_seq, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(!__pyx_t_9)) __PYX_ERR(1, 90, __pyx_L4_error)
__Pyx_GOTREF(__pyx_t_9);
__Pyx_XDECREF_SET(__pyx_v_elem, __pyx_t_9);
__pyx_t_9 = 0;
/* "cpp_common.pxd":91
/* "cpp_common.pxd":92
* elem = seq[i]
* # this is required so e.g. a list of char can be compared to a string
* if isinstance(elem, str) and len(elem) == 1: # <<<<<<<<<<<<<<
@ -7156,23 +7156,23 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_t_2 = __pyx_t_11;
goto __pyx_L13_bool_binop_done;
}
__pyx_t_1 = PyObject_Length(__pyx_v_elem); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(1, 91, __pyx_L4_error)
__pyx_t_1 = PyObject_Length(__pyx_v_elem); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(1, 92, __pyx_L4_error)
__pyx_t_11 = ((__pyx_t_1 == 1) != 0);
__pyx_t_2 = __pyx_t_11;
__pyx_L13_bool_binop_done:;
if (__pyx_t_2) {
/* "cpp_common.pxd":92
/* "cpp_common.pxd":93
* # this is required so e.g. a list of char can be compared to a string
* if isinstance(elem, str) and len(elem) == 1:
* (<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>elem # <<<<<<<<<<<<<<
* else:
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem)
*/
__pyx_t_12 = __Pyx_PyObject_AsPy_UCS4(__pyx_v_elem); if (unlikely((__pyx_t_12 == (Py_UCS4)-1) && PyErr_Occurred())) __PYX_ERR(1, 92, __pyx_L4_error)
__pyx_t_12 = __Pyx_PyObject_AsPy_UCS4(__pyx_v_elem); if (unlikely((__pyx_t_12 == (Py_UCS4)-1) && PyErr_Occurred())) __PYX_ERR(1, 93, __pyx_L4_error)
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)((Py_UCS4)__pyx_t_12));
/* "cpp_common.pxd":91
/* "cpp_common.pxd":92
* elem = seq[i]
* # this is required so e.g. a list of char can be compared to a string
* if isinstance(elem, str) and len(elem) == 1: # <<<<<<<<<<<<<<
@ -7182,7 +7182,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
goto __pyx_L12;
}
/* "cpp_common.pxd":94
/* "cpp_common.pxd":95
* (<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>elem
* else:
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem) # <<<<<<<<<<<<<<
@ -7190,13 +7190,13 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
* free(s_proc.data)
*/
/*else*/ {
__pyx_t_13 = PyObject_Hash(__pyx_v_elem); if (unlikely(__pyx_t_13 == ((Py_hash_t)-1))) __PYX_ERR(1, 94, __pyx_L4_error)
__pyx_t_13 = PyObject_Hash(__pyx_v_elem); if (unlikely(__pyx_t_13 == ((Py_hash_t)-1))) __PYX_ERR(1, 95, __pyx_L4_error)
(((uint64_t *)__pyx_v_s_proc.data)[__pyx_v_i]) = ((uint64_t)__pyx_t_13);
}
__pyx_L12:;
}
/* "cpp_common.pxd":86
/* "cpp_common.pxd":87
* raise MemoryError
*
* try: # <<<<<<<<<<<<<<
@ -7211,7 +7211,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_L4_error:;
__Pyx_XDECREF(__pyx_t_9); __pyx_t_9 = 0;
/* "cpp_common.pxd":95
/* "cpp_common.pxd":96
* else:
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem)
* except Exception as e: # <<<<<<<<<<<<<<
@ -7221,7 +7221,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_t_14 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
if (__pyx_t_14) {
__Pyx_AddTraceback("cpp_common.hash_sequence", __pyx_clineno, __pyx_lineno, __pyx_filename);
if (__Pyx_GetException(&__pyx_t_9, &__pyx_t_15, &__pyx_t_16) < 0) __PYX_ERR(1, 95, __pyx_L6_except_error)
if (__Pyx_GetException(&__pyx_t_9, &__pyx_t_15, &__pyx_t_16) < 0) __PYX_ERR(1, 96, __pyx_L6_except_error)
__Pyx_GOTREF(__pyx_t_9);
__Pyx_GOTREF(__pyx_t_15);
__Pyx_GOTREF(__pyx_t_16);
@ -7229,7 +7229,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_v_e = __pyx_t_15;
/*try:*/ {
/* "cpp_common.pxd":96
/* "cpp_common.pxd":97
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem)
* except Exception as e:
* free(s_proc.data) # <<<<<<<<<<<<<<
@ -7238,7 +7238,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
*/
free(__pyx_v_s_proc.data);
/* "cpp_common.pxd":97
/* "cpp_common.pxd":98
* except Exception as e:
* free(s_proc.data)
* s_proc.data = NULL # <<<<<<<<<<<<<<
@ -7247,7 +7247,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
*/
__pyx_v_s_proc.data = NULL;
/* "cpp_common.pxd":98
/* "cpp_common.pxd":99
* free(s_proc.data)
* s_proc.data = NULL
* raise # <<<<<<<<<<<<<<
@ -7259,10 +7259,10 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__Pyx_XGIVEREF(__pyx_t_16);
__Pyx_ErrRestoreWithState(__pyx_t_9, __pyx_t_15, __pyx_t_16);
__pyx_t_9 = 0; __pyx_t_15 = 0; __pyx_t_16 = 0;
__PYX_ERR(1, 98, __pyx_L20_error)
__PYX_ERR(1, 99, __pyx_L20_error)
}
/* "cpp_common.pxd":95
/* "cpp_common.pxd":96
* else:
* (<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem)
* except Exception as e: # <<<<<<<<<<<<<<
@ -7306,7 +7306,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
goto __pyx_L6_except_error;
__pyx_L6_except_error:;
/* "cpp_common.pxd":86
/* "cpp_common.pxd":87
* raise MemoryError
*
* try: # <<<<<<<<<<<<<<
@ -7321,7 +7321,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_L9_try_end:;
}
/* "cpp_common.pxd":100
/* "cpp_common.pxd":101
* raise
*
* s_proc.allocated = True # <<<<<<<<<<<<<<
@ -7329,7 +7329,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
*/
__pyx_v_s_proc.allocated = 1;
/* "cpp_common.pxd":101
/* "cpp_common.pxd":102
*
* s_proc.allocated = True
* return move(s_proc) # <<<<<<<<<<<<<<
@ -7337,7 +7337,7 @@ static CYTHON_INLINE proc_string __pyx_f_10cpp_common_hash_sequence(PyObject *__
__pyx_r = cython_std::move<proc_string>(__pyx_v_s_proc);
goto __pyx_L0;
/* "cpp_common.pxd":77
/* "cpp_common.pxd":78
*
*
* cdef inline proc_string hash_sequence(seq) except *: # <<<<<<<<<<<<<<
@ -7463,8 +7463,8 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = {
};
/* #### Code section: cached_builtins ### */
static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) {
__pyx_builtin_MemoryError = __Pyx_GetBuiltinName(__pyx_n_s_MemoryError); if (!__pyx_builtin_MemoryError) __PYX_ERR(1, 36, __pyx_L1_error)
__pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(1, 42, __pyx_L1_error)
__pyx_builtin_MemoryError = __Pyx_GetBuiltinName(__pyx_n_s_MemoryError); if (!__pyx_builtin_MemoryError) __PYX_ERR(1, 37, __pyx_L1_error)
__pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(1, 43, __pyx_L1_error)
return 0;
__pyx_L1_error:;
return -1;
@ -8252,7 +8252,7 @@ if (!__Pyx_RefNanny) {
if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
/* "cpp_common.pxd":77
/* "cpp_common.pxd":78
*
*
* cdef inline proc_string hash_sequence(seq) except *: # <<<<<<<<<<<<<<

View File

@ -42,9 +42,9 @@ def ratio(s1, s2, *, processor=None, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -94,9 +94,9 @@ def partial_ratio(s1, s2, *, processor=None, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -173,9 +173,9 @@ def token_sort_ratio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -221,9 +221,9 @@ def token_set_ratio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -271,9 +271,9 @@ def token_ratio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -313,9 +313,9 @@ def partial_token_sort_ratio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -356,9 +356,9 @@ def partial_token_set_ratio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -399,9 +399,9 @@ def partial_token_ratio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -441,9 +441,9 @@ def WRatio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -485,9 +485,9 @@ def QRatio(s1, s2, *, processor=True, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before

14237
src/cpp_process.cpp vendored

File diff suppressed because one or more lines are too long

View File

@ -28,16 +28,21 @@ from rapidfuzz.fuzz import (
from libcpp.vector cimport vector
from libcpp cimport algorithm
from libcpp.utility cimport move
from libc.stdint cimport uint8_t, int32_t
from libc.math cimport floor
from cpython.list cimport PyList_New, PyList_SET_ITEM
from cpython.object cimport PyObject
from cpython.ref cimport Py_INCREF, Py_DECREF
from cpp_common cimport proc_string, is_valid_string, convert_string, hash_array, hash_sequence
from cpp_common cimport proc_string, is_valid_string, convert_string, hash_array, hash_sequence, default_process_func
import heapq
from array import array
import numpy as np
cimport numpy as np
cimport cython
cdef inline proc_string conv_sequence(seq) except *:
if is_valid_string(seq):
@ -50,15 +55,15 @@ cdef inline proc_string conv_sequence(seq) except *:
cdef extern from "cpp_process.hpp":
cdef cppclass CachedScorerContext:
CachedScorerContext()
double ratio(const proc_string&, double) except +
double ratio(const proc_string&, double) nogil except +
cdef cppclass CachedDistanceContext:
CachedDistanceContext()
size_t ratio(const proc_string&, size_t) except +
size_t ratio(const proc_string&, size_t) nogil except +
# normalized distances
# fuzz
CachedScorerContext cached_ratio_init( const proc_string&, int) except +
CachedScorerContext cached_ratio_init( const proc_string&, int) nogil except +
CachedScorerContext cached_partial_ratio_init( const proc_string&, int) except +
CachedScorerContext cached_token_sort_ratio_init( const proc_string&, int) except +
CachedScorerContext cached_token_set_ratio_init( const proc_string&, int) except +
@ -226,7 +231,7 @@ cdef inline extractOne_dict(CachedScorerContext context, choices, processor, dou
for choice_key, choice in choices.items():
if choice is None:
continue
score = context.ratio(conv_sequence(choice), score_cutoff)
if score >= score_cutoff and score > result_score:
@ -400,8 +405,8 @@ cdef inline py_extractOne_dict(query, choices, scorer, processor, double score_c
score = scorer(query, processor(choice), **kwargs)
if score >= score_cutoff and score > result_score:
kwargs["score_cutoff"] = score_cutoff
score_cutoff = score
kwargs["score_cutoff"] = score
result_score = score
result_choice = choice
result_key = choice_key
@ -416,8 +421,8 @@ cdef inline py_extractOne_dict(query, choices, scorer, processor, double score_c
score = scorer(query, choice, **kwargs)
if score >= score_cutoff and score > result_score:
kwargs["score_cutoff"] = score_cutoff
score_cutoff = score
kwargs["score_cutoff"] = score
result_score = score
result_choice = choice
result_key = choice_key
@ -445,8 +450,8 @@ cdef inline py_extractOne_list(query, choices, scorer, processor, double score_c
score = scorer(query, processor(choice), **kwargs)
if score >= score_cutoff and score > result_score:
kwargs["score_cutoff"] = score_cutoff
score_cutoff = score
kwargs["score_cutoff"] = score
result_score = score
result_choice = choice
result_index = i
@ -461,8 +466,8 @@ cdef inline py_extractOne_list(query, choices, scorer, processor, double score_c
score = scorer(query, choice, **kwargs)
if score >= score_cutoff and score > result_score:
kwargs["score_cutoff"] = score_cutoff
score_cutoff = score
kwargs["score_cutoff"] = score
result_score = score
result_choice = choice
result_index = i
@ -480,9 +485,9 @@ def extractOne(query, choices, *, scorer=WRatio, processor=default_process, scor
Parameters
----------
query : str
query : Sequence[Hashable]
string we want to find
choices : Iterable
choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
list of all strings the query should be compared with or dict with a mapping
{<result>: <string to compare>}
scorer : Callable, optional
@ -506,7 +511,7 @@ def extractOne(query, choices, *, scorer=WRatio, processor=default_process, scor
Returns
-------
Tuple[str, Any, Any]
Tuple[Sequence[Hashable], Any, Any]
Returns the best match in form of a Tuple with 3 elements. The values stored in the
tuple depend on the types of the input arguments.
@ -634,7 +639,7 @@ def extractOne(query, choices, *, scorer=WRatio, processor=default_process, scor
return extractOne_dict(move(ScorerContext), choices, processor, c_score_cutoff)
else:
return extractOne_list(move(ScorerContext), choices, processor, c_score_cutoff)
if IsIntegratedDistance(scorer):
# distance implemented in C++
query_context = conv_sequence(query)
@ -804,13 +809,13 @@ cdef inline extract_list(CachedScorerContext context, choices, processor, size_t
for i, choice in enumerate(choices):
if choice is None:
continue
proc_choice = processor(choice)
if proc_choice is None:
continue
score = context.ratio(conv_sequence(proc_choice), score_cutoff)
if score >= score_cutoff:
Py_INCREF(choice)
results.push_back(ListMatchScorerElem(score, i, <PyObject*>choice))
@ -818,23 +823,23 @@ cdef inline extract_list(CachedScorerContext context, choices, processor, size_t
for i, choice in enumerate(choices):
if choice is None:
continue
score = context.ratio(conv_sequence(choice), score_cutoff)
if score >= score_cutoff:
Py_INCREF(choice)
results.push_back(ListMatchScorerElem(score, i, <PyObject*>choice))
# due to score_cutoff not always completely filled
if limit > results.size():
limit = results.size()
if limit >= results.size():
algorithm.sort(results.begin(), results.end(), ExtractScorerComp())
else:
algorithm.partial_sort(results.begin(), results.begin() + <ptrdiff_t>limit, results.end(), ExtractScorerComp())
results.resize(limit)
# copy elements into Python List
result_list = PyList_New(<Py_ssize_t>limit)
for i in range(limit):
@ -863,13 +868,13 @@ cdef inline extract_distance_list(CachedDistanceContext context, choices, proces
for i, choice in enumerate(choices):
if choice is None:
continue
proc_choice = processor(choice)
if proc_choice is None:
continue
distance = context.ratio(conv_sequence(proc_choice), max_)
if distance <= max_:
Py_INCREF(choice)
results.push_back(ListMatchDistanceElem(distance, i, <PyObject*>choice))
@ -877,23 +882,23 @@ cdef inline extract_distance_list(CachedDistanceContext context, choices, proces
for i, choice in enumerate(choices):
if choice is None:
continue
distance = context.ratio(conv_sequence(choice), max_)
if distance <= max_:
Py_INCREF(choice)
results.push_back(ListMatchDistanceElem(distance, i, <PyObject*>choice))
# due to max_ not always completely filled
if limit > results.size():
limit = results.size()
if limit >= results.size():
algorithm.sort(results.begin(), results.end(), ExtractDistanceComp())
else:
algorithm.partial_sort(results.begin(), results.begin() + <ptrdiff_t>limit, results.end(), ExtractDistanceComp())
results.resize(limit)
# copy elements into Python List
result_list = PyList_New(<Py_ssize_t>limit)
for i in range(limit):
@ -980,9 +985,9 @@ def extract(query, choices, *, scorer=WRatio, processor=default_process, limit=5
Parameters
----------
query : str
query : Sequence[Hashable]
string we want to find
choices : Iterable
choices : Collection[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
list of all strings the query should be compared with or dict with a mapping
{<result>: <string to compare>}
scorer : Callable, optional
@ -1008,7 +1013,7 @@ def extract(query, choices, *, scorer=WRatio, processor=default_process, limit=5
Returns
-------
List[Tuple[str, Any, Any]]
List[Tuple[Sequence[Hashable], Any, Any]]
The return type is always a List of Tuples with 3 elements. However the values stored in the
tuple depend on the types of the input arguments.
@ -1107,9 +1112,9 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc
Parameters
----------
query : str
query : Sequence[Hashable]
string we want to find
choices : Iterable
choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
list of all strings the query should be compared with or dict with a mapping
{<result>: <string to compare>}
scorer : Callable, optional
@ -1133,7 +1138,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc
Yields
-------
Tuple[str, Any, Any]
Tuple[Sequence[Hashable], Any, Any]
Yields similarity between the query and each choice in form of a Tuple with 3 elements.
The values stored in the tuple depend on the types of the input arguments.
@ -1408,3 +1413,325 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc
yield from py_extract_iter_dict()
else:
yield from py_extract_iter_list()
@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline cdist_two_lists_similarity(
    const vector[proc_string]& queries,
    const vector[proc_string]& choices,
    scorer, score_cutoff, dict kwargs
):
    """
    Score every query against every choice with an integrated similarity scorer.

    Returns a 2-D ``uint8`` array of shape ``(queries.size(), choices.size())``
    holding the floored score of each (query, choice) pair.

    Raises ``TypeError`` when ``score_cutoff`` lies outside ``0 - 100``.
    """
    cdef size_t row_count = queries.size()
    cdef size_t col_count = choices.size()
    cdef size_t row, col
    cdef double min_score = 0
    cdef np.ndarray[np.uint8_t, ndim=2] scores = np.empty((row_count, col_count), dtype=np.uint8)

    if score_cutoff is not None:
        min_score = score_cutoff

    if min_score > 100 or min_score < 0:
        raise TypeError("score_cutoff has to be in the range of 0.0 - 100.0")

    # results are stored as whole numbers, so the threshold is floored as well
    min_score = floor(min_score)

    for row in range(row_count):
        # one cached scorer per query, reused for the whole row
        cached_scorer = CachedScorerInit(scorer, queries[row], 0, kwargs)
        for col in range(col_count):
            scores[row, col] = <uint8_t>floor(cached_scorer.ratio(choices[col], min_score))

    return scores
@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline cdist_two_lists_distance(
    const vector[proc_string]& queries, const vector[proc_string]& choices,
    scorer, score_cutoff, dict kwargs
):
    """
    Compute the distance of every query to every choice with an integrated
    distance scorer.

    Returns a 2-D ``int32`` array of shape ``(queries.size(), choices.size())``.
    ``score_cutoff`` is the maximum distance handed to the scorer; ``None`` and
    ``-1`` both mean "no limit".
    """
    cdef size_t row_count = queries.size()
    cdef size_t col_count = choices.size()
    cdef size_t row, col
    # <size_t>-1 is the largest representable value, i.e. no upper bound
    cdef size_t max_dist = <size_t>-1
    cdef np.ndarray[np.int32_t, ndim=2] distances = np.empty((row_count, col_count), dtype=np.int32)

    if score_cutoff is not None and score_cutoff != -1:
        max_dist = score_cutoff

    for row in range(row_count):
        # one cached distance context per query, reused for the whole row
        cached_distance = CachedDistanceInit(scorer, queries[row], 0, kwargs)
        for col in range(col_count):
            distances[row, col] = <int32_t>cached_distance.ratio(choices[col], max_dist)

    return distances
@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline py_cdist_two_lists(
    const vector[PyObject*]& queries, const vector[PyObject*]& choices,
    scorer, score_cutoff, dict kwargs
):
    """
    Score every query against every choice by calling ``scorer`` as a Python
    callable.

    The inputs are expected to be preprocessed already, which is why the scorer
    is always invoked with ``processor=None``. Returns a 2-D ``uint8`` array of
    shape ``(queries.size(), choices.size())`` with the floored scores.

    Raises ``TypeError`` when ``score_cutoff`` lies outside ``0 - 100``.
    """
    cdef size_t row_count = queries.size()
    cdef size_t col_count = choices.size()
    cdef size_t row, col
    cdef double min_score = 0
    cdef np.ndarray[np.uint8_t, ndim=2] scores = np.empty((row_count, col_count), dtype=np.uint8)

    if score_cutoff is not None:
        min_score = score_cutoff

    if min_score > 100 or min_score < 0:
        raise TypeError("score_cutoff has to be in the range of 0.0 - 100.0")

    min_score = floor(min_score)

    # forwarded to every scorer call below
    kwargs["processor"] = None
    kwargs["score_cutoff"] = min_score

    for row in range(row_count):
        query_obj = <object>queries[row]
        for col in range(col_count):
            scores[row, col] = <uint8_t>floor(
                <double>scorer(query_obj, <object>choices[col], **kwargs))

    return scores
cdef cdist_two_lists(queries, choices, scorer, processor, score_cutoff, dict kwargs):
    """
    Preprocess ``queries`` and ``choices`` and dispatch to the matching
    pairwise kernel.

    Integrated scorers/distances work on converted ``proc_string`` sequences;
    any other scorer is called through Python on the (optionally processed)
    objects. Every Python object stored as a raw ``PyObject*`` is INCREF'ed when
    stored and released again in the ``finally`` clause.
    """
    cdef vector[proc_string] query_seqs
    cdef vector[proc_string] choice_seqs
    cdef vector[PyObject*] query_refs
    cdef vector[PyObject*] choice_refs
    cdef size_t n_queries = <size_t>len(queries)
    cdef size_t n_choices = <size_t>len(choices)

    try:
        if not (IsIntegratedScorer(scorer) or IsIntegratedDistance(scorer)):
            # scorer is an arbitrary Python callable
            query_refs.reserve(n_queries)
            choice_refs.reserve(n_choices)

            if not processor:
                # processor None/False: score the raw objects
                for raw in queries:
                    Py_INCREF(raw)
                    query_refs.push_back(<PyObject*>raw)

                for raw in choices:
                    Py_INCREF(raw)
                    choice_refs.push_back(<PyObject*>raw)
            else:
                # processor has to be called through python
                if not callable(processor):
                    processor = default_process

                for raw in queries:
                    processed = processor(raw)
                    Py_INCREF(processed)
                    query_refs.push_back(<PyObject*>processed)

                for raw in choices:
                    processed = processor(raw)
                    Py_INCREF(processed)
                    choice_refs.push_back(<PyObject*>processed)

            return py_cdist_two_lists(query_refs, choice_refs, scorer, score_cutoff, kwargs)

        # scorer is implemented natively: convert everything up front
        query_seqs.reserve(n_queries)
        choice_seqs.reserve(n_choices)

        if not processor:
            # processor None/False
            for raw in queries:
                query_seqs.push_back(move(conv_sequence(raw)))

            for raw in choices:
                choice_seqs.push_back(move(conv_sequence(raw)))
        elif processor is not default_process and callable(processor):
            # custom processor has to be called through python; the processed
            # objects are kept referenced until the finally clause runs
            query_refs.reserve(n_queries)
            for raw in queries:
                processed = processor(raw)
                Py_INCREF(processed)
                query_refs.push_back(<PyObject*>processed)
                query_seqs.push_back(move(conv_sequence(processed)))

            choice_refs.reserve(n_choices)
            for raw in choices:
                processed = processor(raw)
                Py_INCREF(processed)
                choice_refs.push_back(<PyObject*>processed)
                choice_seqs.push_back(move(conv_sequence(processed)))
        else:
            # processor is True / default_process
            for raw in queries:
                query_seqs.push_back(
                    move(default_process_func(move(conv_sequence(raw))))
                )

            for raw in choices:
                choice_seqs.push_back(
                    move(default_process_func(move(conv_sequence(raw))))
                )

        if IsIntegratedScorer(scorer):
            return cdist_two_lists_similarity(query_seqs, choice_seqs, scorer, score_cutoff, kwargs)

        if IsIntegratedDistance(scorer):
            return cdist_two_lists_distance(query_seqs, choice_seqs, scorer, score_cutoff, kwargs)
    finally:
        # decref all reference counts
        for ref in query_refs:
            Py_DECREF(<object>ref)

        for ref in choice_refs:
            Py_DECREF(<object>ref)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline cdist_single_list_similarity(
    const vector[proc_string]& queries, scorer, score_cutoff, dict kwargs
):
    """
    Score every element of ``queries`` against every other element with an
    integrated similarity scorer.

    Only pairs with ``j > i`` are scored; each result is mirrored into both
    ``[i, j]`` and ``[j, i]`` and the diagonal is set to 100 without calling
    the scorer. Returns a square 2-D ``uint8`` array.

    Raises ``TypeError`` when ``score_cutoff`` lies outside ``0 - 100``.
    """
    cdef size_t count = queries.size()
    cdef size_t row, col
    cdef uint8_t pair_score
    cdef double min_score = 0
    cdef np.ndarray[np.uint8_t, ndim=2] scores = np.empty((count, count), dtype=np.uint8)

    if score_cutoff is not None:
        min_score = score_cutoff

    if min_score > 100 or min_score < 0:
        raise TypeError("score_cutoff has to be in the range of 0.0 - 100.0")

    min_score = floor(min_score)

    for row in range(count):
        scores[row, row] = 100
        cached_scorer = CachedScorerInit(scorer, queries[row], 0, kwargs)
        for col in range(row + 1, count):
            pair_score = <uint8_t>floor(cached_scorer.ratio(queries[col], min_score))
            # mirror the value across the diagonal
            scores[col, row] = pair_score
            scores[row, col] = pair_score

    return scores
@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline cdist_single_list_distance(
    const vector[proc_string]& queries, scorer, score_cutoff, dict kwargs
):
    """
    Compute the distance between every pair of elements of ``queries`` with an
    integrated distance scorer.

    Only pairs with ``j > i`` are computed; each result is mirrored into both
    ``[i, j]`` and ``[j, i]`` and the diagonal is set to 0 without calling the
    scorer. Returns a square 2-D ``int32`` array. ``score_cutoff`` is the
    maximum distance handed to the scorer; ``None`` and ``-1`` both mean
    "no limit".
    """
    cdef size_t count = queries.size()
    cdef size_t row, col
    cdef int32_t pair_distance
    # <size_t>-1 is the largest representable value, i.e. no upper bound
    cdef size_t max_dist = <size_t>-1
    cdef np.ndarray[np.int32_t, ndim=2] distances = np.empty((count, count), dtype=np.int32)

    if score_cutoff is not None and score_cutoff != -1:
        max_dist = score_cutoff

    for row in range(count):
        distances[row, row] = 0
        cached_distance = CachedDistanceInit(scorer, queries[row], 0, kwargs)
        for col in range(row + 1, count):
            pair_distance = <int32_t>cached_distance.ratio(queries[col], max_dist)
            # mirror the value across the diagonal
            distances[col, row] = pair_distance
            distances[row, col] = pair_distance

    return distances
cdef cdist_single_list(queries, scorer, processor, score_cutoff, dict kwargs):
    """
    Compare every element of ``queries`` with every other element of the same
    collection.

    Integrated scorers/distances work on converted ``proc_string`` sequences;
    any other scorer is called through Python. The mirrored kernels
    (``cdist_single_list_similarity`` / ``cdist_single_list_distance``) only
    compute one triangle of the matrix and therefore require
    ``scorer(a, b) == scorer(b, a)``. They are used only when no custom
    ``weights`` were passed; otherwise the full pairwise kernels are used so
    that both triangles are computed independently.

    Every Python object stored as a raw ``PyObject*`` is INCREF'ed when stored
    and released again in the ``finally`` clause.
    """
    cdef size_t queries_len = <size_t>len(queries)
    cdef vector[proc_string] proc_queries
    cdef vector[PyObject*] proc_py_queries
    # Custom weights may assign different costs to the individual edit
    # operations, in which case scorer(a, b) can differ from scorer(b, a).
    # Be conservative: any explicitly supplied weights disable the mirrored
    # shortcut (at worst this costs a redundant half of the matrix).
    cdef bint assume_symmetric = kwargs.get("weights") is None

    try:
        if IsIntegratedScorer(scorer) or IsIntegratedDistance(scorer):
            proc_queries.reserve(queries_len)

            # processor None/False
            if not processor:
                for query in queries:
                    proc_queries.push_back(move(conv_sequence(query)))
            # processor has to be called through python
            elif processor is not default_process and callable(processor):
                proc_py_queries.reserve(queries_len)
                for query in queries:
                    proc_query = processor(query)
                    Py_INCREF(proc_query)
                    proc_py_queries.push_back(<PyObject*>proc_query)
                    proc_queries.push_back(move(conv_sequence(proc_query)))
            # processor is True / default_process
            else:
                for query in queries:
                    proc_queries.push_back(
                        move(default_process_func(move(conv_sequence(query))))
                    )

            if IsIntegratedScorer(scorer):
                if assume_symmetric:
                    return cdist_single_list_similarity(proc_queries, scorer, score_cutoff, kwargs)
                # scorer(a, b) might not be equal to scorer(b, a)
                return cdist_two_lists_similarity(proc_queries, proc_queries, scorer, score_cutoff, kwargs)

            if IsIntegratedDistance(scorer):
                if assume_symmetric:
                    return cdist_single_list_distance(proc_queries, scorer, score_cutoff, kwargs)
                # scorer(a, b) might not be equal to scorer(b, a)
                return cdist_two_lists_distance(proc_queries, proc_queries, scorer, score_cutoff, kwargs)
        else:
            proc_py_queries.reserve(queries_len)

            # processor None/False
            if not processor:
                for query in queries:
                    Py_INCREF(query)
                    proc_py_queries.push_back(<PyObject*>query)
            # processor has to be called through python
            else:
                if not callable(processor):
                    processor = default_process

                for query in queries:
                    proc_query = processor(query)
                    Py_INCREF(proc_query)
                    proc_py_queries.push_back(<PyObject*>proc_query)

            # scorer(a, b) might not be equal to scorer(b, a)
            return py_cdist_two_lists(proc_py_queries, proc_py_queries, scorer, score_cutoff, kwargs)
    finally:
        # decref all reference counts
        for item in proc_py_queries:
            Py_DECREF(<object>item)
def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, **kwargs):
    """
    Compute the distance/similarity between each pair of the two collections of inputs.

    Parameters
    ----------
    queries : Collection[Sequence[Hashable]]
        collection of all strings that are used as queries
    choices : Collection[Sequence[Hashable]]
        collection of all strings each query should be compared with
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be any of the scorers included in RapidFuzz
        (both scorers that calculate the edit distance or the normalized edit distance).
        fuzz.ratio is used by default.
        NOTE(review): earlier wording stated that custom functions are not
        supported so far, while cdist_single_list forwards scorers it does not
        recognise to a generic Python-level helper - confirm which is intended.
    processor : Callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. When processor is True ``utils.default_process``
        is used. Default is None, which deactivates this behaviour.
    score_cutoff : Any, optional
        Optional argument for a score threshold. When an edit distance is used
        this represents the maximum edit distance, when a normalized edit
        distance is used it represents the minimal similarity. Pairs that do
        not satisfy the threshold are reported with a placeholder value
        (-1 for edit distances, 0 for similarities).
        NOTE(review): earlier wording described the comparison the other way
        round (``distance <= score_cutoff`` inserted as -1) - confirm against
        the scorer implementations.
        Default is None, which deactivates this behaviour.
    **kwargs : Any, optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to string_metric.levenshtein

    Returns
    -------
    matrix
        Result of the selected helper: a matrix with one row per query and one
        column per choice, where element ``[i, j]`` holds the score of
        ``queries[i]`` compared with ``choices[j]``.
        NOTE(review): presumably a numpy.ndarray - the test-suite compares it
        element-wise against NumPy reference matrices; confirm the dtype.
    """
    # Identity (not equality) decides: only when the very same object is passed
    # for both arguments is the single-collection implementation used.
    if queries is not choices:
        return cdist_two_lists(queries, choices, scorer, processor, score_cutoff, kwargs)

    return cdist_single_list(queries, scorer, processor, score_cutoff, kwargs)

File diff suppressed because one or more lines are too long

View File

@ -55,9 +55,9 @@ def levenshtein(s1, s2, *, weights=(1,1,1), processor=None, max=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
weights : Tuple[int, int, int] or None, optional
The weights for the three operations in the form
@ -259,9 +259,9 @@ def levenshtein_editops(s1, s2, *, processor=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -301,9 +301,9 @@ def normalized_levenshtein(s1, s2, *, weights=(1,1,1), processor=None, score_cut
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
weights : Tuple[int, int, int] or None, optional
The weights for the three operations in the form
@ -407,9 +407,9 @@ def hamming(s1, s2, *, processor=None, max=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -451,9 +451,9 @@ def normalized_hamming(s1, s2, *, processor=None, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -498,9 +498,9 @@ def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None):
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
processor: bool or callable, optional
Optional callable that is used to preprocess the strings before
@ -537,9 +537,9 @@ def jaro_winkler_similarity(s1, s2, *, double prefix_weight=0.1, processor=None,
Parameters
----------
s1 : str
s1 : Sequence[Hashable]
First string to compare.
s2 : str
s2 : Sequence[Hashable]
Second string to compare.
prefix_weight : float, optional
Weight used for the common prefix of the two strings.

View File

@ -1,4 +1,4 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2021 Max Bachmann
from rapidfuzz.cpp_process import extract, extractOne, extract_iter
from rapidfuzz.cpp_process import extract, extractOne, extract_iter, cdist

View File

@ -9,6 +9,7 @@ import pytest
from rapidfuzz import fuzz, process, utils, string_metric
import random
from math import isclose
import numpy as np
def levenshtein(s1, s2, weights=(1, 1, 1)):
"""
@ -79,6 +80,24 @@ def partial_ratio_short_needle(s1, s2):
res = max(res, fuzz.ratio(s1, part))
return res
def cdist_scorer(queries, choices, scorer):
    """
    Reference implementation of cdist for similarity scorers.

    Calls ``scorer(query, choice)`` for every pair and stores the result in a
    ``len(queries) x len(choices)`` matrix of dtype ``uint8``.
    """
    shape = (len(queries), len(choices))
    expected = np.zeros(shape, dtype=np.uint8)

    for row, query in enumerate(queries):
        # basic indexing yields a view, so writes land in `expected`
        row_scores = expected[row]
        for col, choice in enumerate(choices):
            row_scores[col] = scorer(query, choice)

    return expected
def cdist_distance(queries, choices, scorer):
    """
    Reference implementation of cdist for edit distances.

    Calls ``scorer(query, choice)`` for every pair and stores the result in a
    ``len(queries) x len(choices)`` matrix of dtype ``int32``.
    """
    n_rows, n_cols = len(queries), len(choices)
    table = np.zeros((n_rows, n_cols), dtype=np.int32)

    for row, query in enumerate(queries):
        for col, choice in enumerate(choices):
            # table[row] is a view, so the chained index writes into `table`
            table[row][col] = scorer(query, choice)

    return table
def extractOne_scorer(s1, s2, scorer, processor=None, **kwargs):
    """
    Score ``s1`` against ``s2`` by running process.extractOne on a one-element
    choice list and returning the element at index 1 of its result.
    """
    best_match = process.extractOne(s1, [s2], processor=processor, scorer=scorer, **kwargs)
    return best_match[1]
@ -294,3 +313,19 @@ def test_only_identical_strings_extracted(scorer, processor, choices):
for match in matches:
assert processor(query) == processor(match[0])
@given(queries=st.lists(st.text(), min_size=1), choices=st.lists(st.text(), min_size=1))
@settings(max_examples=500, deadline=5000)
def test_cdist(queries, choices):
    """
    Test that cdist returns correct results
    """
    scorer = string_metric.levenshtein

    # first compare against a different collection, then pass the very same
    # list object for both arguments
    for targets in (choices, queries):
        expected = cdist_distance(queries, targets, scorer=scorer)
        actual = process.cdist(queries, targets, scorer=scorer)
        assert (actual == expected).all()