start replacing pybind11 with the python C API

This commit is contained in:
maxbachmann 2020-03-31 19:42:48 +02:00
parent 028db547d1
commit 7ee4808cf9
No known key found for this signature in database
GPG Key ID: 60334E83C23820B8
15 changed files with 752 additions and 730 deletions

View File

@ -8,82 +8,50 @@
#include <iterator>
percent fuzz::partial_ratio(std::wstring s1, std::wstring s2, percent score_cutoff, bool preprocess) {
if (score_cutoff >= 100) {
percent fuzz::partial_ratio(std::wstring_view s1, std::wstring_view s2, percent score_cutoff) {
if (s1.empty() || s2.empty() || score_cutoff > 100) {
return 0;
}
if (preprocess) {
s1 = utils::default_process(std::move(s1));
s2 = utils::default_process(std::move(s2));
}
if (s1.empty() || s2.empty()) {
return 0;
}
std::wstring_view shorter;
std::wstring_view longer;
if (s1.length() > s2.length()) {
shorter = s2;
longer = s1;
} else {
shorter = s1;
longer = s2;
std::swap(s1, s2);
}
auto blocks = levenshtein::matching_blocks(shorter, longer);
auto blocks = levenshtein::matching_blocks(s1, s2);
float max_ratio = 0;
for (const auto &block : blocks) {
std::size_t long_start = (block.second_start > block.first_start) ? block.second_start - block.first_start : 0;
std::wstring_view long_substr = longer.substr(long_start, shorter.length());
std::wstring_view long_substr = s2.substr(long_start, s1.length());
float ls_ratio = levenshtein::normalized_weighted_distance(shorter, long_substr, score_cutoff / 100);
float ls_ratio = levenshtein::normalized_weighted_distance(s1, long_substr, score_cutoff / 100);
if (ls_ratio > 0.995) {
return 100;
}
return 100;
}
if (ls_ratio > max_ratio) {
max_ratio = ls_ratio;
}
max_ratio = ls_ratio;
}
}
return utils::result_cutoff(max_ratio*100, score_cutoff);
}
percent fuzz::ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
float result;
if (preprocess) {
result = levenshtein::normalized_weighted_distance(
utils::default_process(s1), utils::default_process(s2), score_cutoff / 100);
} else {
result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
}
percent fuzz::ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
float result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
return utils::result_cutoff(result*100, score_cutoff);
}
percent fuzz::token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
if (score_cutoff >= 100) {
percent fuzz::token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
if (score_cutoff > 100) {
return 0;
}
std::wstring a;
std::wstring b;
if (preprocess) {
a = utils::default_process(s1);
b = utils::default_process(s2);
} else {
a = s1;
b = s2;
}
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
@ -132,24 +100,14 @@ percent fuzz::token_ratio(const std::wstring &s1, const std::wstring &s2, percen
// combines token_set and token_sort ratio from fuzzywuzzy so it is only required to
// do a lot of operations once
percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
if (score_cutoff >= 100) {
percent fuzz::partial_token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
if (score_cutoff > 100) {
return 0;
}
std::wstring a;
std::wstring b;
if (preprocess) {
a = utils::default_process(s1);
b = utils::default_process(s2);
} else {
a = s1;
b = s2;
}
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
auto unique_a = tokens_a;
@ -170,7 +128,7 @@ percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2
return 100;
}
percent result = partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff, false);
percent result = partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff);
// do not calculate the same partial_ratio twice
if (tokens_a.size() == unique_a.size() && tokens_b.size() == unique_b.size()) {
return result;
@ -179,33 +137,23 @@ percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2
score_cutoff = std::max(score_cutoff, result);
return std::max(
result,
partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff, false)
partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff)
);
}
percent _token_sort(const std::wstring &s1, const std::wstring &s2, bool partial, percent score_cutoff=0.0, bool preprocess = true) {
if (score_cutoff >= 100) {
percent _token_sort(const std::wstring_view &s1, const std::wstring_view &s2, bool partial, percent score_cutoff=0.0) {
if (score_cutoff > 100) {
return 0;
}
std::wstring a;
std::wstring b;
if (preprocess) {
a = utils::default_process(s1);
b = utils::default_process(s2);
} else {
a = s1;
b = s2;
}
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
if (partial) {
return fuzz::partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff, false);
return fuzz::partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff);
} else {
float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100);
return utils::result_cutoff(result*100, score_cutoff);
@ -213,34 +161,24 @@ percent _token_sort(const std::wstring &s1, const std::wstring &s2, bool partial
}
percent fuzz::token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff, bool preprocess) {
percent fuzz::token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff) {
return _token_sort(a, b, false, score_cutoff);
}
percent fuzz::partial_token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff, bool preprocess) {
percent fuzz::partial_token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff) {
return _token_sort(a, b, true, score_cutoff);
}
percent fuzz::token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
if (score_cutoff >= 100) {
percent fuzz::token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
if (score_cutoff > 100) {
return 0;
}
std::wstring a;
std::wstring b;
if (preprocess) {
a = utils::default_process(s1);
b = utils::default_process(s2);
} else {
a = s1;
b = s2;
}
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
@ -287,24 +225,14 @@ percent fuzz::token_set_ratio(const std::wstring &s1, const std::wstring &s2, pe
}
percent fuzz::partial_token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
if (score_cutoff >= 100) {
percent fuzz::partial_token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
if (score_cutoff > 100) {
return 0;
}
std::wstring a;
std::wstring b;
if (preprocess) {
a = utils::default_process(s1);
b = utils::default_process(s2);
} else {
a = s1;
b = s2;
}
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
@ -323,43 +251,33 @@ percent fuzz::partial_token_set_ratio(const std::wstring &s1, const std::wstring
return 100;
}
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff, false);
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
}
percent fuzz::WRatio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
if (score_cutoff >= 100) {
percent fuzz::WRatio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
if (score_cutoff > 100) {
return 0;
}
std::wstring a;
std::wstring b;
if (preprocess) {
a = utils::default_process(s1);
b = utils::default_process(s2);
} else {
a = s1;
b = s2;
}
const float UNBASE_SCALE = 0.95;
std::size_t len_a = a.length();
std::size_t len_b = b.length();
std::size_t len_a = s1.length();
std::size_t len_b = s2.length();
float len_ratio = (len_a > len_b) ? (float)len_a / (float)len_b : (float)len_b / (float)len_a;
float sratio = ratio(a, b, score_cutoff, false);
float sratio = ratio(s1, s2, score_cutoff);
if (len_ratio < 1.5) {
score_cutoff = std::max(score_cutoff, sratio);
return std::max(sratio, token_ratio(a, b, score_cutoff/UNBASE_SCALE, false) * UNBASE_SCALE);
return std::max(sratio, token_ratio(s1, s2, score_cutoff/UNBASE_SCALE) * UNBASE_SCALE);
}
float partial_scale = (len_ratio < 8.0) ? 0.9 : 0.6;
score_cutoff = std::max(score_cutoff, sratio)/partial_scale;
sratio = std::max(sratio, partial_ratio(a, b, score_cutoff, false) * partial_scale);
sratio = std::max(sratio, partial_ratio(s1, s2, score_cutoff) * partial_scale);
score_cutoff = std::max(score_cutoff, sratio)/UNBASE_SCALE;
return std::max(sratio, partial_token_ratio(a, b, score_cutoff, false) * UNBASE_SCALE * partial_scale );
return std::max(sratio, partial_token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE * partial_scale );
}

View File

@ -5,17 +5,17 @@
using percent = float;
namespace fuzz {
percent ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
percent partial_ratio(std::wstring s1, std::wstring s2, percent score_cutoff=0, bool preprocess = true);
percent ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
percent partial_ratio(std::wstring_view s1, std::wstring_view s2, percent score_cutoff=0);
percent token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff=0, bool preprocess = true);
percent partial_token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff=0, bool preprocess = true);
percent token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff=0);
percent partial_token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff=0);
percent token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
percent partial_token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
percent token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
percent partial_token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
percent token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
percent partial_token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
percent token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
percent partial_token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
percent WRatio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff = 0, bool preprocess = true);
percent WRatio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff = 0);
}

View File

@ -106,7 +106,7 @@ std::vector<levenshtein::EditOp> levenshtein::editops(std::wstring_view sentence
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2) {
auto edit_ops = editops(sentence1, sentence2);
std::size_t first_start = 0;
std::size_t second_start = 0;
std::size_t second_start = 0;
std::vector<MatchingBlock> mblocks;
for (const auto &op : edit_ops) {
@ -191,13 +191,13 @@ std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view
++temp;
}
temp = std::min({
*cache_iter + 1,
*(++cache_iter) + 1,
temp
temp = std::min({
*cache_iter + 1,
*(++cache_iter) + 1,
temp
});
std::swap(*cache_iter, temp);
}
std::swap(*cache_iter, temp);
}
}
return cache.back();
}

View File

@ -35,9 +35,9 @@ namespace levenshtein {
std::vector<EditOp> editops(std::wstring_view sentence1, std::wstring_view sentence2);
struct MatchingBlock {
std::size_t first_start;
std::size_t second_start;
std::size_t len;
std::size_t first_start;
std::size_t second_start;
std::size_t len;
MatchingBlock(std::size_t first_start, std::size_t second_start, std::size_t len)
: first_start(first_start), second_start(second_start), len(len) {}
};
@ -90,7 +90,6 @@ namespace levenshtein {
}
template<typename MaxDistanceCalc>
inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector<std::wstring_view> &words,
std::vector<std::size_t> &cache, std::size_t current_cache)
@ -101,8 +100,8 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s
auto min_distance = std::numeric_limits<std::size_t>::max();
auto charCmp = [&] (const wchar_t &char2) {
if (letter_cmp == char2) { result = current_cache; }
else { ++result; }
if (letter_cmp == char2) { result = current_cache; }
else { ++result; }
current_cache = *cache_iter;
if (result > current_cache + 1) {
@ -121,7 +120,7 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s
// no whitespace should be added in front of the first word
for (const auto &letter : *word_iter) {
charCmp(letter);
charCmp(letter);
}
++word_iter;
@ -131,7 +130,7 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s
// check following word
for (const auto &letter : *word_iter) {
charCmp(letter);
charCmp(letter);
}
}
@ -260,16 +259,12 @@ inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, s
}
template<typename Sentence1, typename Sentence2>
inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio)
{
if (sentence1.empty() || sentence2.empty()) {
return sentence1.empty() && sentence2.empty();
}
return 1;
std::size_t sentence1_len = utils::joined_size(sentence1);
std::size_t sentence2_len = utils::joined_size(sentence2);

View File

@ -27,7 +27,7 @@ process::extract(const std::wstring &query, const std::vector<std::wstring> &cho
b = choice;
}
float score = fuzz::WRatio(query, choice, score_cutoff, false);
float score = fuzz::WRatio(query, choice, score_cutoff);
if (score >= score_cutoff) {
results.emplace_back(std::make_pair(choice, score));
}
@ -68,7 +68,7 @@ process::extractOne(const std::wstring &query, const std::vector<std::wstring> &
b = choice;
}
float score = fuzz::WRatio(a, b, score_cutoff, false);
float score = fuzz::WRatio(a, b, score_cutoff);
if (score >= score_cutoff) {
score_cutoff = score;
match_found = true;

View File

@ -5,7 +5,7 @@
*/
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2)
InputIterator2 first2, InputIterator2 last2)
{
return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
}
@ -15,8 +15,8 @@ inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
*/
std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) {
auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
a.remove_prefix(prefix);
b.remove_prefix(prefix);
a.remove_prefix(prefix);
b.remove_prefix(prefix);
return prefix;
}
@ -25,7 +25,7 @@ std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) {
*/
std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) {
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
a.remove_suffix(suffix);
a.remove_suffix(suffix);
b.remove_suffix(suffix);
return suffix;
}
@ -34,7 +34,7 @@ std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) {
* Removes common affix of two string views
*/
Affix utils::remove_common_affix(std::wstring_view& a, std::wstring_view& b) {
return Affix {
return Affix {
remove_common_prefix(a, b),
remove_common_suffix(a, b)
};
@ -104,7 +104,7 @@ void utils::trim(std::wstring &s) {
void utils::lower_case(std::wstring &s) {
std::for_each(s.begin(), s.end(), [](wchar_t & c){
c = ::tolower(c);
c = std::tolower(c);
});
}
@ -114,6 +114,7 @@ std::wstring utils::default_process(std::wstring s) {
return s;
}
DecomposedSet utils::set_decomposition(std::vector<std::wstring_view> a, std::vector<std::wstring_view> b) {
std::vector<std::wstring_view> intersection;
std::vector<std::wstring_view> difference_ab;
@ -134,7 +135,7 @@ DecomposedSet utils::set_decomposition(std::vector<std::wstring_view> a, std::ve
}
std::size_t utils::joined_size(const std::wstring_view &x){
return x.size();
return x.size();
}
@ -145,7 +146,7 @@ std::size_t utils::joined_size(const std::vector<std::wstring_view> &x){
// there is a whitespace between each word
std::size_t result = x.size() - 1;
for (const auto &y: x) result += y.size();
for (const auto &y: x) result += y.size();
return result;
return result;
}

View File

@ -40,6 +40,7 @@ namespace utils {
void trim(std::wstring &s);
void lower_case(std::wstring &s);
void lower_case(std::wstring &s);
std::wstring default_process(std::wstring s);

445
python/src/py_fuzz.cpp Normal file
View File

@ -0,0 +1,445 @@
#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */
#include <Python.h>
#include <string>
#include "fuzz.hpp"
#include "utils.hpp"
/**
 * Python entry point wrapping fuzz::ratio.
 *
 * Python signature: ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::partial_ratio.
 *
 * Python signature: partial_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* partial_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::partial_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::token_sort_ratio.
 *
 * Python signature: token_sort_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* token_sort_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::token_sort_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::token_sort_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::partial_token_sort_ratio.
 *
 * Python signature:
 *   partial_token_sort_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* partial_token_sort_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::partial_token_sort_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_token_sort_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::token_set_ratio.
 *
 * Python signature: token_set_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* token_set_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::token_set_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::token_set_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::partial_token_set_ratio.
 *
 * Python signature:
 *   partial_token_set_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* partial_token_set_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::partial_token_set_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_token_set_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::token_ratio.
 *
 * Python signature: token_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::token_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::token_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::partial_token_ratio.
 *
 * Python signature:
 *   partial_token_ratio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* partial_token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::partial_token_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_token_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
/**
 * Python entry point wrapping fuzz::WRatio.
 *
 * Python signature: WRatio(s1, s2, score_cutoff=0, preprocess=True)
 *
 * When `preprocess` is truthy both strings go through utils::default_process
 * before scoring; otherwise the raw input buffers are scored directly.
 *
 * Returns a new Python float, or NULL (with the exception set by the argument
 * parser) when the arguments cannot be parsed.
 */
PyObject* WRatio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*. This must be an
    // int: handing the parser the address of a bool makes it write sizeof(int)
    // bytes into a (typically smaller) object.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed std::wstring temporaries live until the end of the
        // full expression, so the views created from them stay valid for the call.
        result = fuzz::WRatio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::WRatio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }
    return PyFloat_FromDouble(result);
}
// Method table of the module: one entry per Python-visible function, each
// giving its Python name, the C entry point, the calling convention flags and
// the docstring. The table is terminated by an all-NULL sentinel entry.
static PyMethodDef methods[] = {
    /* The cast of the function is necessary since PyCFunction values
     * only take two PyObject* parameters, and these functions take
     * three.
     */
    {"ratio", (PyCFunction)(void(*)(void))ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
calculates a simple ratio between two strings
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.ratio("this is a test", "this is a test!")
96.55171966552734
)pbdoc"},
    {"partial_ratio", (PyCFunction)(void(*)(void))partial_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
calculates a partial ratio between two strings
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.partial_ratio("this is a test", "this is a test!")
100.0
)pbdoc"},
    {"token_sort_ratio", (PyCFunction)(void(*)(void))token_sort_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
sorts the words in the string and calculates the fuzz.ratio between them
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
100.0
)pbdoc"},
    {"partial_token_sort_ratio", (PyCFunction)(void(*)(void))partial_token_sort_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
sorts the words in the strings and calculates the fuzz.partial_ratio between them
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc"},
    {"token_set_ratio", (PyCFunction)(void(*)(void))token_set_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Compares the words in the strings based on unique and common words between them using fuzz.ratio
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
83.8709716796875
>>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
100.0
)pbdoc"},
    // This entry must expose the set-based partial comparison: registering it
    // under the sort-based name a second time would leave
    // partial_token_set_ratio unreachable from Python.
    {"partial_token_set_ratio", (PyCFunction)(void(*)(void))partial_token_set_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc"},
    {"token_ratio", (PyCFunction)(void(*)(void))token_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
(faster than manually executing the two functions)
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc"},
    {"partial_token_ratio", (PyCFunction)(void(*)(void))partial_token_ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
(faster than manually executing the two functions)
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc"},
    // QRatio is exposed as an alias: it is bound to the same C entry point as "ratio".
    {"QRatio", (PyCFunction)(void(*)(void))ratio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
calculates a quick ratio between two strings using fuzz.ratio
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.ratio("this is a test", "this is a test!")
96.55171966552734
)pbdoc"},
    {"WRatio", (PyCFunction)(void(*)(void))WRatio, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Calculates a weighted ratio based on the other ratio algorithms
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
using utils.default_process. Defaults to True.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc"},
    {NULL, NULL, 0, NULL} /* sentinel */
};
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"rapidfuzz.fuzz",
NULL,
-1,
methods
};
/* Import hook for the fuzz extension: builds the module object from moduledef
 * and hands back whatever PyModule_Create() produced. */
PyMODINIT_FUNC PyInit_fuzz(void) {
    PyObject *module = PyModule_Create(&moduledef);
    return module;
}

View File

@ -0,0 +1,169 @@
#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */
#include <Python.h>
#include <string>
#include "levenshtein.hpp"
/**
 * Python binding for levenshtein.distance(s1, s2).
 *
 * Parses two str arguments (positional or by keyword) and returns the value of
 * levenshtein::distance for them as a Python int.
 *
 * Returns NULL, with the exception set by the argument parser, when the
 * arguments cannot be converted.
 */
PyObject* distance(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    Py_ssize_t len1;
    const wchar_t *s2;
    Py_ssize_t len2;
    static const char *kwlist[] = {"s1", "s2", NULL};

    // "u#" returns each buffer together with its length: strings containing
    // embedded null code points are accepted (plain "u" rejects them) and no
    // additional wcslen() scan over the data is required.
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "u#u#", const_cast<char **>(kwlist),
                                     &s1, &len1, &s2, &len2))
        return NULL;

    std::size_t result = levenshtein::distance(
        std::wstring_view(s1, static_cast<std::size_t>(len1)),
        std::wstring_view(s2, static_cast<std::size_t>(len2)));

    return PyLong_FromSize_t(result);
}
/**
 * Python binding for levenshtein.normalized_distance(s1, s2, score_cutoff=0).
 *
 * The Python-facing score_cutoff and result use a 0-100 scale: the cutoff is
 * divided by 100 before it is passed to levenshtein::normalized_distance and
 * the returned value is multiplied by 100 before it is wrapped in a Python
 * float.
 *
 * Returns NULL, with the exception set by the argument parser, when the
 * arguments cannot be converted.
 */
PyObject* normalized_distance(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    Py_ssize_t len1;
    const wchar_t *s2;
    Py_ssize_t len2;
    float score_cutoff = 0;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", NULL};

    // "u#" returns each buffer together with its length: strings containing
    // embedded null code points are accepted (plain "u" rejects them) and no
    // additional wcslen() scan over the data is required.
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "u#u#|f", const_cast<char **>(kwlist),
                                     &s1, &len1, &s2, &len2, &score_cutoff))
        return NULL;

    double result = levenshtein::normalized_distance(
        std::wstring_view(s1, static_cast<std::size_t>(len1)),
        std::wstring_view(s2, static_cast<std::size_t>(len2)),
        score_cutoff/100);

    return PyFloat_FromDouble(result*100);
}
/**
 * Python binding for
 * levenshtein.weighted_distance(s1, s2, insert_cost=1, delete_cost=1, replace_cost=1).
 *
 * Dispatches on the cost configuration:
 *   - insert = delete = replace = 1  -> levenshtein::distance
 *   - insert = delete = 1, replace = 2 -> levenshtein::weighted_distance
 *   - anything else                  -> levenshtein::generic_distance
 * The result is returned as a Python int.
 *
 * Returns NULL with an exception set when the arguments cannot be converted,
 * or with ValueError when one of the costs is negative.
 */
PyObject* weighted_distance(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    Py_ssize_t len1;
    const wchar_t *s2;
    Py_ssize_t len2;
    // The "n" format unit stores a Py_ssize_t, so the costs are parsed as
    // signed values and only converted to std::size_t after validation.
    Py_ssize_t insert_cost = 1;
    Py_ssize_t delete_cost = 1;
    Py_ssize_t replace_cost = 1;
    static const char *kwlist[] = {"s1", "s2", "insert_cost", "delete_cost", "replace_cost", NULL};

    // "u#" returns each buffer together with its length: strings containing
    // embedded null code points are accepted (plain "u" rejects them) and no
    // additional wcslen() scan over the data is required.
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "u#u#|nnn", const_cast<char **>(kwlist),
                                     &s1, &len1, &s2, &len2,
                                     &insert_cost, &delete_cost, &replace_cost))
        return NULL;

    // A negative cost would otherwise wrap around to a huge unsigned value.
    if (insert_cost < 0 || delete_cost < 0 || replace_cost < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "insert_cost, delete_cost and replace_cost must not be negative");
        return NULL;
    }

    const std::wstring_view first(s1, static_cast<std::size_t>(len1));
    const std::wstring_view second(s2, static_cast<std::size_t>(len2));

    std::size_t result;
    if (insert_cost == 1 && delete_cost == 1 && replace_cost == 1) {
        result = levenshtein::distance(first, second);
    } else if (insert_cost == 1 && delete_cost == 1 && replace_cost == 2) {
        result = levenshtein::weighted_distance(first, second);
    } else {
        result = levenshtein::generic_distance(
            first, second,
            static_cast<std::size_t>(insert_cost),
            static_cast<std::size_t>(delete_cost),
            static_cast<std::size_t>(replace_cost));
    }

    return PyLong_FromSize_t(result);
}
/**
 * Python binding for levenshtein.normalized_weighted_distance(s1, s2, score_cutoff=0).
 *
 * The Python-facing score_cutoff and result use a 0-100 scale: the cutoff is
 * divided by 100 before it is passed to
 * levenshtein::normalized_weighted_distance and the returned value is
 * multiplied by 100 before it is wrapped in a Python float.
 *
 * Returns NULL, with the exception set by the argument parser, when the
 * arguments cannot be converted.
 */
PyObject* normalized_weighted_distance(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    Py_ssize_t len1;
    const wchar_t *s2;
    Py_ssize_t len2;
    float score_cutoff = 0;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", NULL};

    // "u#" returns each buffer together with its length: strings containing
    // embedded null code points are accepted (plain "u" rejects them) and no
    // additional wcslen() scan over the data is required.
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "u#u#|f", const_cast<char **>(kwlist),
                                     &s1, &len1, &s2, &len2, &score_cutoff))
        return NULL;

    double result = levenshtein::normalized_weighted_distance(
        std::wstring_view(s1, static_cast<std::size_t>(len1)),
        std::wstring_view(s2, static_cast<std::size_t>(len2)),
        score_cutoff/100);

    return PyFloat_FromDouble(result*100);
}
/* Method table of the rapidfuzz.levenshtein module.
 * Each entry is {Python name, C function, calling convention flags, docstring};
 * the table is terminated by an all-NULL sentinel entry.
 */
static PyMethodDef methods[] = {
/* The cast of the function is necessary since PyCFunction values
* only take two PyObject* parameters, and these functions take
* three (self, args, keywds), as announced by METH_VARARGS | METH_KEYWORDS.
*/
{"distance", (PyCFunction)(void(*)(void))distance, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein.
Args:
s1 (str): first string to compare
s2 (str): second string to compare
Returns:
int: levenshtein distance between s1 and s2
)pbdoc"},
{"normalized_distance", (PyCFunction)(void(*)(void))normalized_distance, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Calculates a normalized levenshtein distance based on levenshtein.distance
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
Returns:
float: normalized levenshtein distance between s1 and s2 as a float between 0 and 100
)pbdoc"},
{"weighted_distance", (PyCFunction)(void(*)(void))weighted_distance, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein with custom
costs for insertion, deletion and substitution
Args:
s1 (str): first string to compare
s2 (str): second string to compare
insert_cost (int): cost for insertions
delete_cost (int): cost for deletions
replace_cost (int): cost for substitutions
Returns:
int: weighted levenshtein distance between s1 and s2
)pbdoc"},
{"normalized_weighted_distance", (PyCFunction)(void(*)(void))normalized_weighted_distance, METH_VARARGS | METH_KEYWORDS,
R"pbdoc(
Calculates a normalized levenshtein distance based on levenshtein.weighted_distance
It uses the following costs for edit operations:
edit operation | cost
:------------- | :---
Insert | 1
Remove | 1
Replace | 2
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
Returns:
float: normalized weighted levenshtein distance between s1 and s2 as a float between 0 and 100
)pbdoc"},
{NULL, NULL, 0, NULL} /* sentinel */
};
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"rapidfuzz.levenshtein",
NULL,
-1,
methods
};
/* Import hook for the levenshtein extension: builds the module object from
 * moduledef and hands back whatever PyModule_Create() produced. */
PyMODINIT_FUNC PyInit_levenshtein(void) {
    PyObject *module = PyModule_Create(&moduledef);
    return module;
}

25
python/src/py_process.cpp Normal file
View File

@ -0,0 +1,25 @@
#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */
#include <Python.h>
#include <string>
#include "process.hpp"
/* Method table of the rapidfuzz._process module. */
static PyMethodDef methods[] = {
/* No functions are exported at the moment: the table only contains the
* terminating sentinel entry.
* NOTE(review): extract/extractOne were exposed by the former pybind11
* module; presumably they are meant to be added here later - confirm.
*/
{NULL, NULL, 0, NULL} /* sentinel */
};
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"rapidfuzz._process",
NULL,
-1,
methods
};
/* Import hook for the _process extension: builds the module object from
 * moduledef and hands back whatever PyModule_Create() produced. */
PyMODINIT_FUNC PyInit__process(void) {
    PyObject *module = PyModule_Create(&moduledef);
    return module;
}

View File

@ -1,287 +0,0 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <string>
#include "process.hpp"
#include "fuzz.hpp"
#include "utils.hpp"
#include "levenshtein.hpp"
namespace py = pybind11;
// pybind11 entry point of the `_rapidfuzz_cpp` extension module.
// Registers three submodules on `m`:
//   - process:     extract / extractOne
//   - fuzz:        the ratio family; every function is bound with the four named
//                  arguments s1, s2, score_cutoff and preprocess, none of which has a default
//   - levenshtein: distance functions, bound through lambdas taking std::wstring_view
PYBIND11_MODULE(_rapidfuzz_cpp, m) {
m.doc() = R"pbdoc(
rapid string matching library
)pbdoc";
// VERSION_INFO is not defined in this file; it is expected as a preprocessor definition.
m.attr("__version__") = VERSION_INFO;
/********************************************************/
/* process module */
/********************************************************/
auto mprocess = m.def_submodule("process");
mprocess.def("extract", &process::extract);
mprocess.def("extractOne", &process::extractOne);
/********************************************************/
/* fuzz module */
/********************************************************/
auto mfuzz = m.def_submodule("fuzz");
mfuzz.def("ratio", &fuzz::ratio,
R"pbdoc(
calculates a simple ratio between two strings
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.ratio("this is a test", "this is a test!")
96.55171966552734
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("partial_ratio", &fuzz::partial_ratio,
R"pbdoc(
calculates a partial ratio between two strings
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.partial_ratio("this is a test", "this is a test!")
100.0
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("token_sort_ratio", &fuzz::token_sort_ratio,
R"pbdoc(
sorts the words in the string and calculates the fuzz.ratio between them
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
100.0
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("partial_token_sort_ratio", &fuzz::partial_token_sort_ratio,
R"pbdoc(
sorts the words in the strings and calculates the fuzz.partial_ratio between them
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("token_set_ratio", &fuzz::token_set_ratio,
R"pbdoc(
Compares the words in the strings based on unique and common words between them using fuzz.ratio
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
Example:
>>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
83.8709716796875
>>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
100.0
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("partial_token_set_ratio", &fuzz::partial_token_set_ratio,
R"pbdoc(
Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("token_ratio", &fuzz::token_ratio,
R"pbdoc(
Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
(faster than manually executing the two functions)
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("partial_token_ratio", &fuzz::partial_token_ratio,
R"pbdoc(
Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
(faster than manually executing the two functions)
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
mfuzz.def("WRatio", &fuzz::WRatio,
R"pbdoc(
Calculates a weighted ratio based on the other ratio algorithms
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio.
preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.
Returns:
float: ratio between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));
/********************************************************/
/* levenshtein module */
/********************************************************/
auto mlevenshtein = m.def_submodule("levenshtein");
mlevenshtein.def("distance",
[](std::wstring_view s1, std::wstring_view s2){
return levenshtein::distance(s1, s2);
},
R"pbdoc(
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein.
Args:
s1 (str): first string to compare
s2 (str): second string to compare
Returns:
int: levenshtein distance between s1 and s2
)pbdoc",
py::arg("s1"), py::arg("s2"));
mlevenshtein.def("normalized_distance",
[](std::wstring_view s1, std::wstring_view s2, float score_cutoff){
// Python side works on a 0-100 scale: scale the cutoff down and the result back up.
return levenshtein::normalized_distance(s1, s2, score_cutoff/100)*100;
},
R"pbdoc(
Calculates a normalized levenshtein distance based on levenshtein.distance
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
Returns:
float: normalized levenshtein distance between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0);
mlevenshtein.def("weighted_distance",
[](std::wstring_view s1, std::wstring_view s2, size_t insert_cost, size_t delete_cost, size_t replace_cost){
// Cost sets (1, 1, 1) and (1, 1, 2) are routed to dedicated implementations;
// every other combination goes through the generic one.
if (insert_cost == 1 && delete_cost == 1) {
if (replace_cost == 1) {
return levenshtein::distance(s1, s2);
} else if (replace_cost == 2) {
return levenshtein::weighted_distance(s1, s2);
}
}
return levenshtein::generic_distance(s1, s2, insert_cost, delete_cost, replace_cost);
},
R"pbdoc(
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein with custom
costs for insertion, deletion and substitution
Args:
s1 (str): first string to compare
s2 (str): second string to compare
insert_cost (int): cost for insertions
delete_cost (int): cost for deletions
replace_cost (int): cost for substitutions
Returns:
int: weighted levenshtein distance between s1 and s2
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("insert_cost")=1, py::arg("delete_cost")=1, py::arg("replace_cost")=1);
mlevenshtein.def("normalized_weighted_distance",
[](std::wstring_view s1, std::wstring_view s2, float score_cutoff){
// Python side works on a 0-100 scale: scale the cutoff down and the result back up.
return levenshtein::normalized_weighted_distance(s1, s2, score_cutoff/100)*100;
},
R"pbdoc(
Calculates a normalized levenshtein distance based on levenshtein.weighted_distance
It uses the following costs for edit operations:
edit operation | cost
:------------- | :---
Insert | 1
Remove | 1
Replace | 2
Args:
s1 (str): first string to compare
s2 (str): second string to compare
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
Returns:
float: normalized weighted levenshtein distance between s1 and s2 as a float between 0 and 100
)pbdoc",
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0);
}

View File

@ -1,205 +0,0 @@
import _rapidfuzz_cpp.fuzz as fuzz_cpp
def ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    calculates a simple ratio between two strings

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.ratio("this is a test", "this is a test!")
        96.55171966552734
    """
    return fuzz_cpp.ratio(s1, s2, score_cutoff, preprocess)
def partial_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    calculates a partial ratio between two strings

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.partial_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.partial_ratio("this is a test", "this is a test!")
        100.0
    """
    return fuzz_cpp.partial_ratio(s1, s2, score_cutoff, preprocess)
def token_sort_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    sorts the words in the string and calculates the fuzz.ratio between them

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.token_sort_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
        100.0
    """
    return fuzz_cpp.token_sort_ratio(s1, s2, score_cutoff, preprocess)
def partial_token_sort_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    sorts the words in the strings and calculates the fuzz.partial_ratio between them

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.partial_token_sort_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    return fuzz_cpp.partial_token_sort_ratio(s1, s2, score_cutoff, preprocess)
def token_set_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    Compares the words in the strings based on unique and common words between them using fuzz.ratio

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.token_set_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
        83.8709716796875
        >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
        100.0
    """
    return fuzz_cpp.token_set_ratio(s1, s2, score_cutoff, preprocess)
def partial_token_set_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.partial_token_set_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    return fuzz_cpp.partial_token_set_ratio(s1, s2, score_cutoff, preprocess)
def token_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
    (faster than manually executing the two functions)

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.token_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    return fuzz_cpp.token_ratio(s1, s2, score_cutoff, preprocess)
def partial_token_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
    (faster than manually executing the two functions)

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.partial_token_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    return fuzz_cpp.partial_token_ratio(s1, s2, score_cutoff, preprocess)
def QRatio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    calculates a quick ratio between two strings using fuzz.ratio

    Forwards to fuzz_cpp.ratio with the same arguments, i.e. it performs the
    same call as the ratio wrapper in this module.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.QRatio("this is a test", "this is a test!")
        96.55171966552734
    """
    return fuzz_cpp.ratio(s1, s2, score_cutoff, preprocess)
def WRatio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True) -> float:
    """
    Calculates a weighted ratio based on the other ratio algorithms

    Thin wrapper that supplies the default arguments and forwards to
    fuzz_cpp.WRatio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    return fuzz_cpp.WRatio(s1, s2, score_cutoff, preprocess)

View File

@ -1 +0,0 @@
from _rapidfuzz_cpp.levenshtein import *

View File

@ -1,4 +1,4 @@
import _rapidfuzz_cpp.process
import rapidfuzz._process
from rapidfuzz import fuzz, utils
from typing import Iterable, List, Tuple, Optional, Union, Callable
import heapq
@ -22,8 +22,8 @@ def extract(query: str, choices: Iterable, scorer: Callable = fuzz.WRatio, proce
List[Tuple[str, float]]: returns a list of all matches that have a score >= score_cutoff
"""
if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
return _rapidfuzz_cpp.process.extract(query, list(choices), limit, score_cutoff, bool(processor))
#if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
# return _rapidfuzz_cpp.process.extract(query, list(choices), limit, score_cutoff, bool(processor))
# evaluate the score inside Python: the scorer is a Python function, so calling it
# from C++ would require going back through the Python layer as well
@ -63,8 +63,8 @@ def extractOne(query: str, choices: Iterable, scorer: Callable = fuzz.WRatio, pr
Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is
no match with a score >= score_cutoff
"""
if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
return _rapidfuzz_cpp.process.extractOne(query, list(choices), score_cutoff, bool(processor))
#if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
# return _rapidfuzz_cpp.process.extractOne(query, list(choices), score_cutoff, bool(processor))
# evaluate the score inside Python: the scorer is a Python function, so calling it
# from C++ would require going back through the Python layer as well

View File

@ -1,7 +1,6 @@
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext
import sys
import setuptools
from os import path
this_dir = path.abspath(path.dirname(__file__))
@ -12,64 +11,11 @@ with open(path.join(this_dir, "VERSION"), encoding='utf-8') as version_file:
with open(path.join(this_dir, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
class get_pybind_include:
    """Lazily resolved pybind11 include directory.

    pybind11 is imported only when the object is converted to a string, so
    an instance can be created before pybind11 has been installed.
    """

    def __init__(self, user=False):
        # Forwarded unchanged to pybind11.get_include() on conversion.
        self.user = user

    def __str__(self):
        from pybind11 import get_include
        return get_include(self.user)
# Extension modules to build: a single `_rapidfuzz_cpp` extension compiled from
# the binding source together with the C++ sources listed below.
ext_modules = [
    Extension(
        '_rapidfuzz_cpp',
        [
            'python/src/rapidfuzz.cpp',
            'cpp/src/fuzz.cpp',
            'cpp/src/process.cpp',
            'cpp/src/levenshtein.cpp',
            'cpp/src/utils.cpp'
        ],
        include_dirs=[
            # Path to pybind11 headers
            # (get_pybind_include objects only import pybind11 when converted to str)
            get_pybind_include(),
            get_pybind_include(user=True),
            "cpp/src"
        ],
        language='c++',
    ),
]
# As of Python 3.6, CCompiler has a `has_flag` method.
# cf http://bugs.python.org/issue26689
def has_flag(compiler, flagname):
    """Report whether `compiler` accepts the command line flag `flagname`.

    A small C++ translation unit is written to a temporary file and compiled
    with the flag passed as an extra argument; a CompileError from the
    compiler is interpreted as "flag not supported".
    """
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile('w', suffix='.cpp') as probe:
        probe.write('int main (int argc, char **argv) { return 0; }')
        try:
            compiler.compile([probe.name], extra_postargs=[flagname])
        except setuptools.distutils.errors.CompileError:
            supported = False
        else:
            supported = True
    return supported
class BuildExt(build_ext):
"""A custom build extension for adding compiler-specific options."""
c_opts = {
'msvc': ['/EHsc', '/O2', '/std:c++17'],
'unix': ['-O3', '-std=c++17'],
'unix': ['-O3', '-std=c++17', '-Werror'],
}
l_opts = {
'msvc': [],
@ -87,8 +33,6 @@ class BuildExt(build_ext):
link_opts = self.l_opts.get(ct, [])
if ct == 'unix':
opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
if has_flag(self.compiler, '-fvisibility=hidden'):
opts.append('-fvisibility=hidden')
elif ct == 'msvc':
opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
for ext in self.extensions:
@ -105,14 +49,31 @@ setup(
description='rapid fuzzy string matching',
long_description=long_description,
long_description_content_type='text/markdown',
ext_modules=ext_modules,
install_requires=['pybind11>=2.4'],
setup_requires=['pybind11>=2.4'],
cmdclass={'build_ext': BuildExt},
package_data={'': ['LICENSE', 'VERSION']},
ext_modules = [
Extension(
'rapidfuzz.levenshtein',
['python/src/py_levenshtein.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
include_dirs=["cpp/src"],
language='c++',
),
Extension(
'rapidfuzz.fuzz',
['python/src/py_fuzz.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
include_dirs=["cpp/src"],
language='c++',
),
Extension(
'rapidfuzz._process',
['python/src/py_process.cpp', 'cpp/src/process.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
include_dirs=["cpp/src"],
language='c++',
),
],
cmdclass={'build_ext': BuildExt},
package_data={'': ['LICENSE', 'VERSION']},
package_dir={'': 'python/src'},
packages=['rapidfuzz'],
include_package_data=True,
packages=['rapidfuzz'],
include_package_data=True,
zip_safe=False,
classifiers=[
"Programming Language :: Python :: 3",
@ -123,4 +84,4 @@ setup(
"License :: OSI Approved :: MIT License",
],
python_requires=">=3.5",
)
)