diff --git a/cpp/src/fuzz.cpp b/cpp/src/fuzz.cpp index cd04e81..7031eb5 100644 --- a/cpp/src/fuzz.cpp +++ b/cpp/src/fuzz.cpp @@ -8,82 +8,50 @@ #include -percent fuzz::partial_ratio(std::wstring s1, std::wstring s2, percent score_cutoff, bool preprocess) { - if (score_cutoff >= 100) { +percent fuzz::partial_ratio(std::wstring_view s1, std::wstring_view s2, percent score_cutoff) { + if (s1.empty() || s2.empty() || score_cutoff > 100) { return 0; } - if (preprocess) { - s1 = utils::default_process(std::move(s1)); - s2 = utils::default_process(std::move(s2)); - } - - if (s1.empty() || s2.empty()) { - return 0; - } - - std::wstring_view shorter; - std::wstring_view longer; - if (s1.length() > s2.length()) { - shorter = s2; - longer = s1; - } else { - shorter = s1; - longer = s2; + std::swap(s1, s2); } - auto blocks = levenshtein::matching_blocks(shorter, longer); + auto blocks = levenshtein::matching_blocks(s1, s2); float max_ratio = 0; for (const auto &block : blocks) { std::size_t long_start = (block.second_start > block.first_start) ? 
block.second_start - block.first_start : 0; - std::wstring_view long_substr = longer.substr(long_start, shorter.length()); + std::wstring_view long_substr = s2.substr(long_start, s1.length()); - float ls_ratio = levenshtein::normalized_weighted_distance(shorter, long_substr, score_cutoff / 100); + float ls_ratio = levenshtein::normalized_weighted_distance(s1, long_substr, score_cutoff / 100); if (ls_ratio > 0.995) { - return 100; - } + return 100; + } if (ls_ratio > max_ratio) { - max_ratio = ls_ratio; - } + max_ratio = ls_ratio; + } } return utils::result_cutoff(max_ratio*100, score_cutoff); } -percent fuzz::ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) { - float result; - if (preprocess) { - result = levenshtein::normalized_weighted_distance( - utils::default_process(s1), utils::default_process(s2), score_cutoff / 100); - } else { - result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100); - } - +percent fuzz::ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) { + float result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100); return utils::result_cutoff(result*100, score_cutoff); } -percent fuzz::token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) { - if (score_cutoff >= 100) { +percent fuzz::token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) { + if (score_cutoff > 100) { return 0; } - std::wstring a; - std::wstring b; - if (preprocess) { - a = utils::default_process(s1); - b = utils::default_process(s2); - } else { - a = s1; - b = s2; - } - std::vector tokens_a = utils::splitSV(a); + std::vector tokens_a = utils::splitSV(s1); std::sort(tokens_a.begin(), tokens_a.end()); - std::vector tokens_b = utils::splitSV(b); + std::vector tokens_b = utils::splitSV(s2); std::sort(tokens_b.begin(), tokens_b.end()); auto [intersection, difference_ab, difference_ba] 
= utils::set_decomposition(tokens_a, tokens_b); @@ -132,24 +100,14 @@ percent fuzz::token_ratio(const std::wstring &s1, const std::wstring &s2, percen // combines token_set and token_sort ratio from fuzzywuzzy so it is only required to // do a lot of operations once -percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) { - if (score_cutoff >= 100) { +percent fuzz::partial_token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) { + if (score_cutoff > 100) { return 0; } - std::wstring a; - std::wstring b; - if (preprocess) { - a = utils::default_process(s1); - b = utils::default_process(s2); - } else { - a = s1; - b = s2; - } - - std::vector tokens_a = utils::splitSV(a); + std::vector tokens_a = utils::splitSV(s1); std::sort(tokens_a.begin(), tokens_a.end()); - std::vector tokens_b = utils::splitSV(b); + std::vector tokens_b = utils::splitSV(s2); std::sort(tokens_b.begin(), tokens_b.end()); auto unique_a = tokens_a; @@ -170,7 +128,7 @@ percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2 return 100; } - percent result = partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff, false); + percent result = partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff); // do not calculate the same partial_ratio twice if (tokens_a.size() == unique_a.size() && tokens_b.size() == unique_b.size()) { return result; @@ -179,33 +137,23 @@ percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2 score_cutoff = std::max(score_cutoff, result); return std::max( result, - partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff, false) + partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff) ); } -percent _token_sort(const std::wstring &s1, const std::wstring &s2, bool partial, percent score_cutoff=0.0, bool preprocess = true) { - if (score_cutoff 
>= 100) { +percent _token_sort(const std::wstring_view &s1, const std::wstring_view &s2, bool partial, percent score_cutoff=0.0) { + if (score_cutoff > 100) { return 0; } - std::wstring a; - std::wstring b; - if (preprocess) { - a = utils::default_process(s1); - b = utils::default_process(s2); - } else { - a = s1; - b = s2; - } - - std::vector tokens_a = utils::splitSV(a); + std::vector tokens_a = utils::splitSV(s1); std::sort(tokens_a.begin(), tokens_a.end()); - std::vector tokens_b = utils::splitSV(b); + std::vector tokens_b = utils::splitSV(s2); std::sort(tokens_b.begin(), tokens_b.end()); if (partial) { - return fuzz::partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff, false); + return fuzz::partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff); } else { float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100); return utils::result_cutoff(result*100, score_cutoff); @@ -213,34 +161,24 @@ percent _token_sort(const std::wstring &s1, const std::wstring &s2, bool partial } -percent fuzz::token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff, bool preprocess) { +percent fuzz::token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff) { return _token_sort(a, b, false, score_cutoff); } -percent fuzz::partial_token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff, bool preprocess) { +percent fuzz::partial_token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff) { return _token_sort(a, b, true, score_cutoff); } -percent fuzz::token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) { - if (score_cutoff >= 100) { +percent fuzz::token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) { + if (score_cutoff > 100) { return 0; } - std::wstring a; - std::wstring b; - if (preprocess) 
{ - a = utils::default_process(s1); - b = utils::default_process(s2); - } else { - a = s1; - b = s2; - } - - std::vector tokens_a = utils::splitSV(a); + std::vector tokens_a = utils::splitSV(s1); std::sort(tokens_a.begin(), tokens_a.end()); - std::vector tokens_b = utils::splitSV(b); + std::vector tokens_b = utils::splitSV(s2); std::sort(tokens_b.begin(), tokens_b.end()); auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b); @@ -287,24 +225,14 @@ percent fuzz::token_set_ratio(const std::wstring &s1, const std::wstring &s2, pe } -percent fuzz::partial_token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) { - if (score_cutoff >= 100) { +percent fuzz::partial_token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) { + if (score_cutoff > 100) { return 0; } - std::wstring a; - std::wstring b; - if (preprocess) { - a = utils::default_process(s1); - b = utils::default_process(s2); - } else { - a = s1; - b = s2; - } - - std::vector tokens_a = utils::splitSV(a); + std::vector tokens_a = utils::splitSV(s1); std::sort(tokens_a.begin(), tokens_a.end()); - std::vector tokens_b = utils::splitSV(b); + std::vector tokens_b = utils::splitSV(s2); std::sort(tokens_b.begin(), tokens_b.end()); tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end()); @@ -323,43 +251,33 @@ percent fuzz::partial_token_set_ratio(const std::wstring &s1, const std::wstring return 100; } - return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff, false); + return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff); } -percent fuzz::WRatio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) { - if (score_cutoff >= 100) { +percent fuzz::WRatio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) { + if (score_cutoff > 100) { return 0; 
} - std::wstring a; - std::wstring b; - if (preprocess) { - a = utils::default_process(s1); - b = utils::default_process(s2); - } else { - a = s1; - b = s2; - } - const float UNBASE_SCALE = 0.95; - std::size_t len_a = a.length(); - std::size_t len_b = b.length(); + std::size_t len_a = s1.length(); + std::size_t len_b = s2.length(); float len_ratio = (len_a > len_b) ? (float)len_a / (float)len_b : (float)len_b / (float)len_a; - float sratio = ratio(a, b, score_cutoff, false); + float sratio = ratio(s1, s2, score_cutoff); if (len_ratio < 1.5) { score_cutoff = std::max(score_cutoff, sratio); - return std::max(sratio, token_ratio(a, b, score_cutoff/UNBASE_SCALE, false) * UNBASE_SCALE); + return std::max(sratio, token_ratio(s1, s2, score_cutoff/UNBASE_SCALE) * UNBASE_SCALE); } float partial_scale = (len_ratio < 8.0) ? 0.9 : 0.6; score_cutoff = std::max(score_cutoff, sratio)/partial_scale; - sratio = std::max(sratio, partial_ratio(a, b, score_cutoff, false) * partial_scale); + sratio = std::max(sratio, partial_ratio(s1, s2, score_cutoff) * partial_scale); score_cutoff = std::max(score_cutoff, sratio)/UNBASE_SCALE; - return std::max(sratio, partial_token_ratio(a, b, score_cutoff, false) * UNBASE_SCALE * partial_scale ); + return std::max(sratio, partial_token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE * partial_scale ); } diff --git a/cpp/src/fuzz.hpp b/cpp/src/fuzz.hpp index 41d647b..ff592f4 100644 --- a/cpp/src/fuzz.hpp +++ b/cpp/src/fuzz.hpp @@ -5,17 +5,17 @@ using percent = float; namespace fuzz { - percent ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true); - percent partial_ratio(std::wstring s1, std::wstring s2, percent score_cutoff=0, bool preprocess = true); + percent ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0); + percent partial_ratio(std::wstring_view s1, std::wstring_view s2, percent score_cutoff=0); - percent token_sort_ratio(const std::wstring &a, const 
std::wstring &b, percent score_cutoff=0, bool preprocess = true); - percent partial_token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff=0, bool preprocess = true); + percent token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff=0); + percent partial_token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff=0); - percent token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true); - percent partial_token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true); + percent token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0); + percent partial_token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0); - percent token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true); - percent partial_token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true); + percent token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0); + percent partial_token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0); - percent WRatio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff = 0, bool preprocess = true); + percent WRatio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff = 0); } diff --git a/cpp/src/levenshtein.cpp b/cpp/src/levenshtein.cpp index b6eff0f..3e24be5 100644 --- a/cpp/src/levenshtein.cpp +++ b/cpp/src/levenshtein.cpp @@ -106,7 +106,7 @@ std::vector levenshtein::editops(std::wstring_view sentence std::vector levenshtein::matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2) { auto edit_ops = editops(sentence1, sentence2); std::size_t first_start = 0; - 
std::size_t second_start = 0; + std::size_t second_start = 0; std::vector mblocks; for (const auto &op : edit_ops) { @@ -191,13 +191,13 @@ std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view ++temp; } - temp = std::min({ - *cache_iter + 1, - *(++cache_iter) + 1, - temp + temp = std::min({ + *cache_iter + 1, + *(++cache_iter) + 1, + temp }); - std::swap(*cache_iter, temp); - } + std::swap(*cache_iter, temp); + } } return cache.back(); } diff --git a/cpp/src/levenshtein.hpp b/cpp/src/levenshtein.hpp index 658172a..bf43e4d 100644 --- a/cpp/src/levenshtein.hpp +++ b/cpp/src/levenshtein.hpp @@ -35,9 +35,9 @@ namespace levenshtein { std::vector editops(std::wstring_view sentence1, std::wstring_view sentence2); struct MatchingBlock { - std::size_t first_start; - std::size_t second_start; - std::size_t len; + std::size_t first_start; + std::size_t second_start; + std::size_t len; MatchingBlock(std::size_t first_start, std::size_t second_start, std::size_t len) : first_start(first_start), second_start(second_start), len(len) {} }; @@ -90,7 +90,6 @@ namespace levenshtein { } - template inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector &words, std::vector &cache, std::size_t current_cache) @@ -101,8 +100,8 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s auto min_distance = std::numeric_limits::max(); auto charCmp = [&] (const wchar_t &char2) { - if (letter_cmp == char2) { result = current_cache; } - else { ++result; } + if (letter_cmp == char2) { result = current_cache; } + else { ++result; } current_cache = *cache_iter; if (result > current_cache + 1) { @@ -121,7 +120,7 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s // no whitespace should be added in front of the first word for (const auto &letter : *word_iter) { - charCmp(letter); + charCmp(letter); } ++word_iter; @@ -131,7 +130,7 @@ inline auto 
levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s // check following word for (const auto &letter : *word_iter) { - charCmp(letter); + charCmp(letter); } } @@ -260,16 +259,12 @@ inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, s } - - - template inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio) { if (sentence1.empty() || sentence2.empty()) { return sentence1.empty() && sentence2.empty(); } - return 1; std::size_t sentence1_len = utils::joined_size(sentence1); std::size_t sentence2_len = utils::joined_size(sentence2); diff --git a/cpp/src/process.cpp b/cpp/src/process.cpp index 74f9f5e..b3b9e9f 100644 --- a/cpp/src/process.cpp +++ b/cpp/src/process.cpp @@ -27,7 +27,7 @@ process::extract(const std::wstring &query, const std::vector &cho b = choice; } - float score = fuzz::WRatio(query, choice, score_cutoff, false); + float score = fuzz::WRatio(query, choice, score_cutoff); if (score >= score_cutoff) { results.emplace_back(std::make_pair(choice, score)); } @@ -68,7 +68,7 @@ process::extractOne(const std::wstring &query, const std::vector & b = choice; } - float score = fuzz::WRatio(a, b, score_cutoff, false); + float score = fuzz::WRatio(a, b, score_cutoff); if (score >= score_cutoff) { score_cutoff = score; match_found = true; diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index 9835206..beeddd7 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -5,7 +5,7 @@ */ template inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2) + InputIterator2 first2, InputIterator2 last2) { return std::distance(first1, std::mismatch(first1, last1, first2, last2).first); } @@ -15,8 +15,8 @@ inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1, */ std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) { auto prefix = 
common_prefix_length(a.begin(), a.end(), b.begin(), b.end()); - a.remove_prefix(prefix); - b.remove_prefix(prefix); + a.remove_prefix(prefix); + b.remove_prefix(prefix); return prefix; } @@ -25,7 +25,7 @@ std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) { */ std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) { auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend()); - a.remove_suffix(suffix); + a.remove_suffix(suffix); b.remove_suffix(suffix); return suffix; } @@ -34,7 +34,7 @@ std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) { * Removes common affix of two string views */ Affix utils::remove_common_affix(std::wstring_view& a, std::wstring_view& b) { - return Affix { + return Affix { remove_common_prefix(a, b), remove_common_suffix(a, b) }; @@ -104,7 +104,7 @@ void utils::trim(std::wstring &s) { void utils::lower_case(std::wstring &s) { std::for_each(s.begin(), s.end(), [](wchar_t & c){ - c = ::tolower(c); + c = std::tolower(c); }); } @@ -114,6 +114,7 @@ std::wstring utils::default_process(std::wstring s) { return s; } + DecomposedSet utils::set_decomposition(std::vector a, std::vector b) { std::vector intersection; std::vector difference_ab; @@ -134,7 +135,7 @@ DecomposedSet utils::set_decomposition(std::vector a, std::ve } std::size_t utils::joined_size(const std::wstring_view &x){ - return x.size(); + return x.size(); } @@ -145,7 +146,7 @@ std::size_t utils::joined_size(const std::vector &x){ // there is a whitespace between each word std::size_t result = x.size() - 1; - for (const auto &y: x) result += y.size(); + for (const auto &y: x) result += y.size(); - return result; + return result; } \ No newline at end of file diff --git a/cpp/src/utils.hpp b/cpp/src/utils.hpp index 3dd781a..4544de8 100644 --- a/cpp/src/utils.hpp +++ b/cpp/src/utils.hpp @@ -40,6 +40,7 @@ namespace utils { void trim(std::wstring &s); void lower_case(std::wstring &s); + void 
lower_case(std::wstring &s); std::wstring default_process(std::wstring s); diff --git a/python/src/py_fuzz.cpp b/python/src/py_fuzz.cpp new file mode 100644 index 0000000..0a47feb --- /dev/null +++ b/python/src/py_fuzz.cpp @@ -0,0 +1,445 @@ +#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */ +#include +#include +#include "fuzz.hpp" +#include "utils.hpp" + + +PyObject* ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* partial_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::partial_ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::partial_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* token_sort_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] 
= {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::token_sort_ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::token_sort_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* partial_token_sort_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::partial_token_sort_ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::partial_token_sort_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* token_set_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::token_set_ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::token_set_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} 
+ + +PyObject* partial_token_set_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::partial_token_set_ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::partial_token_set_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* token_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::token_ratio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::token_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* partial_token_ratio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::partial_token_ratio( + utils::default_process(s1), + utils::default_process(s2), + 
score_cutoff); + } else { + result = fuzz::partial_token_ratio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + +PyObject* WRatio(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + bool preprocess = true; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast(kwlist), + &s1, &s2, &score_cutoff, &preprocess)) + return NULL; + + double result; + if (preprocess) { + result = fuzz::WRatio( + utils::default_process(s1), + utils::default_process(s2), + score_cutoff); + } else { + result = fuzz::WRatio( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff); + } + + return PyFloat_FromDouble(result); +} + + +static PyMethodDef methods[] = { + /* The cast of the function is necessary since PyCFunction values + * only take two PyObject* parameters, and these functions take + * three. + */ + {"ratio", (PyCFunction)(void(*)(void))ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + calculates a simple ratio between two strings + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. 
+ + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + + Example: + >>> fuzz.ratio("this is a test", "this is a test!") + 96.55171966552734 + )pbdoc"}, + {"partial_ratio", (PyCFunction)(void(*)(void))partial_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + calculates a partial ratio between two strings + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. + + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + + Example: + >>> fuzz.partial_ratio("this is a test", "this is a test!") + 100.0 + )pbdoc"}, + {"token_sort_ratio", (PyCFunction)(void(*)(void))token_sort_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + sorts the words in the string and calculates the fuzz.ratio between them + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. 
+ + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + + Example: + >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") + 100.0 + )pbdoc"}, + {"partial_token_sort_ratio", (PyCFunction)(void(*)(void))partial_token_sort_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + sorts the words in the strings and calculates the fuzz.partial_ratio between them + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. + + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {"token_set_ratio", (PyCFunction)(void(*)(void))token_set_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Compares the words in the strings based on unique and common words between them using fuzz.ratio + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True.
+ + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + + Example: + >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") + 83.8709716796875 + >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") + 100.0 + )pbdoc"}, + {"partial_token_sort_ratio", (PyCFunction)(void(*)(void))partial_token_sort_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. + + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {"token_ratio", (PyCFunction)(void(*)(void))token_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio + (faster than manually executing the two functions) + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. 
+ + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {"partial_token_ratio", (PyCFunction)(void(*)(void))partial_token_ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio + (faster than manually executing the two functions) + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. + + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {"QRatio", (PyCFunction)(void(*)(void))ratio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + calculates a quick ratio between two strings using fuzz.ratio + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. + + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + + Example: + >>> fuzz.QRatio("this is a test", "this is a test!") + 96.55171966552734 + )pbdoc"}, + {"WRatio", (PyCFunction)(void(*)(void))WRatio, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Calculates a weighted ratio based on the other ratio algorithms + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
+ preprocess (bool): Optional argument to specify whether the strings should be preprocessed + using utils.default_process. Defaults to True. + + Returns: + float: ratio between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "rapidfuzz.fuzz", + NULL, + -1, + methods +}; + +PyMODINIT_FUNC PyInit_fuzz(void) { + return PyModule_Create(&moduledef); +} \ No newline at end of file diff --git a/python/src/py_levenshtein.cpp b/python/src/py_levenshtein.cpp new file mode 100644 index 0000000..d85ebef --- /dev/null +++ b/python/src/py_levenshtein.cpp @@ -0,0 +1,169 @@ +#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */ +#include +#include +#include "levenshtein.hpp" + + +PyObject* distance(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + static const char *kwlist[] = {"s1", "s2", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu", const_cast(kwlist), + &s1, &s2)) + return NULL; + + std::size_t result = levenshtein::distance( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2))); + return PyLong_FromSize_t(result); +} + +PyObject* normalized_distance(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|f", const_cast(kwlist), + &s1, &s2, &score_cutoff)) + return NULL; + + double result = levenshtein::normalized_distance( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff/100); + return PyFloat_FromDouble(result*100); +} + +PyObject* weighted_distance(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + std::size_t insert_cost = 1; + std::size_t delete_cost = 1; + std::size_t replace_cost = 1; + + 
static const char *kwlist[] = {"s1", "s2", "insert_cost", "delete_cost", "replace_cost", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|nnn", const_cast(kwlist), + &s1, &s2, &insert_cost, &delete_cost, &replace_cost)) + return NULL; + + if (insert_cost == 1 && delete_cost == 1) { + if (replace_cost == 1) { + std::size_t result = levenshtein::distance( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2))); + return PyLong_FromSize_t(result); + } else if (replace_cost == 2) { + std::size_t result = levenshtein::weighted_distance( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2))); + return PyLong_FromSize_t(result); + } + } + std::size_t result = levenshtein::generic_distance( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + insert_cost, delete_cost, replace_cost); + return PyLong_FromSize_t(result); +} + +PyObject* normalized_weighted_distance(PyObject *self, PyObject *args, PyObject *keywds) { + const wchar_t *s1; + const wchar_t *s2; + float score_cutoff = 0; + static const char *kwlist[] = {"s1", "s2", "score_cutoff", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|f", const_cast(kwlist), + &s1, &s2, &score_cutoff)) + return NULL; + + double result = levenshtein::normalized_weighted_distance( + std::wstring_view(s1, wcslen(s1)), + std::wstring_view(s2, wcslen(s2)), + score_cutoff/100); + return PyFloat_FromDouble(result*100); +} + + +static PyMethodDef methods[] = { + /* The cast of the function is necessary since PyCFunction values + * only take two PyObject* parameters, and these functions take + * three. + */ + {"distance", (PyCFunction)(void(*)(void))distance, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Calculates the minimum number of insertions, deletions, and substitutions + required to change one sequence into the other according to Levenshtein. 
+ + Args: + s1 (str): first string to compare + s2 (str): second string to compare + + Returns: + int: levenshtein distance between s1 and s2 + )pbdoc"}, + {"normalized_distance", (PyCFunction)(void(*)(void))normalized_distance, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Calculates a normalized levenshtein distance based on levenshtein.distance + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. + + Returns: + float: normalized levenshtein distance between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {"weighted_distance", (PyCFunction)(void(*)(void))weighted_distance, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Calculates the minimum number of insertions, deletions, and substitutions + required to change one sequence into the other according to Levenshtein with custom + costs for insertion, deletion and substitution + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + insert_cost (int): cost for insertions + delete_cost (int): cost for deletions + replace_cost (int): cost for substitutions + + Returns: + int: weighted levenshtein distance between s1 and s2 + )pbdoc"}, + {"normalized_weighted_distance", (PyCFunction)(void(*)(void))normalized_weighted_distance, METH_VARARGS | METH_KEYWORDS, + R"pbdoc( + Calculates a normalized levenshtein distance based on levenshtein.weighted_distance + It uses the following costs for edit operations: + + edit operation | cost + :------------- | :--- + Insert | 1 + Remove | 1 + Replace | 2 + + Args: + s1 (str): first string to compare + s2 (str): second string to compare + score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. + For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. 
+ + Returns: + float: normalized weighted levenshtein distance between s1 and s2 as a float between 0 and 100 + )pbdoc"}, + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "rapidfuzz.levenshtein", + NULL, + -1, + methods +}; + +PyMODINIT_FUNC PyInit_levenshtein(void) { + return PyModule_Create(&moduledef); +} \ No newline at end of file diff --git a/python/src/py_process.cpp b/python/src/py_process.cpp new file mode 100644 index 0000000..67f6e3b --- /dev/null +++ b/python/src/py_process.cpp @@ -0,0 +1,25 @@ +#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */ +#include +#include +#include "process.hpp" + +static PyMethodDef methods[] = { + /* The cast of the function is necessary since PyCFunction values + * only take two PyObject* parameters, and these functions take + * three. + */ + + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "rapidfuzz._process", + NULL, + -1, + methods +}; + +PyMODINIT_FUNC PyInit__process(void) { + return PyModule_Create(&moduledef); +} \ No newline at end of file diff --git a/python/src/rapidfuzz.cpp b/python/src/rapidfuzz.cpp deleted file mode 100644 index 4277ce4..0000000 --- a/python/src/rapidfuzz.cpp +++ /dev/null @@ -1,287 +0,0 @@ -#include -#include -#include -#include "process.hpp" -#include "fuzz.hpp" -#include "utils.hpp" -#include "levenshtein.hpp" - -namespace py = pybind11; - -PYBIND11_MODULE(_rapidfuzz_cpp, m) { - m.doc() = R"pbdoc( - rapid string matching library - )pbdoc"; - - m.attr("__version__") = VERSION_INFO; - -/********************************************************/ -/* process module */ -/********************************************************/ - auto mprocess = m.def_submodule("process"); - mprocess.def("extract", &process::extract); - mprocess.def("extractOne", &process::extractOne); - -/********************************************************/ -/* fuzz module */ 
-/********************************************************/ - auto mfuzz = m.def_submodule("fuzz"); - mfuzz.def("ratio", &fuzz::ratio, - R"pbdoc( - calculates a simple ratio between two strings - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.ratio("this is a test", "this is a test!") - 96.55171966552734 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("partial_ratio", &fuzz::partial_ratio, - R"pbdoc( - calculates a partial ratio between two strings - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.partial_ratio("this is a test", "this is a test!") - 100.0 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("token_sort_ratio", &fuzz::token_sort_ratio, - R"pbdoc( - sorts the words in the string and calculates the fuzz.ratio between them - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") - 100.0 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - - mfuzz.def("partial_token_sort_ratio", &fuzz::partial_token_sort_ratio, - R"pbdoc( - sorts the words in the strings and calculates the fuzz.partial_ratio between them - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("token_set_ratio", &fuzz::token_set_ratio, - R"pbdoc( - Compares the words in the strings based on unique and common words between them using fuzz.ratio - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") - 83.8709716796875 - >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") - 100.0 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("partial_token_set_ratio", &fuzz::partial_token_set_ratio, - R"pbdoc( - Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("token_ratio", &fuzz::token_ratio, - R"pbdoc( - Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio - (faster than manually executing the two functions) - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("partial_token_ratio", &fuzz::partial_token_ratio, - R"pbdoc( - Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio - (faster than manually executing the two functions) - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - mfuzz.def("WRatio", &fuzz::WRatio, - R"pbdoc( - Calculates a weighted ratio based on the other ratio algorithms - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess")); - - -/********************************************************/ -/* levenshtein module */ -/********************************************************/ - auto mlevenshtein = m.def_submodule("levenshtein"); - - mlevenshtein.def("distance", - [](std::wstring_view s1, std::wstring_view s2){ - return levenshtein::distance(s1, s2); - }, - R"pbdoc( - Calculates the minimum number of insertions, deletions, and substitutions - required to change one sequence into the other according to Levenshtein. 
- - Args: - s1 (str): first string to compare - s2 (str): second string to compare - - Returns: - int: levenshtein distance between s1 and s2 - )pbdoc", - py::arg("s1"), py::arg("s2")); - - mlevenshtein.def("normalized_distance", - [](std::wstring_view s1, std::wstring_view s2, float score_cutoff){ - return levenshtein::normalized_distance(s1, s2, score_cutoff/100)*100; - }, - R"pbdoc( - Calculates a normalized levenshtein distance based on levenshtein.distance - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - - Returns: - float: normalized levenshtein distance between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0); - - mlevenshtein.def("weighted_distance", - [](std::wstring_view s1, std::wstring_view s2, size_t insert_cost, size_t delete_cost, size_t replace_cost){ - if (insert_cost == 1 && delete_cost == 1) { - if (replace_cost == 1) { - return levenshtein::distance(s1, s2); - } else if (replace_cost == 2) { - return levenshtein::weighted_distance(s1, s2); - } - } - return levenshtein::generic_distance(s1, s2, insert_cost, delete_cost, replace_cost); - }, - R"pbdoc( - Calculates the minimum number of insertions, deletions, and substitutions - required to change one sequence into the other according to Levenshtein with custom - costs for insertion, deletion and substitution - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - insert_cost (int): cost for insertions - delete_cost (int): cost for deletions - replace_cost (int): cost for substitutions - - Returns: - int: weighted levenshtein distance between s1 and s2 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("insert_cost")=1, py::arg("delete_cost")=1, py::arg("replace_cost")=1); - - 
mlevenshtein.def("normalized_weighted_distance", - [](std::wstring_view s1, std::wstring_view s2, float score_cutoff){ - return levenshtein::normalized_weighted_distance(s1, s2, score_cutoff/100)*100; - }, - R"pbdoc( - Calculates a normalized levenshtein distance based on levenshtein.weighted_distance - It uses the following costs for edit operations: - - edit operation | cost - :------------- | :--- - Insert | 1 - Remove | 1 - Replace | 2 - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - - Returns: - float: normalized weighted levenshtein distance between s1 and s2 as a float between 0 and 100 - )pbdoc", - py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0); -} diff --git a/python/src/rapidfuzz/fuzz.py b/python/src/rapidfuzz/fuzz.py deleted file mode 100644 index bf2ecfe..0000000 --- a/python/src/rapidfuzz/fuzz.py +++ /dev/null @@ -1,205 +0,0 @@ -import _rapidfuzz_cpp.fuzz as fuzz_cpp - - -def ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - calculates a simple ratio between two strings - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.ratio("this is a test", "this is a test!") - 96.55171966552734 - """ - return fuzz_cpp.ratio(s1, s2, score_cutoff, preprocess) - - -def partial_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - calculates a partial ratio between two strings - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.partial_ratio("this is a test", "this is a test!") - 100.0 - """ - return fuzz_cpp.partial_ratio(s1, s2, score_cutoff, preprocess) - - -def token_sort_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - sorts the words in the string and calculates the fuzz.ratio between them - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") - 100.0 - """ - return fuzz_cpp.token_sort_ratio(s1, s2, score_cutoff, preprocess) - - -def partial_token_sort_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - sorts the words in the strings and calculates the fuzz.partial_ratio between them - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - """ - return fuzz_cpp.partial_token_sort_ratio(s1, s2, score_cutoff, preprocess) - - -def token_set_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - Compares the words in the strings based on unique and common words between them using fuzz.ratio - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") - 83.8709716796875 - >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") - 100.0 - """ - return fuzz_cpp.token_set_ratio(s1, s2, score_cutoff, preprocess) - - -def partial_token_set_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - """ - return fuzz_cpp.partial_token_set_ratio(s1, s2, score_cutoff, preprocess) - - -def token_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio - (faster than manually executing the two functions) - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - """ - return fuzz_cpp.token_ratio(s1, s2, score_cutoff, preprocess) - - -def partial_token_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio - (faster than manually executing the two functions) - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - """ - return fuzz_cpp.partial_token_ratio(s1, s2, score_cutoff, preprocess) - - -def QRatio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - calculates a quick ratio between two strings using fuzz.ratio - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. 
- - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - - Example: - >>> fuzz.ratio("this is a test", "this is a test!") - 96.55171966552734 - """ - return fuzz_cpp.ratio(s1, s2, score_cutoff, preprocess) - - -def WRatio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True): - """ - Calculates a weighted ratio based on the other ratio algorithms - - Args: - s1 (str): first string to compare - s2 (str): second string to compare - score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100. - For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0. - preprocess (bool): Optional argument to specify whether the strings should be preprocessed - using utils.default_process. Defaults to True. - - Returns: - float: ratio between s1 and s2 as a float between 0 and 100 - """ - return fuzz_cpp.WRatio(s1, s2, score_cutoff, preprocess) \ No newline at end of file diff --git a/python/src/rapidfuzz/levenshtein.py b/python/src/rapidfuzz/levenshtein.py deleted file mode 100644 index b01570d..0000000 --- a/python/src/rapidfuzz/levenshtein.py +++ /dev/null @@ -1 +0,0 @@ -from _rapidfuzz_cpp.levenshtein import * \ No newline at end of file diff --git a/python/src/rapidfuzz/process.py b/python/src/rapidfuzz/process.py index d49ab26..56f0265 100644 --- a/python/src/rapidfuzz/process.py +++ b/python/src/rapidfuzz/process.py @@ -1,4 +1,4 @@ -import _rapidfuzz_cpp.process +import rapidfuzz._process from rapidfuzz import fuzz, utils from typing import Iterable, List, Tuple, Optional, Union, Callable import heapq @@ -22,8 +22,8 @@ def extract(query: str, choices: Iterable, scorer: Callable = fuzz.WRatio, proce List[Tuple[str, float]]: returns a list of all matches that have a score >= score_cutoff """ - if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process): - return _rapidfuzz_cpp.process.extract(query, list(choices), limit, score_cutoff, bool(processor)) + #if 
(not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process): + # return _rapidfuzz_cpp.process.extract(query, list(choices), limit, score_cutoff, bool(processor)) # evaluate score inside python since scorer is a python function and so it would be required # to add the python layer from C++ aswell @@ -63,8 +63,8 @@ def extractOne(query: str, choices: Iterable, scorer: Callable = fuzz.WRatio, pr Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is no match with a score >= score_cutoff """ - if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process): - return _rapidfuzz_cpp.process.extractOne(query, list(choices), score_cutoff, bool(processor)) + #if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process): + # return _rapidfuzz_cpp.process.extractOne(query, list(choices), score_cutoff, bool(processor)) # evaluate score inside python since scorer is a python function and so it would be required # to add the python layer from C++ aswell diff --git a/setup.py b/setup.py index 4d29ed1..a02b807 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from setuptools import setup, Extension from setuptools.command.build_ext import build_ext import sys -import setuptools from os import path this_dir = path.abspath(path.dirname(__file__)) @@ -12,64 +11,11 @@ with open(path.join(this_dir, "VERSION"), encoding='utf-8') as version_file: with open(path.join(this_dir, 'README.md'), encoding='utf-8') as f: long_description = f.read() - -class get_pybind_include(object): - """Helper class to determine the pybind11 include path - - The purpose of this class is to postpone importing pybind11 - until it is actually installed, so that the ``get_include()`` - method can be invoked. 
""" - - def __init__(self, user=False): - self.user = user - - def __str__(self): - import pybind11 - return pybind11.get_include(self.user) - - -ext_modules = [ - Extension( - '_rapidfuzz_cpp', - [ - 'python/src/rapidfuzz.cpp', - 'cpp/src/fuzz.cpp', - 'cpp/src/process.cpp', - 'cpp/src/levenshtein.cpp', - 'cpp/src/utils.cpp' - ], - include_dirs=[ - # Path to pybind11 headers - get_pybind_include(), - get_pybind_include(user=True), - "cpp/src" - ], - language='c++', - ), -] - - -# As of Python 3.6, CCompiler has a `has_flag` method. -# cf http://bugs.python.org/issue26689 -def has_flag(compiler, flagname): - """Return a boolean indicating whether a flag name is supported on - the specified compiler. - """ - import tempfile - with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f: - f.write('int main (int argc, char **argv) { return 0; }') - try: - compiler.compile([f.name], extra_postargs=[flagname]) - except setuptools.distutils.errors.CompileError: - return False - return True - - class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" c_opts = { 'msvc': ['/EHsc', '/O2', '/std:c++17'], - 'unix': ['-O3', '-std=c++17'], + 'unix': ['-O3', '-std=c++17', '-Werror'], } l_opts = { 'msvc': [], @@ -87,8 +33,6 @@ class BuildExt(build_ext): link_opts = self.l_opts.get(ct, []) if ct == 'unix': opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - if has_flag(self.compiler, '-fvisibility=hidden'): - opts.append('-fvisibility=hidden') elif ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) for ext in self.extensions: @@ -105,14 +49,31 @@ setup( description='rapid fuzzy string matching', long_description=long_description, long_description_content_type='text/markdown', - ext_modules=ext_modules, - install_requires=['pybind11>=2.4'], - setup_requires=['pybind11>=2.4'], - cmdclass={'build_ext': BuildExt}, - package_data={'': ['LICENSE', 'VERSION']}, + ext_modules = [ + Extension( + 
'rapidfuzz.levenshtein', + ['python/src/py_levenshtein.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'], + include_dirs=["cpp/src"], + language='c++', + ), + Extension( + 'rapidfuzz.fuzz', + ['python/src/py_fuzz.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'], + include_dirs=["cpp/src"], + language='c++', + ), + Extension( + 'rapidfuzz._process', + ['python/src/py_process.cpp', 'cpp/src/process.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'], + include_dirs=["cpp/src"], + language='c++', + ), + ], + cmdclass={'build_ext': BuildExt}, + package_data={'': ['LICENSE', 'VERSION']}, package_dir={'': 'python/src'}, - packages=['rapidfuzz'], - include_package_data=True, + packages=['rapidfuzz'], + include_package_data=True, zip_safe=False, classifiers=[ "Programming Language :: Python :: 3", @@ -123,4 +84,4 @@ setup( "License :: OSI Approved :: MIT License", ], python_requires=">=3.5", -) +) \ No newline at end of file