start replacing pybind11 with the python C API
This commit is contained in:
parent
028db547d1
commit
7ee4808cf9
174
cpp/src/fuzz.cpp
174
cpp/src/fuzz.cpp
|
@ -8,82 +8,50 @@
|
|||
#include <iterator>
|
||||
|
||||
|
||||
percent fuzz::partial_ratio(std::wstring s1, std::wstring s2, percent score_cutoff, bool preprocess) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent fuzz::partial_ratio(std::wstring_view s1, std::wstring_view s2, percent score_cutoff) {
|
||||
if (s1.empty() || s2.empty() || score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (preprocess) {
|
||||
s1 = utils::default_process(std::move(s1));
|
||||
s2 = utils::default_process(std::move(s2));
|
||||
}
|
||||
|
||||
if (s1.empty() || s2.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring_view shorter;
|
||||
std::wstring_view longer;
|
||||
|
||||
if (s1.length() > s2.length()) {
|
||||
shorter = s2;
|
||||
longer = s1;
|
||||
} else {
|
||||
shorter = s1;
|
||||
longer = s2;
|
||||
std::swap(s1, s2);
|
||||
}
|
||||
|
||||
auto blocks = levenshtein::matching_blocks(shorter, longer);
|
||||
auto blocks = levenshtein::matching_blocks(s1, s2);
|
||||
float max_ratio = 0;
|
||||
for (const auto &block : blocks) {
|
||||
std::size_t long_start = (block.second_start > block.first_start) ? block.second_start - block.first_start : 0;
|
||||
std::wstring_view long_substr = longer.substr(long_start, shorter.length());
|
||||
std::wstring_view long_substr = s2.substr(long_start, s1.length());
|
||||
|
||||
float ls_ratio = levenshtein::normalized_weighted_distance(shorter, long_substr, score_cutoff / 100);
|
||||
float ls_ratio = levenshtein::normalized_weighted_distance(s1, long_substr, score_cutoff / 100);
|
||||
|
||||
if (ls_ratio > 0.995) {
|
||||
return 100;
|
||||
}
|
||||
return 100;
|
||||
}
|
||||
|
||||
if (ls_ratio > max_ratio) {
|
||||
max_ratio = ls_ratio;
|
||||
}
|
||||
max_ratio = ls_ratio;
|
||||
}
|
||||
}
|
||||
|
||||
return utils::result_cutoff(max_ratio*100, score_cutoff);
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
|
||||
float result;
|
||||
if (preprocess) {
|
||||
result = levenshtein::normalized_weighted_distance(
|
||||
utils::default_process(s1), utils::default_process(s2), score_cutoff / 100);
|
||||
} else {
|
||||
result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
|
||||
}
|
||||
|
||||
percent fuzz::ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
|
||||
float result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
|
||||
return utils::result_cutoff(result*100, score_cutoff);
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent fuzz::token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring a;
|
||||
std::wstring b;
|
||||
if (preprocess) {
|
||||
a = utils::default_process(s1);
|
||||
b = utils::default_process(s2);
|
||||
} else {
|
||||
a = s1;
|
||||
b = s2;
|
||||
}
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
|
||||
|
@ -132,24 +100,14 @@ percent fuzz::token_ratio(const std::wstring &s1, const std::wstring &s2, percen
|
|||
|
||||
// combines token_set and token_sort ratio from fuzzywuzzy so it is only required to
|
||||
// do a lot of operations once
|
||||
percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent fuzz::partial_token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring a;
|
||||
std::wstring b;
|
||||
if (preprocess) {
|
||||
a = utils::default_process(s1);
|
||||
b = utils::default_process(s2);
|
||||
} else {
|
||||
a = s1;
|
||||
b = s2;
|
||||
}
|
||||
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto unique_a = tokens_a;
|
||||
|
@ -170,7 +128,7 @@ percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2
|
|||
return 100;
|
||||
}
|
||||
|
||||
percent result = partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff, false);
|
||||
percent result = partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff);
|
||||
// do not calculate the same partial_ratio twice
|
||||
if (tokens_a.size() == unique_a.size() && tokens_b.size() == unique_b.size()) {
|
||||
return result;
|
||||
|
@ -179,33 +137,23 @@ percent fuzz::partial_token_ratio(const std::wstring &s1, const std::wstring &s2
|
|||
score_cutoff = std::max(score_cutoff, result);
|
||||
return std::max(
|
||||
result,
|
||||
partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff, false)
|
||||
partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
percent _token_sort(const std::wstring &s1, const std::wstring &s2, bool partial, percent score_cutoff=0.0, bool preprocess = true) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent _token_sort(const std::wstring_view &s1, const std::wstring_view &s2, bool partial, percent score_cutoff=0.0) {
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring a;
|
||||
std::wstring b;
|
||||
if (preprocess) {
|
||||
a = utils::default_process(s1);
|
||||
b = utils::default_process(s2);
|
||||
} else {
|
||||
a = s1;
|
||||
b = s2;
|
||||
}
|
||||
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
if (partial) {
|
||||
return fuzz::partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff, false);
|
||||
return fuzz::partial_ratio(utils::join(tokens_a), utils::join(tokens_b), score_cutoff);
|
||||
} else {
|
||||
float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100);
|
||||
return utils::result_cutoff(result*100, score_cutoff);
|
||||
|
@ -213,34 +161,24 @@ percent _token_sort(const std::wstring &s1, const std::wstring &s2, bool partial
|
|||
}
|
||||
|
||||
|
||||
percent fuzz::token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff, bool preprocess) {
|
||||
percent fuzz::token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff) {
|
||||
return _token_sort(a, b, false, score_cutoff);
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::partial_token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff, bool preprocess) {
|
||||
percent fuzz::partial_token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff) {
|
||||
return _token_sort(a, b, true, score_cutoff);
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent fuzz::token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring a;
|
||||
std::wstring b;
|
||||
if (preprocess) {
|
||||
a = utils::default_process(s1);
|
||||
b = utils::default_process(s2);
|
||||
} else {
|
||||
a = s1;
|
||||
b = s2;
|
||||
}
|
||||
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
|
||||
|
@ -287,24 +225,14 @@ percent fuzz::token_set_ratio(const std::wstring &s1, const std::wstring &s2, pe
|
|||
}
|
||||
|
||||
|
||||
percent fuzz::partial_token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent fuzz::partial_token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring a;
|
||||
std::wstring b;
|
||||
if (preprocess) {
|
||||
a = utils::default_process(s1);
|
||||
b = utils::default_process(s2);
|
||||
} else {
|
||||
a = s1;
|
||||
b = s2;
|
||||
}
|
||||
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(a);
|
||||
std::vector<std::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(b);
|
||||
std::vector<std::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
|
||||
|
@ -323,43 +251,33 @@ percent fuzz::partial_token_set_ratio(const std::wstring &s1, const std::wstring
|
|||
return 100;
|
||||
}
|
||||
|
||||
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff, false);
|
||||
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::WRatio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff, bool preprocess) {
|
||||
if (score_cutoff >= 100) {
|
||||
percent fuzz::WRatio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff) {
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::wstring a;
|
||||
std::wstring b;
|
||||
if (preprocess) {
|
||||
a = utils::default_process(s1);
|
||||
b = utils::default_process(s2);
|
||||
} else {
|
||||
a = s1;
|
||||
b = s2;
|
||||
}
|
||||
|
||||
const float UNBASE_SCALE = 0.95;
|
||||
|
||||
std::size_t len_a = a.length();
|
||||
std::size_t len_b = b.length();
|
||||
std::size_t len_a = s1.length();
|
||||
std::size_t len_b = s2.length();
|
||||
float len_ratio = (len_a > len_b) ? (float)len_a / (float)len_b : (float)len_b / (float)len_a;
|
||||
|
||||
float sratio = ratio(a, b, score_cutoff, false);
|
||||
float sratio = ratio(s1, s2, score_cutoff);
|
||||
|
||||
if (len_ratio < 1.5) {
|
||||
score_cutoff = std::max(score_cutoff, sratio);
|
||||
return std::max(sratio, token_ratio(a, b, score_cutoff/UNBASE_SCALE, false) * UNBASE_SCALE);
|
||||
return std::max(sratio, token_ratio(s1, s2, score_cutoff/UNBASE_SCALE) * UNBASE_SCALE);
|
||||
}
|
||||
|
||||
float partial_scale = (len_ratio < 8.0) ? 0.9 : 0.6;
|
||||
|
||||
score_cutoff = std::max(score_cutoff, sratio)/partial_scale;
|
||||
sratio = std::max(sratio, partial_ratio(a, b, score_cutoff, false) * partial_scale);
|
||||
sratio = std::max(sratio, partial_ratio(s1, s2, score_cutoff) * partial_scale);
|
||||
|
||||
score_cutoff = std::max(score_cutoff, sratio)/UNBASE_SCALE;
|
||||
return std::max(sratio, partial_token_ratio(a, b, score_cutoff, false) * UNBASE_SCALE * partial_scale );
|
||||
return std::max(sratio, partial_token_ratio(s1, s2, score_cutoff) * UNBASE_SCALE * partial_scale );
|
||||
}
|
||||
|
|
|
@ -5,17 +5,17 @@
|
|||
using percent = float;
|
||||
|
||||
namespace fuzz {
|
||||
percent ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
|
||||
percent partial_ratio(std::wstring s1, std::wstring s2, percent score_cutoff=0, bool preprocess = true);
|
||||
percent ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
|
||||
percent partial_ratio(std::wstring_view s1, std::wstring_view s2, percent score_cutoff=0);
|
||||
|
||||
percent token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff=0, bool preprocess = true);
|
||||
percent partial_token_sort_ratio(const std::wstring &a, const std::wstring &b, percent score_cutoff=0, bool preprocess = true);
|
||||
percent token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff=0);
|
||||
percent partial_token_sort_ratio(const std::wstring_view &a, const std::wstring_view &b, percent score_cutoff=0);
|
||||
|
||||
percent token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
|
||||
percent partial_token_set_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
|
||||
percent token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
|
||||
percent partial_token_set_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
|
||||
|
||||
percent token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
|
||||
percent partial_token_ratio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff=0, bool preprocess = true);
|
||||
percent token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
|
||||
percent partial_token_ratio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff=0);
|
||||
|
||||
percent WRatio(const std::wstring &s1, const std::wstring &s2, percent score_cutoff = 0, bool preprocess = true);
|
||||
percent WRatio(const std::wstring_view &s1, const std::wstring_view &s2, percent score_cutoff = 0);
|
||||
}
|
||||
|
|
|
@ -106,7 +106,7 @@ std::vector<levenshtein::EditOp> levenshtein::editops(std::wstring_view sentence
|
|||
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2) {
|
||||
auto edit_ops = editops(sentence1, sentence2);
|
||||
std::size_t first_start = 0;
|
||||
std::size_t second_start = 0;
|
||||
std::size_t second_start = 0;
|
||||
std::vector<MatchingBlock> mblocks;
|
||||
|
||||
for (const auto &op : edit_ops) {
|
||||
|
@ -191,13 +191,13 @@ std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view
|
|||
++temp;
|
||||
}
|
||||
|
||||
temp = std::min({
|
||||
*cache_iter + 1,
|
||||
*(++cache_iter) + 1,
|
||||
temp
|
||||
temp = std::min({
|
||||
*cache_iter + 1,
|
||||
*(++cache_iter) + 1,
|
||||
temp
|
||||
});
|
||||
std::swap(*cache_iter, temp);
|
||||
}
|
||||
std::swap(*cache_iter, temp);
|
||||
}
|
||||
}
|
||||
return cache.back();
|
||||
}
|
||||
|
|
|
@ -35,9 +35,9 @@ namespace levenshtein {
|
|||
std::vector<EditOp> editops(std::wstring_view sentence1, std::wstring_view sentence2);
|
||||
|
||||
struct MatchingBlock {
|
||||
std::size_t first_start;
|
||||
std::size_t second_start;
|
||||
std::size_t len;
|
||||
std::size_t first_start;
|
||||
std::size_t second_start;
|
||||
std::size_t len;
|
||||
MatchingBlock(std::size_t first_start, std::size_t second_start, std::size_t len)
|
||||
: first_start(first_start), second_start(second_start), len(len) {}
|
||||
};
|
||||
|
@ -90,7 +90,6 @@ namespace levenshtein {
|
|||
}
|
||||
|
||||
|
||||
|
||||
template<typename MaxDistanceCalc>
|
||||
inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector<std::wstring_view> &words,
|
||||
std::vector<std::size_t> &cache, std::size_t current_cache)
|
||||
|
@ -101,8 +100,8 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s
|
|||
auto min_distance = std::numeric_limits<std::size_t>::max();
|
||||
|
||||
auto charCmp = [&] (const wchar_t &char2) {
|
||||
if (letter_cmp == char2) { result = current_cache; }
|
||||
else { ++result; }
|
||||
if (letter_cmp == char2) { result = current_cache; }
|
||||
else { ++result; }
|
||||
|
||||
current_cache = *cache_iter;
|
||||
if (result > current_cache + 1) {
|
||||
|
@ -121,7 +120,7 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s
|
|||
|
||||
// no whitespace should be added in front of the first word
|
||||
for (const auto &letter : *word_iter) {
|
||||
charCmp(letter);
|
||||
charCmp(letter);
|
||||
}
|
||||
++word_iter;
|
||||
|
||||
|
@ -131,7 +130,7 @@ inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const s
|
|||
|
||||
// check following word
|
||||
for (const auto &letter : *word_iter) {
|
||||
charCmp(letter);
|
||||
charCmp(letter);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -260,16 +259,12 @@ inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, s
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename Sentence1, typename Sentence2>
|
||||
inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio)
|
||||
{
|
||||
if (sentence1.empty() || sentence2.empty()) {
|
||||
return sentence1.empty() && sentence2.empty();
|
||||
}
|
||||
return 1;
|
||||
|
||||
std::size_t sentence1_len = utils::joined_size(sentence1);
|
||||
std::size_t sentence2_len = utils::joined_size(sentence2);
|
||||
|
|
|
@ -27,7 +27,7 @@ process::extract(const std::wstring &query, const std::vector<std::wstring> &cho
|
|||
b = choice;
|
||||
}
|
||||
|
||||
float score = fuzz::WRatio(query, choice, score_cutoff, false);
|
||||
float score = fuzz::WRatio(query, choice, score_cutoff);
|
||||
if (score >= score_cutoff) {
|
||||
results.emplace_back(std::make_pair(choice, score));
|
||||
}
|
||||
|
@ -68,7 +68,7 @@ process::extractOne(const std::wstring &query, const std::vector<std::wstring> &
|
|||
b = choice;
|
||||
}
|
||||
|
||||
float score = fuzz::WRatio(a, b, score_cutoff, false);
|
||||
float score = fuzz::WRatio(a, b, score_cutoff);
|
||||
if (score >= score_cutoff) {
|
||||
score_cutoff = score;
|
||||
match_found = true;
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
*/
|
||||
template <typename InputIterator1, typename InputIterator2>
|
||||
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
|
||||
InputIterator2 first2, InputIterator2 last2)
|
||||
InputIterator2 first2, InputIterator2 last2)
|
||||
{
|
||||
return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
|
||||
}
|
||||
|
@ -15,8 +15,8 @@ inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
|
|||
*/
|
||||
std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) {
|
||||
auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
|
||||
a.remove_prefix(prefix);
|
||||
b.remove_prefix(prefix);
|
||||
a.remove_prefix(prefix);
|
||||
b.remove_prefix(prefix);
|
||||
return prefix;
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@ std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) {
|
|||
*/
|
||||
std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) {
|
||||
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
|
||||
a.remove_suffix(suffix);
|
||||
a.remove_suffix(suffix);
|
||||
b.remove_suffix(suffix);
|
||||
return suffix;
|
||||
}
|
||||
|
@ -34,7 +34,7 @@ std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) {
|
|||
* Removes common affix of two string views
|
||||
*/
|
||||
Affix utils::remove_common_affix(std::wstring_view& a, std::wstring_view& b) {
|
||||
return Affix {
|
||||
return Affix {
|
||||
remove_common_prefix(a, b),
|
||||
remove_common_suffix(a, b)
|
||||
};
|
||||
|
@ -104,7 +104,7 @@ void utils::trim(std::wstring &s) {
|
|||
|
||||
void utils::lower_case(std::wstring &s) {
|
||||
std::for_each(s.begin(), s.end(), [](wchar_t & c){
|
||||
c = ::tolower(c);
|
||||
c = std::tolower(c);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -114,6 +114,7 @@ std::wstring utils::default_process(std::wstring s) {
|
|||
return s;
|
||||
}
|
||||
|
||||
|
||||
DecomposedSet utils::set_decomposition(std::vector<std::wstring_view> a, std::vector<std::wstring_view> b) {
|
||||
std::vector<std::wstring_view> intersection;
|
||||
std::vector<std::wstring_view> difference_ab;
|
||||
|
@ -134,7 +135,7 @@ DecomposedSet utils::set_decomposition(std::vector<std::wstring_view> a, std::ve
|
|||
}
|
||||
|
||||
std::size_t utils::joined_size(const std::wstring_view &x){
|
||||
return x.size();
|
||||
return x.size();
|
||||
}
|
||||
|
||||
|
||||
|
@ -145,7 +146,7 @@ std::size_t utils::joined_size(const std::vector<std::wstring_view> &x){
|
|||
|
||||
// there is a whitespace between each word
|
||||
std::size_t result = x.size() - 1;
|
||||
for (const auto &y: x) result += y.size();
|
||||
for (const auto &y: x) result += y.size();
|
||||
|
||||
return result;
|
||||
return result;
|
||||
}
|
|
@ -40,6 +40,7 @@ namespace utils {
|
|||
|
||||
void trim(std::wstring &s);
|
||||
void lower_case(std::wstring &s);
|
||||
void lower_case(std::wstring &s);
|
||||
|
||||
std::wstring default_process(std::wstring s);
|
||||
|
||||
|
|
|
@ -0,0 +1,445 @@
|
|||
#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */
|
||||
#include <Python.h>
|
||||
#include <string>
|
||||
#include "fuzz.hpp"
|
||||
#include "utils.hpp"
|
||||
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as fuzz::ratio needs them.
        result = fuzz::ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::partial_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* partial_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::partial_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::token_sort_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* token_sort_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::token_sort_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::token_sort_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::partial_token_sort_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* partial_token_sort_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::partial_token_sort_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_token_sort_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::token_set_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* token_set_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::token_set_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::token_set_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::partial_token_set_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* partial_token_set_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::partial_token_set_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_token_set_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::token_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::token_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::token_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::partial_token_ratio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* partial_token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::partial_token_ratio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::partial_token_ratio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
/**
 * Python C API wrapper around fuzz::WRatio.
 *
 * Accepts (s1, s2, score_cutoff=0, preprocess=True). When preprocess is
 * truthy both strings are passed through utils::default_process before the
 * comparison; otherwise they are compared as given.
 *
 * @param self   unused
 * @param args   positional arguments
 * @param keywds keyword arguments
 * @return a Python float holding the score, or NULL when argument parsing
 *         fails (PyArg_ParseTupleAndKeywords has then set the exception)
 */
PyObject* WRatio(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    float score_cutoff = 0;
    // The "p" format unit stores its result through an int*, so the target
    // must be an int. Passing the address of a bool makes the parser write
    // past the end of the variable.
    int preprocess = 1;
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", "preprocess", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|fp", const_cast<char **>(kwlist),
                                     &s1, &s2, &score_cutoff, &preprocess)) {
        return NULL;
    }

    double result;
    if (preprocess) {
        // The processed strings are temporaries that live until the end of
        // the call expression, which is as long as the callee needs them.
        result = fuzz::WRatio(
            utils::default_process(s1),
            utils::default_process(s2),
            score_cutoff);
    } else {
        result = fuzz::WRatio(
            std::wstring_view(s1, wcslen(s1)),
            std::wstring_view(s2, wcslen(s2)),
            score_cutoff);
    }

    return PyFloat_FromDouble(result);
}
|
||||
|
||||
|
||||
/*
 * Method table of the rapidfuzz.fuzz extension module.
 *
 * Every entry is registered with METH_VARARGS | METH_KEYWORDS, i.e. the
 * wrappers take three PyObject* parameters. PyCFunction only describes the
 * two parameter form, so each function pointer has to be cast (through
 * void(*)(void)) before it can be stored in the table.
 */
static PyMethodDef methods[] = {
    {"ratio", (PyCFunction)(void(*)(void))ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    calculates a simple ratio between two strings

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.ratio("this is a test", "this is a test!")
        96.55171966552734
    )pbdoc"},
    {"partial_ratio", (PyCFunction)(void(*)(void))partial_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    calculates a partial ratio between two strings

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.partial_ratio("this is a test", "this is a test!")
        100.0
    )pbdoc"},
    {"token_sort_ratio", (PyCFunction)(void(*)(void))token_sort_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    sorts the words in the string and calculates the fuzz.ratio between them

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
        100.0
    )pbdoc"},
    {"partial_token_sort_ratio", (PyCFunction)(void(*)(void))partial_token_sort_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    sorts the words in the strings and calculates the fuzz.partial_ratio between them

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    {"token_set_ratio", (PyCFunction)(void(*)(void))token_set_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Compares the words in the strings based on unique and common words between them using fuzz.ratio

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
        83.8709716796875
        >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
        100.0
    )pbdoc"},
    /* This entry used to register "partial_token_sort_ratio" a second time,
     * although its documentation describes the token *set* variant, so
     * partial_token_set_ratio was never exported by the module.
     * NOTE(review): assumes the partial_token_set_ratio wrapper is defined
     * earlier in this file next to the other wrappers - confirm. */
    {"partial_token_set_ratio", (PyCFunction)(void(*)(void))partial_token_set_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    {"token_ratio", (PyCFunction)(void(*)(void))token_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
    (faster than manually executing the two functions)

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    {"partial_token_ratio", (PyCFunction)(void(*)(void))partial_token_ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
    (faster than manually executing the two functions)

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    /* QRatio is an alias: it is served by the same wrapper as "ratio". */
    {"QRatio", (PyCFunction)(void(*)(void))ratio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    calculates a quick ratio between two strings using fuzz.ratio

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.ratio("this is a test", "this is a test!")
        96.55171966552734
    )pbdoc"},
    {"WRatio", (PyCFunction)(void(*)(void))WRatio, METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Calculates a weighted ratio based on the other ratio algorithms

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
        preprocess (bool): Optional argument to specify whether the strings should be preprocessed
            using utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    {NULL, NULL, 0, NULL} /* sentinel */
};
|
||||
|
||||
/* Definition of the rapidfuzz.fuzz extension module handed to PyModule_Create. */
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "rapidfuzz.fuzz", /* module name */
    nullptr,          /* no module level docstring */
    -1,               /* size of the per-module state */
    methods           /* functions exported by the module */
};
|
||||
|
||||
/* Entry point of the fuzz extension: builds the module object described by moduledef. */
PyMODINIT_FUNC PyInit_fuzz(void) {
    PyObject *module = PyModule_Create(&moduledef);
    return module;
}
|
|
@ -0,0 +1,169 @@
|
|||
#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */
|
||||
#include <Python.h>
|
||||
#include <string>
|
||||
#include "levenshtein.hpp"
|
||||
|
||||
|
||||
/*
 * Python wrapper around levenshtein::distance.
 *
 * Accepts the two strings positionally or by the keywords "s1" and "s2"
 * and returns the computed distance as a Python int.
 * Returns NULL when the arguments could not be parsed.
 */
PyObject* distance(PyObject *self, PyObject *args, PyObject *keywds) {
    static const char *kwlist[] = {"s1", "s2", NULL};
    const wchar_t *first;
    const wchar_t *second;

    const int parsed = PyArg_ParseTupleAndKeywords(
        args, keywds, "uu", const_cast<char **>(kwlist), &first, &second);
    if (!parsed) {
        return NULL;
    }

    /* wrap the NUL terminated buffers in views without copying them */
    const std::wstring_view lhs(first, wcslen(first));
    const std::wstring_view rhs(second, wcslen(second));

    const std::size_t edits = levenshtein::distance(lhs, rhs);
    return PyLong_FromSize_t(edits);
}
|
||||
|
||||
/*
 * Python wrapper around levenshtein::normalized_distance.
 *
 * Arguments: "s1", "s2" and the optional "score_cutoff" (defaults to 0).
 * The cutoff is given on a 0-100 scale and divided by 100 before it is
 * passed on; the result is multiplied by 100 again before it is returned
 * as a Python float. Returns NULL when the arguments could not be parsed.
 */
PyObject* normalized_distance(PyObject *self, PyObject *args, PyObject *keywds) {
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", NULL};
    const wchar_t *first;
    const wchar_t *second;
    float score_cutoff = 0;

    const int parsed = PyArg_ParseTupleAndKeywords(
        args, keywds, "uu|f", const_cast<char **>(kwlist),
        &first, &second, &score_cutoff);
    if (!parsed) {
        return NULL;
    }

    const std::wstring_view lhs(first, wcslen(first));
    const std::wstring_view rhs(second, wcslen(second));

    /* widen to double first so the scaling below happens in double precision */
    const double ratio = levenshtein::normalized_distance(lhs, rhs, score_cutoff / 100);
    return PyFloat_FromDouble(ratio * 100);
}
|
||||
|
||||
/*
 * Python wrapper for the Levenshtein distance with custom edit costs.
 *
 * Arguments: "s1", "s2" and the optional "insert_cost", "delete_cost" and
 * "replace_cost", which all default to 1.
 *
 * Dispatches to a specialised implementation where one exists:
 *   - insert = delete = replace = 1      -> levenshtein::distance
 *   - insert = delete = 1, replace = 2   -> levenshtein::weighted_distance
 *   - anything else                      -> levenshtein::generic_distance
 *
 * Returns the distance as a Python int. Returns NULL with an exception set
 * when the arguments cannot be parsed or when a cost is negative (ValueError).
 */
PyObject* weighted_distance(PyObject *self, PyObject *args, PyObject *keywds) {
    const wchar_t *s1;
    const wchar_t *s2;
    /* The "n" format unit stores a signed Py_ssize_t, so the costs are parsed
     * into that type. Parsing straight into std::size_t made negative input
     * silently wrap around to a huge unsigned cost. */
    Py_ssize_t insert_cost = 1;
    Py_ssize_t delete_cost = 1;
    Py_ssize_t replace_cost = 1;

    static const char *kwlist[] = {"s1", "s2", "insert_cost", "delete_cost", "replace_cost", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "uu|nnn", const_cast<char **>(kwlist),
                                     &s1, &s2, &insert_cost, &delete_cost, &replace_cost)) {
        return NULL;
    }

    if (insert_cost < 0 || delete_cost < 0 || replace_cost < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "insert_cost, delete_cost and replace_cost must not be negative");
        return NULL;
    }

    const std::wstring_view first(s1, wcslen(s1));
    const std::wstring_view second(s2, wcslen(s2));

    if (insert_cost == 1 && delete_cost == 1) {
        if (replace_cost == 1) {
            std::size_t result = levenshtein::distance(first, second);
            return PyLong_FromSize_t(result);
        } else if (replace_cost == 2) {
            std::size_t result = levenshtein::weighted_distance(first, second);
            return PyLong_FromSize_t(result);
        }
    }

    /* the costs are known to be non-negative here, so the casts are lossless */
    std::size_t result = levenshtein::generic_distance(
        first, second,
        static_cast<std::size_t>(insert_cost),
        static_cast<std::size_t>(delete_cost),
        static_cast<std::size_t>(replace_cost));
    return PyLong_FromSize_t(result);
}
|
||||
|
||||
/*
 * Python wrapper around levenshtein::normalized_weighted_distance.
 *
 * Arguments: "s1", "s2" and the optional "score_cutoff" (defaults to 0).
 * The cutoff is given on a 0-100 scale and divided by 100 before it is
 * passed on; the result is multiplied by 100 again before it is returned
 * as a Python float. Returns NULL when the arguments could not be parsed.
 */
PyObject* normalized_weighted_distance(PyObject *self, PyObject *args, PyObject *keywds) {
    static const char *kwlist[] = {"s1", "s2", "score_cutoff", NULL};
    const wchar_t *first;
    const wchar_t *second;
    float score_cutoff = 0;

    const int parsed = PyArg_ParseTupleAndKeywords(
        args, keywds, "uu|f", const_cast<char **>(kwlist),
        &first, &second, &score_cutoff);
    if (!parsed) {
        return NULL;
    }

    const std::wstring_view lhs(first, wcslen(first));
    const std::wstring_view rhs(second, wcslen(second));

    /* widen to double first so the scaling below happens in double precision */
    const double ratio = levenshtein::normalized_weighted_distance(lhs, rhs, score_cutoff / 100);
    return PyFloat_FromDouble(ratio * 100);
}
|
||||
|
||||
|
||||
/*
 * Method table of the rapidfuzz.levenshtein extension module.
 *
 * The wrappers are keyword-aware and therefore take three PyObject*
 * parameters, while PyCFunction only describes two. Each pointer is
 * converted through void(*)(void) before it is stored in the table.
 */
static PyMethodDef methods[] = {
    {"distance",
     reinterpret_cast<PyCFunction>(reinterpret_cast<void (*)(void)>(distance)),
     METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Calculates the minimum number of insertions, deletions, and substitutions
    required to change one sequence into the other according to Levenshtein.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare

    Returns:
        int: levenshtein distance between s1 and s2
    )pbdoc"},
    {"normalized_distance",
     reinterpret_cast<PyCFunction>(reinterpret_cast<void (*)(void)>(normalized_distance)),
     METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Calculates a normalized levenshtein distance based on levenshtein.distance

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.

    Returns:
        float: normalized levenshtein distance between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    {"weighted_distance",
     reinterpret_cast<PyCFunction>(reinterpret_cast<void (*)(void)>(weighted_distance)),
     METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Calculates the minimum number of insertions, deletions, and substitutions
    required to change one sequence into the other according to Levenshtein with custom
    costs for insertion, deletion and substitution

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        insert_cost (int): cost for insertions
        delete_cost (int): cost for deletions
        replace_cost (int): cost for substitutions

    Returns:
        int: weighted levenshtein distance between s1 and s2
    )pbdoc"},
    {"normalized_weighted_distance",
     reinterpret_cast<PyCFunction>(reinterpret_cast<void (*)(void)>(normalized_weighted_distance)),
     METH_VARARGS | METH_KEYWORDS,
     R"pbdoc(
    Calculates a normalized levenshtein distance based on levenshtein.weighted_distance
    It uses the following costs for edit operations:

    edit operation | cost
    :------------- | :---
    Insert         | 1
    Remove         | 1
    Replace        | 2

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.

    Returns:
        float: normalized weighted levenshtein distance between s1 and s2 as a float between 0 and 100
    )pbdoc"},
    {nullptr, nullptr, 0, nullptr} /* sentinel */
};
|
||||
|
||||
/* Definition of the rapidfuzz.levenshtein extension module handed to PyModule_Create. */
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "rapidfuzz.levenshtein", /* module name */
    nullptr,                 /* no module level docstring */
    -1,                      /* size of the per-module state */
    methods                  /* functions exported by the module */
};
|
||||
|
||||
/* Entry point of the levenshtein extension: builds the module object described by moduledef. */
PyMODINIT_FUNC PyInit_levenshtein(void) {
    PyObject *module = PyModule_Create(&moduledef);
    return module;
}
|
|
@ -0,0 +1,25 @@
|
|||
#define PY_SSIZE_T_CLEAN /* Make "s#" use Py_ssize_t rather than int. */
|
||||
#include <Python.h>
|
||||
#include <string>
|
||||
#include "process.hpp"
|
||||
|
||||
/*
 * Method table of the rapidfuzz._process extension module.
 *
 * It currently holds nothing but the terminating sentinel. Keyword-aware
 * functions added here take three PyObject* parameters and have to be
 * cast, since PyCFunction only describes two.
 */
static PyMethodDef methods[] = {
    {nullptr, nullptr, 0, nullptr} /* sentinel */
};
|
||||
|
||||
/* Definition of the rapidfuzz._process extension module handed to PyModule_Create. */
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "rapidfuzz._process", /* module name */
    nullptr,              /* no module level docstring */
    -1,                   /* size of the per-module state */
    methods               /* functions exported by the module */
};
|
||||
|
||||
/* Entry point of the _process extension: builds the module object described by moduledef. */
PyMODINIT_FUNC PyInit__process(void) {
    PyObject *module = PyModule_Create(&moduledef);
    return module;
}
|
|
@ -1,287 +0,0 @@
|
|||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
#include <string>
|
||||
#include "process.hpp"
|
||||
#include "fuzz.hpp"
|
||||
#include "utils.hpp"
|
||||
#include "levenshtein.hpp"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
/*
 * pybind11 definition of the _rapidfuzz_cpp extension module.
 *
 * Registers three submodules:
 *   - process:     extract / extractOne
 *   - fuzz:        the ratio family, bound directly to the fuzz:: functions
 *   - levenshtein: distance functions, bound through small adapter lambdas
 */
PYBIND11_MODULE(_rapidfuzz_cpp, m) {
    m.doc() = R"pbdoc(
        rapid string matching library
    )pbdoc";

    m.attr("__version__") = VERSION_INFO;

    // ---- adapters used by the levenshtein submodule ----------------------

    const auto lev_distance = [](std::wstring_view s1, std::wstring_view s2) {
        return levenshtein::distance(s1, s2);
    };

    // score_cutoff arrives on a 0-100 scale, the result is scaled back to 0-100
    const auto lev_normalized_distance =
        [](std::wstring_view s1, std::wstring_view s2, float score_cutoff) {
            return levenshtein::normalized_distance(s1, s2, score_cutoff/100)*100;
        };

    // picks the specialised implementation for the two common cost sets
    const auto lev_weighted_distance =
        [](std::wstring_view s1, std::wstring_view s2,
           size_t insert_cost, size_t delete_cost, size_t replace_cost) {
            const bool unit_indel = (insert_cost == 1 && delete_cost == 1);
            if (unit_indel && replace_cost == 1) {
                return levenshtein::distance(s1, s2);
            }
            if (unit_indel && replace_cost == 2) {
                return levenshtein::weighted_distance(s1, s2);
            }
            return levenshtein::generic_distance(s1, s2, insert_cost, delete_cost, replace_cost);
        };

    const auto lev_normalized_weighted_distance =
        [](std::wstring_view s1, std::wstring_view s2, float score_cutoff) {
            return levenshtein::normalized_weighted_distance(s1, s2, score_cutoff/100)*100;
        };

    // ---- process submodule -----------------------------------------------

    auto process_module = m.def_submodule("process");
    process_module.def("extract", &process::extract);
    process_module.def("extractOne", &process::extractOne);

    // ---- fuzz submodule ----------------------------------------------------

    auto fuzz_module = m.def_submodule("fuzz");

    fuzz_module.def("ratio", &fuzz::ratio,
        R"pbdoc(
        calculates a simple ratio between two strings

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100

        Example:
            >>> fuzz.ratio("this is a test", "this is a test!")
            96.55171966552734
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("partial_ratio", &fuzz::partial_ratio,
        R"pbdoc(
        calculates a partial ratio between two strings

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100

        Example:
            >>> fuzz.partial_ratio("this is a test", "this is a test!")
            100.0
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("token_sort_ratio", &fuzz::token_sort_ratio,
        R"pbdoc(
        sorts the words in the string and calculates the fuzz.ratio between them

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100

        Example:
            >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
            100.0
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("partial_token_sort_ratio", &fuzz::partial_token_sort_ratio,
        R"pbdoc(
        sorts the words in the strings and calculates the fuzz.partial_ratio between them

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("token_set_ratio", &fuzz::token_set_ratio,
        R"pbdoc(
        Compares the words in the strings based on unique and common words between them using fuzz.ratio

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100

        Example:
            >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
            83.8709716796875
            >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
            100.0
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("partial_token_set_ratio", &fuzz::partial_token_set_ratio,
        R"pbdoc(
        Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("token_ratio", &fuzz::token_ratio,
        R"pbdoc(
        Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
        (faster than manually executing the two functions)

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("partial_token_ratio", &fuzz::partial_token_ratio,
        R"pbdoc(
        Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
        (faster than manually executing the two functions)

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    fuzz_module.def("WRatio", &fuzz::WRatio,
        R"pbdoc(
        Calculates a weighted ratio based on the other ratio algorithms

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio.
            preprocess (bool): Specify whether the strings should be preprocessed using utils.default_process.

        Returns:
            float: ratio between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff"), py::arg("preprocess"));

    // ---- levenshtein submodule ---------------------------------------------

    auto levenshtein_module = m.def_submodule("levenshtein");

    levenshtein_module.def("distance", lev_distance,
        R"pbdoc(
        Calculates the minimum number of insertions, deletions, and substitutions
        required to change one sequence into the other according to Levenshtein.

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare

        Returns:
            int: levenshtein distance between s1 and s2
        )pbdoc",
        py::arg("s1"), py::arg("s2"));

    levenshtein_module.def("normalized_distance", lev_normalized_distance,
        R"pbdoc(
        Calculates a normalized levenshtein distance based on levenshtein.distance

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.

        Returns:
            float: normalized levenshtein distance between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0);

    levenshtein_module.def("weighted_distance", lev_weighted_distance,
        R"pbdoc(
        Calculates the minimum number of insertions, deletions, and substitutions
        required to change one sequence into the other according to Levenshtein with custom
        costs for insertion, deletion and substitution

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            insert_cost (int): cost for insertions
            delete_cost (int): cost for deletions
            replace_cost (int): cost for substitutions

        Returns:
            int: weighted levenshtein distance between s1 and s2
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("insert_cost")=1, py::arg("delete_cost")=1, py::arg("replace_cost")=1);

    levenshtein_module.def("normalized_weighted_distance", lev_normalized_weighted_distance,
        R"pbdoc(
        Calculates a normalized levenshtein distance based on levenshtein.weighted_distance
        It uses the following costs for edit operations:

        edit operation | cost
        :------------- | :---
        Insert         | 1
        Remove         | 1
        Replace        | 2

        Args:
            s1 (str): first string to compare
            s2 (str): second string to compare
            score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
                For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.

        Returns:
            float: normalized weighted levenshtein distance between s1 and s2 as a float between 0 and 100
        )pbdoc",
        py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0);
}
|
|
@ -1,205 +0,0 @@
|
|||
import _rapidfuzz_cpp.fuzz as fuzz_cpp
|
||||
|
||||
|
||||
def ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Compute a simple similarity ratio for two strings.

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.ratio("this is a test", "this is a test!")
        96.55171966552734
    """
    score = fuzz_cpp.ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def partial_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Compute a partial similarity ratio for two strings.

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.partial_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.partial_ratio("this is a test", "this is a test!")
        100.0
    """
    score = fuzz_cpp.partial_ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def token_sort_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Sort the words of both strings and compute the fuzz.ratio between them.

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.token_sort_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
        100.0
    """
    score = fuzz_cpp.token_sort_ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def partial_token_sort_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Sort the words of both strings and compute the fuzz.partial_ratio between them.

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.partial_token_sort_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    score = fuzz_cpp.partial_token_sort_ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def token_set_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Compare the strings by their unique and common words using fuzz.ratio.

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.token_set_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
        83.8709716796875
        >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
        100.0
    """
    score = fuzz_cpp.token_set_ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def partial_token_set_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Compare the strings by their unique and common words using fuzz.partial_ratio.

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.partial_token_set_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    score = fuzz_cpp.partial_token_set_ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def token_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
    """
    Return the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
    (faster than running the two functions separately).

    Thin wrapper that supplies the default arguments and forwards the call
    to fuzz_cpp.token_ratio.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        score_cutoff (float): optional score threshold between 0 and 100.
            A ratio below score_cutoff is reported as 0. Defaults to 0.
        preprocess (bool): optional flag selecting whether both strings are
            preprocessed with utils.default_process. Defaults to True.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    score = fuzz_cpp.token_ratio(s1, s2, score_cutoff, preprocess)
    return score
|
||||
|
||||
|
||||
def partial_token_ratio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
|
||||
"""
|
||||
Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
|
||||
(faster than manually executing the two functions)
|
||||
|
||||
Args:
|
||||
s1 (str): first string to compare
|
||||
s2 (str): second string to compare
|
||||
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
|
||||
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
|
||||
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
|
||||
using utils.default_process. Defaults to True.
|
||||
|
||||
Returns:
|
||||
float: ratio between s1 and s2 as a float between 0 and 100
|
||||
"""
|
||||
return fuzz_cpp.partial_token_ratio(s1, s2, score_cutoff, preprocess)
|
||||
|
||||
|
||||
def QRatio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
|
||||
"""
|
||||
calculates a quick ratio between two strings using fuzz.ratio
|
||||
|
||||
Args:
|
||||
s1 (str): first string to compare
|
||||
s2 (str): second string to compare
|
||||
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
|
||||
For ratio < score_cutoff 0 is returned instead. Defaults to 0.
|
||||
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
|
||||
using utils.default_process. Defaults to True.
|
||||
|
||||
Returns:
|
||||
float: ratio between s1 and s2 as a float between 0 and 100
|
||||
|
||||
Example:
|
||||
>>> fuzz.ratio("this is a test", "this is a test!")
|
||||
96.55171966552734
|
||||
"""
|
||||
return fuzz_cpp.ratio(s1, s2, score_cutoff, preprocess)
|
||||
|
||||
|
||||
def WRatio(s1: str, s2: str, score_cutoff: float = 0, preprocess: bool = True):
|
||||
"""
|
||||
Calculates a weighted ratio based on the other ratio algorithms
|
||||
|
||||
Args:
|
||||
s1 (str): first string to compare
|
||||
s2 (str): second string to compare
|
||||
score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
|
||||
For ratio < score_cutoff 0 is returned instead of the ratio. Defaults to 0.
|
||||
preprocess (bool): Optional argument to specify whether the strings should be preprocessed
|
||||
using utils.default_process. Defaults to True.
|
||||
|
||||
Returns:
|
||||
float: ratio between s1 and s2 as a float between 0 and 100
|
||||
"""
|
||||
return fuzz_cpp.WRatio(s1, s2, score_cutoff, preprocess)
|
|
@ -1 +0,0 @@
|
|||
from _rapidfuzz_cpp.levenshtein import *
|
|
@ -1,4 +1,4 @@
|
|||
import _rapidfuzz_cpp.process
|
||||
import rapidfuzz._process
|
||||
from rapidfuzz import fuzz, utils
|
||||
from typing import Iterable, List, Tuple, Optional, Union, Callable
|
||||
import heapq
|
||||
|
@ -22,8 +22,8 @@ def extract(query: str, choices: Iterable, scorer: Callable = fuzz.WRatio, proce
|
|||
List[Tuple[str, float]]: returns a list of all matches that have a score >= score_cutoff
|
||||
|
||||
"""
|
||||
if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
|
||||
return _rapidfuzz_cpp.process.extract(query, list(choices), limit, score_cutoff, bool(processor))
|
||||
#if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
|
||||
# return _rapidfuzz_cpp.process.extract(query, list(choices), limit, score_cutoff, bool(processor))
|
||||
|
||||
# evaluate score inside python since scorer is a python function and so it would be required
|
||||
# to add the python layer from C++ as well
|
||||
|
@ -63,8 +63,8 @@ def extractOne(query: str, choices: Iterable, scorer: Callable = fuzz.WRatio, pr
|
|||
Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is
|
||||
no match with a score >= score_cutoff
|
||||
"""
|
||||
if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
|
||||
return _rapidfuzz_cpp.process.extractOne(query, list(choices), score_cutoff, bool(processor))
|
||||
#if (not scorer or scorer == fuzz.WRatio) and (not processor or processor == utils.default_process):
|
||||
# return _rapidfuzz_cpp.process.extractOne(query, list(choices), score_cutoff, bool(processor))
|
||||
|
||||
# evaluate score inside python since scorer is a python function and so it would be required
|
||||
# to add the python layer from C++ as well
|
||||
|
|
91
setup.py
91
setup.py
|
@ -1,7 +1,6 @@
|
|||
from setuptools import setup, Extension
|
||||
from setuptools.command.build_ext import build_ext
|
||||
import sys
|
||||
import setuptools
|
||||
|
||||
from os import path
|
||||
this_dir = path.abspath(path.dirname(__file__))
|
||||
|
@ -12,64 +11,11 @@ with open(path.join(this_dir, "VERSION"), encoding='utf-8') as version_file:
|
|||
with open(path.join(this_dir, 'README.md'), encoding='utf-8') as f:
|
||||
long_description = f.read()
|
||||
|
||||
|
||||
class get_pybind_include(object):
|
||||
"""Helper class to determine the pybind11 include path
|
||||
|
||||
The purpose of this class is to postpone importing pybind11
|
||||
until it is actually installed, so that the ``get_include()``
|
||||
method can be invoked. """
|
||||
|
||||
def __init__(self, user=False):
|
||||
self.user = user
|
||||
|
||||
def __str__(self):
|
||||
import pybind11
|
||||
return pybind11.get_include(self.user)
|
||||
|
||||
|
||||
ext_modules = [
|
||||
Extension(
|
||||
'_rapidfuzz_cpp',
|
||||
[
|
||||
'python/src/rapidfuzz.cpp',
|
||||
'cpp/src/fuzz.cpp',
|
||||
'cpp/src/process.cpp',
|
||||
'cpp/src/levenshtein.cpp',
|
||||
'cpp/src/utils.cpp'
|
||||
],
|
||||
include_dirs=[
|
||||
# Path to pybind11 headers
|
||||
get_pybind_include(),
|
||||
get_pybind_include(user=True),
|
||||
"cpp/src"
|
||||
],
|
||||
language='c++',
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# As of Python 3.6, CCompiler has a `has_flag` method.
|
||||
# cf http://bugs.python.org/issue26689
|
||||
def has_flag(compiler, flagname):
|
||||
"""Return a boolean indicating whether a flag name is supported on
|
||||
the specified compiler.
|
||||
"""
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f:
|
||||
f.write('int main (int argc, char **argv) { return 0; }')
|
||||
try:
|
||||
compiler.compile([f.name], extra_postargs=[flagname])
|
||||
except setuptools.distutils.errors.CompileError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class BuildExt(build_ext):
|
||||
"""A custom build extension for adding compiler-specific options."""
|
||||
c_opts = {
|
||||
'msvc': ['/EHsc', '/O2', '/std:c++17'],
|
||||
'unix': ['-O3', '-std=c++17'],
|
||||
'unix': ['-O3', '-std=c++17', '-Werror'],
|
||||
}
|
||||
l_opts = {
|
||||
'msvc': [],
|
||||
|
@ -87,8 +33,6 @@ class BuildExt(build_ext):
|
|||
link_opts = self.l_opts.get(ct, [])
|
||||
if ct == 'unix':
|
||||
opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
|
||||
if has_flag(self.compiler, '-fvisibility=hidden'):
|
||||
opts.append('-fvisibility=hidden')
|
||||
elif ct == 'msvc':
|
||||
opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
|
||||
for ext in self.extensions:
|
||||
|
@ -105,14 +49,31 @@ setup(
|
|||
description='rapid fuzzy string matching',
|
||||
long_description=long_description,
|
||||
long_description_content_type='text/markdown',
|
||||
ext_modules=ext_modules,
|
||||
install_requires=['pybind11>=2.4'],
|
||||
setup_requires=['pybind11>=2.4'],
|
||||
cmdclass={'build_ext': BuildExt},
|
||||
package_data={'': ['LICENSE', 'VERSION']},
|
||||
ext_modules = [
|
||||
Extension(
|
||||
'rapidfuzz.levenshtein',
|
||||
['python/src/py_levenshtein.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
|
||||
include_dirs=["cpp/src"],
|
||||
language='c++',
|
||||
),
|
||||
Extension(
|
||||
'rapidfuzz.fuzz',
|
||||
['python/src/py_fuzz.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
|
||||
include_dirs=["cpp/src"],
|
||||
language='c++',
|
||||
),
|
||||
Extension(
|
||||
'rapidfuzz._process',
|
||||
['python/src/py_process.cpp', 'cpp/src/process.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
|
||||
include_dirs=["cpp/src"],
|
||||
language='c++',
|
||||
),
|
||||
],
|
||||
cmdclass={'build_ext': BuildExt},
|
||||
package_data={'': ['LICENSE', 'VERSION']},
|
||||
package_dir={'': 'python/src'},
|
||||
packages=['rapidfuzz'],
|
||||
include_package_data=True,
|
||||
packages=['rapidfuzz'],
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
|
@ -123,4 +84,4 @@ setup(
|
|||
"License :: OSI Approved :: MIT License",
|
||||
],
|
||||
python_requires=">=3.5",
|
||||
)
|
||||
)
|
Loading…
Reference in New Issue