diff --git a/MANIFEST.in b/MANIFEST.in
index ce0afe3..afa31b5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,6 @@
 include README.md
 include VERSION
 include LICENSE
-recursive-include cpp/src *.hpp
+recursive-include cpp/src *.hpp *.txx
 recursive-include cpp/extern/boost *
 recursive-include python/src *.hpp
diff --git a/VERSION b/VERSION
index 844f6a9..d2b13eb 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.6.3
+0.6.4
diff --git a/cpp/src/fuzz.hpp b/cpp/src/fuzz.hpp
index 7614732..c5257ef 100644
--- a/cpp/src/fuzz.hpp
+++ b/cpp/src/fuzz.hpp
@@ -3,23 +3,110 @@
 #include "utils.hpp"
 
 namespace fuzz {
-percent ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
-percent partial_ratio(boost::wstring_view s1, boost::wstring_view s2, percent score_cutoff = 0);
+template <typename CharT>
+percent ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff = 0);
 
-percent token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff = 0);
-percent partial_token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff = 0);
+template <typename CharT>
+percent ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
 
-percent token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
-percent partial_token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
+template <typename CharT>
+percent partial_ratio(
+    boost::basic_string_view<CharT> s1,
+    boost::basic_string_view<CharT> s2,
+    percent score_cutoff = 0);
 
-percent token_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
-percent partial_token_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
+template <typename CharT>
+percent partial_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
 
+template <typename CharT>
+percent token_sort_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff = 0);
 
-std::size_t bitmap_distance(const Sentence& s1, const Sentence& s2);
-percent bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
-percent length_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
-percent quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
+template <typename CharT>
+percent token_sort_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
 
-percent WRatio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
+template <typename CharT>
+percent partial_token_sort_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent partial_token_sort_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent token_set_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent token_set_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent partial_token_set_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent partial_token_set_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent token_ratio(
+    const Sentence<CharT>& s1,
+    const Sentence<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent partial_token_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+percent partial_token_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff = 0);
+
+template <typename CharT>
+std::size_t bitmap_distance(const Sentence<CharT>& s1, const Sentence<CharT>& s2);
+
+template <typename CharT>
+percent bitmap_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
+
+template <typename CharT>
+percent length_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
+
+template <typename CharT>
+percent quick_lev_estimate(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
+
+template <typename CharT>
+percent WRatio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
 }
+
+#include "fuzz.txx"
diff --git a/cpp/src/fuzz.cpp b/cpp/src/fuzz.txx
similarity index 66%
rename from cpp/src/fuzz.cpp
rename to cpp/src/fuzz.txx
index b6aac16..40b9d6d 100644
--- a/cpp/src/fuzz.cpp
+++ b/cpp/src/fuzz.txx
@@ -7,7 +7,33 @@
 #include
 #include
 
-percent fuzz::partial_ratio(boost::wstring_view s1, boost::wstring_view s2, percent score_cutoff)
+template <typename CharT>
+inline percent fuzz::ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff)
+{
+    double result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
+    return utils::result_cutoff(result * 100, score_cutoff);
+}
+
+template <typename CharT>
+inline percent fuzz::ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
+{
+    return ratio(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        score_cutoff);
+}
+
+template <typename CharT>
+inline percent fuzz::partial_ratio(
+    boost::basic_string_view<CharT> s1,
+    boost::basic_string_view<CharT> s2,
+    percent score_cutoff)
 {
     if (s1.empty() || s2.empty() || score_cutoff > 100) {
         return 0;
@@ -37,21 +63,104 @@ percent fuzz::partial_ratio(boost::wstring_view s1, boost::wstring_view s2, perc
     return utils::result_cutoff(max_ratio * 100, score_cutoff);
 }
 
-percent fuzz::ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
+template <typename CharT>
+inline percent fuzz::partial_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
 {
-    double result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
-    return utils::result_cutoff(result * 100, score_cutoff);
+    return partial_ratio(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        score_cutoff);
 }
 
-percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff)
+template <typename CharT>
+percent _token_sort(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    bool partial,
+    percent score_cutoff = 0.0)
 {
     if (score_cutoff > 100) {
         return 0;
     }
 
-    std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1.sentence);
+    string_view_vec<CharT> tokens_a = utils::splitSV(s1);
     std::sort(tokens_a.begin(), tokens_a.end());
-    std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2.sentence);
+    string_view_vec<CharT> tokens_b = utils::splitSV(s2);
+    std::sort(tokens_b.begin(), tokens_b.end());
+
+    if (partial) {
+        return fuzz::partial_ratio(
+            utils::join(tokens_a),
+            utils::join(tokens_b),
+            score_cutoff);
+    }
+    else {
+        double result = levenshtein::normalized_weighted_distance(
+            utils::join(tokens_a),
+            utils::join(tokens_b),
+            score_cutoff / 100);
+        return utils::result_cutoff(result * 100, score_cutoff);
+    }
+}
+
+template <typename CharT>
+percent fuzz::token_sort_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff)
+{
+    return _token_sort(s1, s2, false, score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::token_sort_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
+{
+    return _token_sort(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        false, score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::partial_token_sort_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff)
+{
+    return _token_sort(s1, s2, true, score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::partial_token_sort_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
+{
+    return _token_sort(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        true, score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::token_set_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff)
+{
+    if (score_cutoff > 100) {
+        return 0;
+    }
+
+    string_view_vec<CharT> tokens_a = utils::splitSV(s1);
+    std::sort(tokens_a.begin(), tokens_a.end());
+    string_view_vec<CharT> tokens_b = utils::splitSV(s2);
     std::sort(tokens_b.begin(), tokens_b.end());
 
     auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
@@ -59,8 +168,124 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
     auto difference_ab = decomposition.difference_ab;
     auto difference_ba = decomposition.difference_ba;
 
-    std::wstring diff_ab_joined = utils::join(difference_ab);
-    std::wstring diff_ba_joined = utils::join(difference_ba);
+    std::basic_string<CharT> diff_ab_joined = utils::join(difference_ab);
+    std::basic_string<CharT> diff_ba_joined = utils::join(difference_ba);
+
+    std::size_t ab_len = diff_ab_joined.length();
+    std::size_t ba_len = diff_ba_joined.length();
+    std::size_t sect_len = utils::joined_size(intersection);
+
+    // exit early since this will always result in a ratio of 1
+    if (sect_len && (!ab_len || !ba_len)) {
+        return 100;
+    }
+
+    // string length sect+ab <-> sect and sect+ba <-> sect
+    std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
+    std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
+
+    std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
+    double result = 0;
+    if (sect_distance != std::numeric_limits<std::size_t>::max()) {
+        result = std::max(result, 1.0 - sect_distance / static_cast<double>(sect_ab_lensum + sect_ba_lensum));
+    }
+
+    // exit early since the other ratios are 0
+    if (!sect_len) {
+        return utils::result_cutoff(result * 100, score_cutoff);
+    }
+
+    // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
+    // would exit early after removing the prefix sect, so the distance can be directly calculated
+    std::size_t sect_ab_distance = !!sect_len + ab_len;
+    std::size_t sect_ba_distance = !!sect_len + ba_len;
+
+    result = std::max({ result,
+        1.0 - sect_ab_distance / static_cast<double>(sect_len + sect_ab_lensum),
+        1.0 - sect_ba_distance / static_cast<double>(sect_len + sect_ba_lensum) });
+    return utils::result_cutoff(result * 100, score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::token_set_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
+{
+    return token_set_ratio(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::partial_token_set_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff)
+{
+    if (score_cutoff > 100) {
+        return 0;
+    }
+
+    string_view_vec<CharT> tokens_a = utils::splitSV(s1);
+    std::sort(tokens_a.begin(), tokens_a.end());
+    string_view_vec<CharT> tokens_b = utils::splitSV(s2);
+    std::sort(tokens_b.begin(), tokens_b.end());
+
+    tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
+    tokens_b.erase(std::unique(tokens_b.begin(), tokens_b.end()), tokens_b.end());
+
+    string_view_vec<CharT> difference_ab;
+    string_view_vec<CharT> difference_ba;
+
+    std::set_difference(tokens_a.begin(), tokens_a.end(), tokens_b.begin(), tokens_b.end(),
+        std::inserter(difference_ab, difference_ab.begin()));
+    std::set_difference(tokens_b.begin(), tokens_b.end(), tokens_a.begin(), tokens_a.end(),
+        std::inserter(difference_ba, difference_ba.begin()));
+
+    // exit early when there is a common word in both sequences
+    if (difference_ab.size() < tokens_a.size()) {
+        return 100;
+    }
+
+    return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::partial_token_set_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
+{
+    return partial_token_set_ratio(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        score_cutoff);
+}
+
+template <typename CharT>
+percent fuzz::token_ratio(
+    const Sentence<CharT>& s1,
+    const Sentence<CharT>& s2,
+    percent score_cutoff)
+{
+    if (score_cutoff > 100) {
+        return 0;
+    }
+
+    string_view_vec<CharT> tokens_a = utils::splitSV(s1.sentence);
+    std::sort(tokens_a.begin(), tokens_a.end());
+    string_view_vec<CharT> tokens_b = utils::splitSV(s2.sentence);
+    std::sort(tokens_b.begin(), tokens_b.end());
+
+    auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
+    auto intersection = decomposition.intersection;
+    auto difference_ab = decomposition.difference_ab;
+    auto difference_ba = decomposition.difference_ba;
+
+    std::basic_string<CharT> diff_ab_joined = utils::join(difference_ab);
+    std::basic_string<CharT> diff_ba_joined = utils::join(difference_ba);
 
     std::size_t ab_len = diff_ab_joined.length();
     std::size_t ba_len = diff_ba_joined.length();
@@ -83,8 +308,8 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
     std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
     std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
 
-    Sentence diff_ab{diff_ab_joined, bitmap_create(diff_ab_joined)};
-    Sentence diff_ba{diff_ba_joined, bitmap_create(diff_ba_joined)};
+    Sentence<CharT> diff_ab{diff_ab_joined, utils::bitmap_create(diff_ab_joined)};
+    Sentence<CharT> diff_ba{diff_ba_joined, utils::bitmap_create(diff_ba_joined)};
     double bm_ratio = 1.0 - bitmap_distance(diff_ab, diff_ba) / static_cast<double>(sect_ab_lensum + sect_ba_lensum);
     if (bm_ratio >= score_cutoff) {
         std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
@@ -111,7 +336,11 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
 
 // combines token_set and token_sort ratio from fuzzywuzzy so it is only required to
 // do a lot of operations once
-percent fuzz::partial_token_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
+template <typename CharT>
+percent fuzz::partial_token_ratio(
+    const boost::basic_string_view<CharT>& s1,
+    const boost::basic_string_view<CharT>& s2,
+    percent score_cutoff)
 {
     if (score_cutoff > 100) {
         return 0;
@@ -152,127 +381,21 @@ percent fuzz::partial_token_ratio(const boost::wstring_view& s1, const boost::ws
         partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff));
 }
 
-percent _token_sort(const boost::wstring_view& s1, const boost::wstring_view& s2, bool partial, percent score_cutoff = 0.0)
+template <typename CharT>
+percent fuzz::partial_token_ratio(
+    const std::basic_string<CharT>& s1,
+    const std::basic_string<CharT>& s2,
+    percent score_cutoff)
 {
-    if (score_cutoff > 100) {
-        return 0;
-    }
-
-    std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
-    std::sort(tokens_a.begin(), tokens_a.end());
-    std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
-    std::sort(tokens_b.begin(), tokens_b.end());
-
-    if (partial) {
-        return fuzz::partial_ratio(
-            utils::join(tokens_a),
-            utils::join(tokens_b),
-            score_cutoff);
-    }
-    else {
-        double result = levenshtein::normalized_weighted_distance(
-            utils::join(tokens_a),
-            utils::join(tokens_b),
-            score_cutoff / 100);
-        return utils::result_cutoff(result * 100, score_cutoff);
-    }
+    return partial_token_ratio(
+        boost::basic_string_view<CharT>(s1),
+        boost::basic_string_view<CharT>(s2),
+        score_cutoff);
 }
 
-percent fuzz::token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff)
+template <typename CharT>
+std::size_t fuzz::bitmap_distance(const Sentence<CharT>& s1, const Sentence<CharT>& s2)
 {
-    return _token_sort(a, b, false, score_cutoff);
-}
-
-percent fuzz::partial_token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff)
-{
-    return _token_sort(a, b, true, score_cutoff);
-}
-
-percent fuzz::token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
-{
-    if (score_cutoff > 100) {
-        return 0;
-    }
-
-    std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
-    std::sort(tokens_a.begin(), tokens_a.end());
-    std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
-    std::sort(tokens_b.begin(), tokens_b.end());
-
-    auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
-    auto intersection = decomposition.intersection;
-    auto difference_ab = decomposition.difference_ab;
-    auto difference_ba = decomposition.difference_ba;
-
-    std::wstring diff_ab_joined = utils::join(difference_ab);
-    std::wstring diff_ba_joined = utils::join(difference_ba);
-
-    std::size_t ab_len = diff_ab_joined.length();
-    std::size_t ba_len = diff_ba_joined.length();
-    std::size_t sect_len = utils::joined_size(intersection);
-
-    // exit early since this will always result in a ratio of 1
-    if (sect_len && (!ab_len || !ba_len)) {
-        return 100;
-    }
-
-    // string length sect+ab <-> sect and sect+ba <-> sect
-    std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
-    std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
-
-    std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
-    double result = 0;
-    if (sect_distance != std::numeric_limits<std::size_t>::max()) {
-        result = std::max(result, 1.0 - sect_distance / static_cast<double>(sect_ab_lensum + sect_ba_lensum));
-    }
-
-    // exit early since the other ratios are 0
-    if (!sect_len) {
-        return utils::result_cutoff(result * 100, score_cutoff);
-    }
-
-    // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
-    // would exit early after removing the prefix sect, so the distance can be directly calculated
-    std::size_t sect_ab_distance = !!sect_len + ab_len;
-    std::size_t sect_ba_distance = !!sect_len + ba_len;
-
-    result = std::max({ result,
-        1.0 - sect_ab_distance / static_cast<double>(sect_len + sect_ab_lensum),
-        1.0 - sect_ba_distance / static_cast<double>(sect_len + sect_ba_lensum) });
-    return utils::result_cutoff(result * 100, score_cutoff);
-}
-
-percent fuzz::partial_token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
-{
-    if (score_cutoff > 100) {
-        return 0;
-    }
-
-    std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
-    std::sort(tokens_a.begin(), tokens_a.end());
-    std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
-    std::sort(tokens_b.begin(), tokens_b.end());
-
-    tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
-    tokens_b.erase(std::unique(tokens_b.begin(), tokens_b.end()), tokens_b.end());
-
-    std::vector<boost::wstring_view> difference_ab;
-    std::vector<boost::wstring_view> difference_ba;
-
-    std::set_difference(tokens_a.begin(), tokens_a.end(), tokens_b.begin(), tokens_b.end(),
-        std::inserter(difference_ab, difference_ab.begin()));
-    std::set_difference(tokens_b.begin(), tokens_b.end(), tokens_a.begin(), tokens_a.end(),
-        std::inserter(difference_ba, difference_ba.begin()));
-
-    // exit early when there is a common word in both sequences
-    if (difference_ab.size() < tokens_a.size()) {
-        return 100;
-    }
-
-    return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
-}
-
-std::size_t fuzz::bitmap_distance(const Sentence& s1, const Sentence& s2) {
     uint64_t bitmap1 = s1.bitmap;
     uint64_t bitmap2 = s2.bitmap;
@@ -287,7 +410,9 @@ std::size_t fuzz::bitmap_distance(const Sentence& s1, const Sentence& s2) {
     return distance;
 }
 
-percent fuzz::bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
+template <typename CharT>
+percent fuzz::bitmap_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
+{
     std::size_t distance = bitmap_distance(s1, s2);
     std::size_t lensum = s1.sentence.length() + s2.sentence.length();
     percent result = 1.0 - static_cast<double>(distance) / lensum;
@@ -296,7 +421,9 @@ percent fuzz::bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score
 }
 
 
-percent fuzz::length_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
+template <typename CharT>
+percent fuzz::length_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
+{
     std::size_t s1_len = s1.sentence.length();
     std::size_t s2_len = s2.sentence.length();
     std::size_t distance = (s1_len > s2_len)
@@ -308,7 +435,9 @@ percent fuzz::length_ratio(const Sentence& s1, const Sentence& s2, percent score
     return utils::result_cutoff(result * 100, score_cutoff);
 }
 
-percent fuzz::quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
+template <typename CharT>
+percent fuzz::quick_lev_estimate(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
+{
     if (s1.bitmap || s2.bitmap) {
         return bitmap_ratio(s1, s2, score_cutoff);
     } else {
@@ -316,7 +445,8 @@ percent fuzz::quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent
     }
 }
 
-percent fuzz::WRatio(const Sentence& s1, const Sentence& s2, percent score_cutoff)
+template <typename CharT>
+percent fuzz::WRatio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
 {
     if (score_cutoff > 100) {
         return 0;
diff --git a/cpp/src/levenshtein.hpp b/cpp/src/levenshtein.hpp
index b8eb573..673d284 100644
--- a/cpp/src/levenshtein.hpp
+++ b/cpp/src/levenshtein.hpp
@@ -19,17 +19,6 @@ enum EditType {
     EditDelete,
 };
 
-struct EditOp {
-    EditType op_type;
-    std::size_t first_start;
-    std::size_t second_start;
-    EditOp(EditType op_type, std::size_t first_start, std::size_t second_start)
-        : op_type(op_type)
-        , first_start(first_start)
-        , second_start(second_start)
-    {}
-};
-
 struct Matrix {
     std::size_t prefix_len;
     std::vector<std::size_t> matrix;
@@ -37,10 +26,6 @@ struct Matrix {
     std::size_t matrix_rows;
 };
 
-Matrix matrix(boost::wstring_view sentence1, boost::wstring_view sentence2);
-
-std::vector<EditOp> editops(boost::wstring_view sentence1, boost::wstring_view sentence2);
-
 struct MatchingBlock {
     std::size_t first_start;
     std::size_t second_start;
@@ -52,11 +37,49 @@ struct MatchingBlock {
     {}
 };
 
-std::vector<MatchingBlock> matching_blocks(boost::wstring_view sentence1, boost::wstring_view sentence2);
-double normalized_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, double min_ratio = 0.0);
+template <typename CharT>
+Matrix matrix(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2);
+
+template <typename CharT>
+Matrix matrix(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2);
+
+template <typename CharT>
+std::vector<MatchingBlock> matching_blocks(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2);
+
+template <typename CharT>
+std::vector<MatchingBlock> matching_blocks(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2);
+
+template <typename CharT>
+double normalized_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2,
+    double min_ratio = 0.0);
+
+template <typename CharT>
+double normalized_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2,
+    double min_ratio = 0.0);
+
+template <typename CharT>
+std::size_t distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2);
+
+template <typename CharT>
+std::size_t distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2);
 
-std::size_t distance(boost::wstring_view sentence1, boost::wstring_view sentence2);
 
 /**
  * Calculates the minimum number of insertions, deletions, and substitutions
@@ -74,13 +97,43 @@ std::size_t distance(boost::wstring_view sentence1, boost::wstring_view sentence
  * @param sentence2 second sentence to match (can be either a string type or a vector of strings)
  * @return weighted levenshtein distance
  */
-std::size_t weighted_distance(boost::wstring_view sentence1, boost::wstring_view sentence2);
+template <typename CharT>
+std::size_t weighted_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2);
 
-std::size_t generic_distance(boost::wstring_view source, boost::wstring_view target, WeightTable weights = { 1, 1, 1 });
+template <typename CharT>
+std::size_t weighted_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2);
+
+template <typename CharT>
+std::size_t generic_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2,
+    WeightTable weights = { 1, 1, 1 });
+
+template <typename CharT>
+std::size_t generic_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2,
+    WeightTable weights = { 1, 1, 1 });
 
 /**
  * Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
  * 1.0 (inclusive), where 1.0 means the sequences are the same.
  */
-double normalized_weighted_distance(const boost::wstring_view& sentence1, const boost::wstring_view& sentence2, double min_ratio = 0.0);
+template <typename CharT>
+double normalized_weighted_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2,
+    double min_ratio = 0.0);
+
+template <typename CharT>
+double normalized_weighted_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2,
+    double min_ratio = 0.0);
+
 }
+#include "levenshtein.txx"
diff --git a/cpp/src/levenshtein.cpp b/cpp/src/levenshtein.txx
similarity index 57%
rename from cpp/src/levenshtein.cpp
rename to cpp/src/levenshtein.txx
index d5f049a..c90d7f0 100644
--- a/cpp/src/levenshtein.cpp
+++ b/cpp/src/levenshtein.txx
@@ -2,7 +2,11 @@
 #include
 #include
 
-levenshtein::Matrix levenshtein::matrix(boost::wstring_view sentence1, boost::wstring_view sentence2)
+
+template <typename CharT>
+levenshtein::Matrix levenshtein::matrix(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2)
 {
     Affix affix = utils::remove_common_affix(sentence1, sentence2);
 
@@ -42,20 +46,20 @@ levenshtein::Matrix levenshtein::matrix(boost::wstring_view sentence1, boost::ws
     };
 }
 
-std::vector<levenshtein::EditOp> levenshtein::editops(boost::wstring_view sentence1, boost::wstring_view sentence2)
+template <typename CharT>
+levenshtein::Matrix levenshtein::matrix(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2)
 {
-    auto m = matrix(sentence1, sentence2);
-    std::size_t matrix_columns = m.matrix_columns;
-    std::size_t matrix_rows = m.matrix_rows;
-    std::size_t prefix_len = m.prefix_len;
-    auto lev_matrix = m.matrix;
+    return matrix(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2));
+}
 
-    std::vector<EditOp> ops;
-    ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);
-
-    std::size_t i = matrix_columns - 1;
-    std::size_t j = matrix_rows - 1;
-    std::size_t position = matrix_columns * matrix_rows - 1;
+levenshtein::EditType get_EditType(levenshtein::Matrix matrix, std::size_t row, std::size_t column)
+{
+    auto lev_matrix = matrix.matrix;
+    std::size_t matrix_rows = matrix.matrix_rows;
 
     auto is_replace = [=](std::size_t pos) {
         return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
@@ -70,58 +74,67 @@ std::vector<levenshtein::EditOp> levenshtein::editops(boost::wstring_view senten
         return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
     };
 
-    while (i > 0 || j > 0) {
-        EditType op_type;
+    std::size_t position = column*matrix_rows + row;
 
-        if (i && j && is_replace(position)) {
-            op_type = EditType::EditReplace;
-            --i;
-            --j;
-            position -= matrix_rows + 1;
-        } else if (j && is_insert(position)) {
-            op_type = EditType::EditInsert;
-            --j;
-            --position;
-        } else if (i && is_delete(position)) {
-            op_type = EditType::EditDelete;
-            --i;
-            position -= matrix_rows;
-        } else if (is_keep(position)) {
-            --i;
-            --j;
-            position -= matrix_rows + 1;
-            // EditKeep does not has to be stored
-            continue;
-        } else {
-            throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
-        }
-
-        ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
+    if (column && row && is_replace(position)) {
+        return levenshtein::EditType::EditReplace;
+    } else if (row && is_insert(position)) {
+        return levenshtein::EditType::EditInsert;
+    } else if (column && is_delete(position)) {
+        return levenshtein::EditType::EditDelete;
+    } else if (is_keep(position)) {
+        return levenshtein::EditType::EditKeep;
+    } else {
+        throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
     }
-
-    std::reverse(ops.begin(), ops.end());
-    return ops;
 }
 
-std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(boost::wstring_view sentence1, boost::wstring_view sentence2)
+template <typename CharT>
+std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2)
 {
-    auto edit_ops = editops(sentence1, sentence2);
+    auto m = matrix(sentence1, sentence2);
+    std::size_t prefix_len = m.prefix_len;
+
+    // current position in the the levenshtein matrix
+    std::size_t matrix_column = m.matrix_columns - 1;
+    std::size_t matrix_row = m.matrix_rows - 1;
+
     std::size_t first_start = 0;
     std::size_t second_start = 0;
     std::vector<MatchingBlock> mblocks;
+    mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
 
-    for (const auto& op : edit_ops) {
-        if (op.op_type == EditType::EditKeep) {
+    while (matrix_column > 0 || matrix_row > 0) {
+        EditType op_type = get_EditType(m, matrix_row, matrix_column);
+
+        switch (op_type) {
+        case EditType::EditReplace:
+            --matrix_column;
+            --matrix_row;
+            break;
+        case EditType::EditInsert:
+            --matrix_row;
+            break;
+        case EditType::EditDelete:
+            --matrix_column;
+            break;
+        case EditType::EditKeep:
+            --matrix_column;
+            --matrix_row;
             continue;
         }
 
-        if (first_start < op.first_start || second_start < op.second_start) {
-            mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
-            first_start = op.first_start;
-            second_start = op.second_start;
+        std::size_t cur_first_start = matrix_column + prefix_len;
+        std::size_t cur_second_start = matrix_row + prefix_len;
+        if (first_start < cur_first_start || second_start < cur_second_start) {
+            mblocks.emplace_back(first_start, second_start, cur_first_start - first_start);
+            first_start = cur_first_start;
+            second_start = cur_second_start;
         }
 
-        switch (op.op_type) {
+        switch (op_type) {
         case EditType::EditReplace:
             first_start += 1;
             second_start += 1;
@@ -132,16 +145,29 @@ std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(boost::wstr
         case EditType::EditInsert:
             second_start += 1;
             break;
-        case EditType::EditKeep:
+        default:
             break;
         }
     }
 
-    mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
+    std::reverse(mblocks.begin(), mblocks.end());
     return mblocks;
 }
 
-std::size_t levenshtein::distance(boost::wstring_view sentence1, boost::wstring_view sentence2)
+template <typename CharT>
+std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2)
+{
+    return matching_blocks(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2));
+}
+
+template <typename CharT>
+std::size_t levenshtein::weighted_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2)
 {
     utils::remove_common_affix(sentence1, sentence2);
 
@@ -174,7 +200,20 @@ std::size_t levenshtein::distance(boost::wstring_view sentence1, boost::wstring_
     return cache.back();
 }
 
-std::size_t levenshtein::weighted_distance(boost::wstring_view sentence1, boost::wstring_view sentence2)
+template <typename CharT>
+std::size_t levenshtein::weighted_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2)
+{
+    return weighted_distance(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2));
+}
+
+template <typename CharT>
+std::size_t levenshtein::distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2)
 {
     utils::remove_common_affix(sentence1, sentence2);
 
@@ -214,7 +253,21 @@ std::size_t levenshtein::weighted_distance(boost::wstring_view sentence1, boost:
     return cache.back();
 }
 
-std::size_t levenshtein::generic_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, WeightTable weights)
+template <typename CharT>
+std::size_t levenshtein::distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2)
+{
+    return distance(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2));
+}
+
+template <typename CharT>
+std::size_t levenshtein::generic_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2,
+    WeightTable weights)
 {
     utils::remove_common_affix(sentence1, sentence2);
     if (sentence1.size() > sentence2.size()) {
@@ -248,7 +301,23 @@ std::size_t levenshtein::generic_distance(boost::wstring_view sentence1, boost::
     return cache.back();
 }
 
-double levenshtein::normalized_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, double min_ratio)
+template <typename CharT>
+std::size_t levenshtein::generic_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2,
+    WeightTable weights)
+{
+    return generic_distance(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2),
+        weights);
+}
+
+template <typename CharT>
+double levenshtein::normalized_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2,
+    double min_ratio)
 {
     if (sentence1.empty() || sentence2.empty()) {
         return sentence1.empty() && sentence2.empty();
@@ -275,7 +344,24 @@ double levenshtein::normalized_distance(boost::wstring_view sentence1, boost::ws
     return (ratio >= min_ratio) ? ratio : 0.0;
 }
 
-double levenshtein::normalized_weighted_distance(const boost::wstring_view& sentence1, const boost::wstring_view& sentence2, double min_ratio)
+template <typename CharT>
+double levenshtein::normalized_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2,
+    double min_ratio)
+{
+    return normalized_distance(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2),
+        min_ratio);
+}
+
+
+template <typename CharT>
+double levenshtein::normalized_weighted_distance(
+    boost::basic_string_view<CharT> sentence1,
+    boost::basic_string_view<CharT> sentence2,
+    double min_ratio)
 {
     if (sentence1.empty() || sentence2.empty()) {
         return sentence1.empty() && sentence2.empty();
@@ -304,3 +390,15 @@ double levenshtein::normalized_weighted_distance(const boost::wstring_view& sent
     double ratio = 1.0 - static_cast<double>(dist) / lensum;
     return (ratio >= min_ratio) ? ratio : 0.0;
 }
+
+template <typename CharT>
+double levenshtein::normalized_weighted_distance(
+    const std::basic_string<CharT>& sentence1,
+    const std::basic_string<CharT>& sentence2,
+    double min_ratio)
+{
+    return normalized_weighted_distance(
+        boost::basic_string_view<CharT>(sentence1),
+        boost::basic_string_view<CharT>(sentence2),
+        min_ratio);
+}
diff --git a/cpp/src/process.cpp b/cpp/src/process.cpp
index da7ddc0..c2fef43 100644
--- a/cpp/src/process.cpp
+++ b/cpp/src/process.cpp
@@ -15,7 +15,7 @@ process::extract(const std::wstring& query, const std::vector<std::wstring>& cho
 
     for (const auto& choice : choices) {
         std::wstring b = (preprocess) ? utils::default_process(choice) : choice;
-        double score = fuzz::WRatio({query}, {choice}, score_cutoff);
+        double score = fuzz::WRatio(Sentence<wchar_t>(query), Sentence<wchar_t>(choice), score_cutoff);
         if (score >= score_cutoff) {
             results.emplace_back(std::make_pair(choice, score));
         }
@@ -46,7 +46,7 @@ process::extractOne(const std::wstring& query, const std::vector<std::wstring>&
 
     for (const auto& choice : choices) {
         std::wstring b = (preprocess) ? utils::default_process(choice) : choice;
-        double score = fuzz::WRatio({a}, {b}, score_cutoff);
+        double score = fuzz::WRatio(Sentence<wchar_t>(a), Sentence<wchar_t>(b), score_cutoff);
         if (score >= score_cutoff) {
             score_cutoff = score;
             match_found = true;
diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp
deleted file mode 100644
index 63f4b8d..0000000
--- a/cpp/src/utils.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-#include "utils.hpp"
-#include
-#include
-
-/**
- * Finds the longest common prefix between two ranges
- */
-template <typename InputIterator1, typename InputIterator2>
-inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
-                                 InputIterator2 first2, InputIterator2 last2)
-{
-    return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
-}
-
-/**
- * Removes common prefix of two string views
- */
-std::size_t remove_common_prefix(boost::wstring_view& a, boost::wstring_view& b)
-{
-    auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
-    a.remove_prefix(prefix);
-    b.remove_prefix(prefix);
-    return prefix;
-}
-
-/**
- * Removes common suffix of two string views
- */
-std::size_t remove_common_suffix(boost::wstring_view& a, boost::wstring_view& b)
-{
-    auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
-    a.remove_suffix(suffix);
-    b.remove_suffix(suffix);
-    return suffix;
-}
-
-/**
- * Removes common affix of two string views
- */
-Affix utils::remove_common_affix(boost::wstring_view& a, boost::wstring_view& b)
-{
-    return Affix{
-        remove_common_prefix(a, b),
-        remove_common_suffix(a, b)
-    };
-}
-
-template <typename T>
-void vec_remove_common_affix(T& a, T& b)
-{
-    auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
-    a.erase(a.begin(), prefix.first);
-    b.erase(b.begin(), prefix.second);
-
-    auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
-    a.erase(a.end() - suffix, a.end());
-    b.erase(b.end() - suffix, b.end());
-}
-
-void utils::remove_common_affix(std::vector<boost::wstring_view>& a, std::vector<boost::wstring_view>& b)
-{
-    vec_remove_common_affix(a, b);
-    if (!a.empty() && !b.empty()) {
-        remove_common_prefix(a.front(), b.front());
-        remove_common_suffix(a.back(), b.back());
-    }
-}
-
-std::wstring utils::join(const std::vector<boost::wstring_view>& sentence)
-{
-    if (sentence.empty()) {
-        return std::wstring();
-    }
-
-    auto sentence_iter = sentence.begin();
-    std::wstring result{ *sentence_iter };
-    const std::wstring whitespace{ 0x20 };
-    ++sentence_iter;
-    for (; sentence_iter != sentence.end(); ++sentence_iter) {
-        result.append(whitespace).append(std::wstring{ *sentence_iter });
-    }
-    return result;
-}
-
-percent utils::result_cutoff(double result, percent score_cutoff)
-{
-    return (result >= score_cutoff) ? result : 0;
-}
-
-// trim from start (in place)
-void ltrim(std::wstring& s)
-{
-    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](const wchar_t &ch) {
-        return !std::iswspace(ch);
-    }));
-}
-
-// trim from end (in place)
-void rtrim(std::wstring& s)
-{
-    s.erase(std::find_if(s.rbegin(), s.rend(), [](const wchar_t &ch) {
-        return !std::iswspace(ch);
-    }).base(), s.end());
-}
-
-// trim from both ends (in place)
-void utils::trim(std::wstring& s)
-{
-    ltrim(s);
-    rtrim(s);
-}
-
-void utils::lower_case(std::wstring& s)
-{
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-}
-
-std::wstring utils::default_process(std::wstring s)
-{
-    // replace embedded null terminators
-    std::replace( s.begin(), s.end(), {'\x00'}, ' ');
-    trim(s);
-    lower_case(s);
-    return s;
-}
-
-DecomposedSet utils::set_decomposition(std::vector<boost::wstring_view> a, std::vector<boost::wstring_view> b)
-{
-    std::vector<boost::wstring_view> intersection;
-    std::vector<boost::wstring_view> difference_ab;
-    a.erase(std::unique(a.begin(), a.end()), a.end());
-    b.erase(std::unique(b.begin(), b.end()), b.end());
-
-    for (const auto& current_a : a) {
-        auto element_b = std::find(b.begin(), b.end(), current_a);
-        if (element_b != b.end()) {
-            b.erase(element_b);
-            intersection.emplace_back(current_a);
-        } else {
-            difference_ab.emplace_back(current_a);
-        }
-    }
-
-    return DecomposedSet{ intersection, difference_ab, b };
-}
-
-std::size_t utils::joined_size(const std::vector<boost::wstring_view>& x)
-{
-    if (x.empty()) {
-        return 0;
-    }
-
-    // there is a whitespace between each word
-    std::size_t result = x.size() - 1;
-    for (const auto& y : x) {
-        result += y.size();
-    }
-
-    return result;
-}
-
-std::vector<boost::wstring_view> utils::splitSV(const boost::wstring_view& str)
-{
-    std::vector<boost::wstring_view> output;
-    // assume a word length of 6 + 1 whitespace
-    output.reserve(str.size() / 7);
-
-    auto first = str.data(), second = str.data(), last = first + str.size();
-    for (; second != last && first != last; first = second + 1) {
-        // maybe use localisation
-        second = std::find_if(first, last, [](const wchar_t &c) { return std::iswspace(c); });
-
-        if (first != second) {
-            output.emplace_back(first, second - first);
-        }
-    }
-
-    return output;
-}
diff --git a/cpp/src/utils.hpp b/cpp/src/utils.hpp
index 1f4ef4f..c827da5 100644
--- a/cpp/src/utils.hpp
+++ b/cpp/src/utils.hpp
@@ -5,21 +5,29 @@
 /* 0.0% - 100.0% */
 using percent = double;
 
+template <typename CharT>
+using string_view_vec = std::vector<boost::basic_string_view<CharT>>;
+
+template <typename CharT>
 struct Sentence {
-    boost::wstring_view sentence;
+    boost::basic_string_view<CharT> sentence;
     uint64_t bitmap = 0;
-    Sentence(boost::wstring_view sentence, uint64_t bitmap)
-        : sentence(sentence), bitmap(bitmap) {}
-    Sentence(boost::wstring_view sentence)
-        : sentence(sentence), bitmap(0) {}
+    Sentence(boost::basic_string_view<CharT> sentence, uint64_t bitmap)
+        : sentence(sentence), bitmap(bitmap) {}
+    Sentence(boost::basic_string_view<CharT> sentence)
+        : sentence(sentence), bitmap(0) {}
+    Sentence(std::basic_string<CharT> sentence, uint64_t bitmap)
+        : sentence(boost::basic_string_view<CharT>(sentence)), bitmap(bitmap) {}
+    Sentence(std::basic_string<CharT> sentence)
+        : sentence(boost::basic_string_view<CharT>(sentence)), bitmap(0) {}
 };
 
+template <typename CharT>
 struct DecomposedSet {
-    std::vector<boost::wstring_view> intersection;
-    std::vector<boost::wstring_view> difference_ab;
-    std::vector<boost::wstring_view> difference_ba;
-    DecomposedSet(std::vector<boost::wstring_view> intersection, std::vector<boost::wstring_view> difference_ab, std::vector<boost::wstring_view> difference_ba)
+    string_view_vec<CharT> intersection;
+    string_view_vec<CharT> difference_ab;
+    string_view_vec<CharT> difference_ba;
+    DecomposedSet(string_view_vec<CharT> intersection, string_view_vec<CharT> difference_ab, string_view_vec<CharT> difference_ba)
         : intersection(std::move(intersection))
         , difference_ab(std::move(difference_ab))
        , difference_ba(std::move(difference_ba))
     {}
 };
 
@@ -33,38 +41,40 @@ struct Affix {
 
 namespace utils {
 
-std::vector<boost::wstring_view> splitSV(const boost::wstring_view& str);
+template <typename CharT>
+string_view_vec<CharT> splitSV(const boost::basic_string_view<CharT>& str);
 
-DecomposedSet set_decomposition(std::vector<boost::wstring_view> a, std::vector<boost::wstring_view> b);
+template <typename CharT>
+string_view_vec<CharT> splitSV(const std::basic_string<CharT>& str);
 
-std::size_t joined_size(const std::vector<boost::wstring_view>& x);
+template <typename CharT>
+std::size_t joined_size(const string_view_vec<CharT>& x);
 
-std::wstring join(const std::vector<boost::wstring_view>& sentence);
+template <typename CharT>
+std::basic_string<CharT> join(const string_view_vec<CharT>& sentence);
+
+template <typename CharT>
+DecomposedSet<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);
+
+template <typename CharT>
+Affix remove_common_affix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b);
+
+template <typename CharT>
+void trim(std::basic_string<CharT>& s);
+
+template <typename CharT>
+void lower_case(std::basic_string<CharT>& s);
+
+template <typename CharT>
+std::basic_string<CharT> default_process(std::basic_string<CharT> s);
+
+template <typename CharT>
+uint64_t bitmap_create(const boost::basic_string_view<CharT>& sentence);
+
+template <typename CharT>
+uint64_t bitmap_create(const std::basic_string<CharT>& sentence);
 
 percent result_cutoff(double result, percent score_cutoff);
-
-void trim(std::wstring& s);
-
-void lower_case(std::wstring& s);
-
-std::wstring default_process(std::wstring s);
-
-Affix remove_common_affix(boost::wstring_view& a, boost::wstring_view& b);
-
-void remove_common_affix(std::vector<boost::wstring_view>& a, std::vector<boost::wstring_view>& b);
 }
 
-inline uint64_t bitmap_create(const boost::wstring_view& sentence) {
-    uint64_t bitmap = 0;
-    for (const unsigned int& letter : sentence) {
-        uint8_t shift = (letter % 16) * 4;
-
-        // make sure there is no overflow when more than 8 characters
-        // with the same shift exist
-        uint64_t bitmask = static_cast<uint64_t>(0b1111) << shift;
-        if ((bitmap & bitmask) != bitmask) {
-            bitmap += static_cast<uint64_t>(1) << shift;
-        }
-    }
-    return bitmap;
-}
\ No newline at end of file
+#include "utils.txx"
\ No newline at end of file
diff --git a/cpp/src/utils.txx b/cpp/src/utils.txx
new file mode 100644
index 0000000..c827327
--- /dev/null
+++ b/cpp/src/utils.txx
@@ -0,0 +1,195 @@
+#include "utils.hpp"
+#include
+#include
+
+template <typename CharT>
+string_view_vec<CharT> utils::splitSV(const boost::basic_string_view<CharT>& str)
+{
+    string_view_vec<CharT> output;
+
+    auto first = str.data(), second = str.data(), last = first + str.size();
+    for (; second != last && first != last; first = second + 1) {
+        // TODO: maybe use localisation
+        second = std::find_if(first, last, [](const CharT& c) { return std::isspace(c); });
+
+        if (first != second) {
+            output.emplace_back(first, second - first);
+        }
+    }
+
+    return output;
+}
+
+template <typename CharT>
+string_view_vec<CharT> splitSV(const std::basic_string<CharT>& str)
+{
+    return splitSV(boost::basic_string_view<CharT>(str));
+}
+
+template <typename CharT>
+std::size_t utils::joined_size(const string_view_vec<CharT>& x)
+{
+    if (x.empty()) {
+        return 0;
+    }
+
+    // there is a whitespace between each word
+    std::size_t result = x.size() - 1;
+    for (const auto& y : x) {
+        result += y.size();
+    }
+
+    return result;
+}
+
+template <typename CharT>
+std::basic_string<CharT> utils::join(const string_view_vec<CharT>& sentence)
+{
+    if (sentence.empty()) {
+        return std::basic_string<CharT>();
+    }
+
+    auto sentence_iter = sentence.begin();
+    std::basic_string<CharT> result{ *sentence_iter };
+    const std::basic_string<CharT> whitespace{ 0x20 };
+    ++sentence_iter;
+    for (; sentence_iter != sentence.end(); ++sentence_iter) {
+        result.append(whitespace).append(std::basic_string<CharT>{ *sentence_iter });
+    }
+    return result;
+}
+
+template <typename CharT>
+DecomposedSet<CharT> utils::set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b)
+{
+    string_view_vec<CharT> intersection;
+    string_view_vec<CharT> difference_ab;
+    a.erase(std::unique(a.begin(), a.end()), a.end());
+    b.erase(std::unique(b.begin(), b.end()), b.end());
+
+    for (const auto& current_a : a) {
+        auto element_b = std::find(b.begin(), b.end(), current_a);
+        if (element_b != b.end()) {
+            b.erase(element_b);
+            intersection.emplace_back(current_a);
+        } else {
+            difference_ab.emplace_back(current_a);
+        }
+    }
+
+    return DecomposedSet<CharT>{ intersection, difference_ab, b };
+}
+
+
+/**
+ * Finds the longest common prefix between two ranges
+ */
+template <typename InputIterator1, typename InputIterator2>
+inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
+                                 InputIterator2 first2, InputIterator2 last2)
+{
+    return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
+}
+
+/**
+ * Removes common prefix of two string views
+ */
+template <typename CharT>
+std::size_t remove_common_prefix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
+{
+    auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
+    a.remove_prefix(prefix);
+    b.remove_prefix(prefix);
+    return prefix;
+}
+
+/**
+ * Removes common suffix of two string views
+ */
+template <typename CharT>
+std::size_t remove_common_suffix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
+{
+    auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
+    a.remove_suffix(suffix);
+    b.remove_suffix(suffix);
+    return suffix;
+}
+
+/**
+ * Removes common affix of two string views
+ */
+template <typename CharT>
+Affix utils::remove_common_affix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
+{
+    return Affix{
+        remove_common_prefix(a, b),
+        remove_common_suffix(a, b)
+    };
+}
+
+template <typename CharT>
+void ltrim(std::basic_string<CharT>& s)
+{
+    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](const CharT& ch) {
+        return !std::isspace(ch);
+    }));
+}
+
+template <typename CharT>
+void rtrim(std::basic_string<CharT>& s)
+{
+    s.erase(std::find_if(s.rbegin(), s.rend(), [](const CharT& ch) {
+        return !std::isspace(ch);
+    }).base(), s.end());
+}
+
+template <typename CharT>
+void utils::trim(std::basic_string<CharT>& s)
+{
+    ltrim(s);
+    rtrim(s);
+}
+
+template <typename CharT>
+void utils::lower_case(std::basic_string<CharT>& s)
+{
+    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+}
+
+template <typename CharT>
+std::basic_string<CharT> utils::default_process(std::basic_string<CharT> s)
+{
+    // replace embedded null terminators
+    std::replace( s.begin(), s.end(), CharT{0}, CharT{0x20});
+    trim(s);
+    lower_case(s);
+    return s;
+}
+
+template <typename CharT>
+uint64_t utils::bitmap_create(const boost::basic_string_view<CharT>& sentence)
+{
+    uint64_t bitmap = 0;
+    for (const unsigned int& letter : sentence) {
+        uint8_t shift = (letter % 16) * 4;
+
+        // make sure there is no overflow when more than 8 characters
+        // with the same shift exist
+        uint64_t bitmask = static_cast<uint64_t>(0b1111) << shift;
+        if ((bitmap & bitmask) != bitmask) {
+            bitmap += static_cast<uint64_t>(1) << shift;
+        }
+    }
+    return bitmap;
+}
+
+template <typename CharT>
+uint64_t utils::bitmap_create(const std::basic_string<CharT>& sentence)
+{
+    return bitmap_create(boost::basic_string_view<CharT>(sentence));
+}
+
+inline percent utils::result_cutoff(double result, percent score_cutoff)
+{
+    return (result >= score_cutoff) ? result : 0;
+}
\ No newline at end of file
diff --git a/python/src/py_fuzz.cpp b/python/src/py_fuzz.cpp
index 519b668..643b7b9 100644
--- a/python/src/py_fuzz.cpp
+++ b/python/src/py_fuzz.cpp
@@ -368,13 +368,13 @@ static PyObject* token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
     double result;
     if (preprocess) {
         result = fuzz::token_ratio(
-            {s1},
-            {s2},
+            Sentence<wchar_t>(s1),
+            Sentence<wchar_t>(s2),
             score_cutoff);
     } else {
         result = fuzz::token_ratio(
-            {s1},
-            {s2},
+            Sentence<wchar_t>(s1),
+            Sentence<wchar_t>(s2),
             score_cutoff);
     }
 
@@ -493,8 +493,8 @@ static PyObject* WRatio(PyObject *self, PyObject *args, PyObject *keywds) {
     std::wstring s2 = PyObject_To_Wstring(py_s2, preprocess);
 
     double result = fuzz::WRatio(
-        {s1},
-        {s2},
+        Sentence<wchar_t>(s1),
+        Sentence<wchar_t>(s2),
         score_cutoff);
 
     return PyFloat_FromDouble(result);
diff --git a/python/src/py_process.cpp b/python/src/py_process.cpp
index 2a65b23..d0af842 100644
--- a/python/src/py_process.cpp
+++ b/python/src/py_process.cpp
@@ -28,11 +28,12 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
     int preprocess = 1;
     static const char *kwlist[] = {"query", "choices", "score_cutoff", "preprocess", NULL};
 
-    if (!PyArg_ParseTupleAndKeywords(args, keywds, "UO|dp", const_cast<char**>(kwlist),
+    if (!PyArg_ParseTupleAndKeywords(args, keywds, "UO|dh", const_cast<char**>(kwlist),
                                      &py_query, &py_choices, &score_cutoff, &preprocess)) {
         return NULL;
     }
+
     PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
     if (!choices) {
         return NULL;
@@ -44,7 +45,7 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
     }
 
     std::wstring cleaned_query = PyObject_To_Wstring(py_query, preprocess);
-    uint64_t query_bitmap = bitmap_create(cleaned_query);
+    uint64_t query_bitmap = utils::bitmap_create(cleaned_query);
 
     PyObject* results = PyList_New(0);
 
@@ -62,12 +63,12 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
         std::wstring choice(buffer, len);
         PyMem_Free(buffer);
 
-        boost::wstring_view cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
-        uint64_t choice_bitmap = bitmap_create(cleaned_choice);
+        std::wstring cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
+        uint64_t choice_bitmap = utils::bitmap_create(cleaned_choice);
 
         double score= fuzz::WRatio(
-            {cleaned_query, query_bitmap},
-            {cleaned_choice, choice_bitmap},
+            Sentence<wchar_t>(cleaned_query, query_bitmap),
+            Sentence<wchar_t>(cleaned_choice, choice_bitmap),
             score_cutoff);
 
         if (score >= score_cutoff) {
@@ -117,7 +118,7 @@ PyObject* extractOne(PyObject *self, PyObject *args, PyObject *keywds) {
     }
 
     std::wstring cleaned_query = PyObject_To_Wstring(py_query, preprocess);
-    uint64_t query_bitmap = bitmap_create(cleaned_query);
+    uint64_t query_bitmap = utils::bitmap_create(cleaned_query);
 
     double end_score = 0;
     std::wstring result_choice;
@@ -136,12 +137,12 @@ PyObject* extractOne(PyObject *self, PyObject *args, PyObject *keywds) {
         std::wstring choice(buffer, len);
         PyMem_Free(buffer);
 
-        boost::wstring_view cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
-        uint64_t choice_bitmap = bitmap_create(cleaned_choice);
+        std::wstring cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
+        uint64_t choice_bitmap = utils::bitmap_create(cleaned_choice);
 
         double score = fuzz::WRatio(
-            {cleaned_query, query_bitmap},
-            {cleaned_choice, choice_bitmap},
+            Sentence<wchar_t>(cleaned_query, query_bitmap),
+            Sentence<wchar_t>(cleaned_choice, choice_bitmap),
             score_cutoff);
 
         if (score >= score_cutoff) {
diff --git a/setup.py b/setup.py
index d496473..635e831 100644
--- a/setup.py
+++ b/setup.py
@@ -52,19 +52,19 @@ setup(
     ext_modules = [
         Extension(
             'rapidfuzz.levenshtein',
-            ['python/src/py_levenshtein.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
+            ['python/src/py_levenshtein.cpp'],
             include_dirs=["cpp/src", "cpp/extern"],
             language='c++',
         ),
         Extension(
             'rapidfuzz.fuzz',
-            ['python/src/py_fuzz.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
+            ['python/src/py_fuzz.cpp'],
             include_dirs=["cpp/src", "cpp/extern"],
             language='c++',
         ),
         Extension(
             'rapidfuzz._process',
-            ['python/src/py_process.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
+            ['python/src/py_process.cpp'],
             include_dirs=["cpp/src", "cpp/extern"],
             language='c++',
         ),