fix string view usage
This commit is contained in:
parent
cae67851e5
commit
4da4234f73
|
@ -1,6 +1,6 @@
|
|||
include README.md
|
||||
include VERSION
|
||||
include LICENSE
|
||||
recursive-include cpp/src *.hpp
|
||||
recursive-include cpp/src *.hpp *.txx
|
||||
recursive-include cpp/extern/boost *
|
||||
recursive-include python/src *.hpp
|
||||
|
|
113
cpp/src/fuzz.hpp
113
cpp/src/fuzz.hpp
|
@ -3,23 +3,110 @@
|
|||
#include "utils.hpp"
|
||||
|
||||
namespace fuzz {
|
||||
percent ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
|
||||
percent partial_ratio(boost::wstring_view s1, boost::wstring_view s2, percent score_cutoff = 0);
|
||||
template<typename CharT>
|
||||
percent ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
percent token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff = 0);
|
||||
percent partial_token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff = 0);
|
||||
template<typename CharT>
|
||||
percent ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
percent token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
|
||||
percent partial_token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
|
||||
template<typename CharT>
|
||||
percent partial_ratio(
|
||||
boost::basic_string_view<CharT> s1,
|
||||
boost::basic_string_view<CharT> s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
percent token_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
|
||||
percent partial_token_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
|
||||
template<typename CharT>
|
||||
percent partial_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent token_sort_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
std::size_t bitmap_distance(const Sentence& s1, const Sentence& s2);
|
||||
percent bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
|
||||
percent length_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
|
||||
percent quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
|
||||
template<typename CharT>
|
||||
percent token_sort_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
percent WRatio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
|
||||
template<typename CharT>
|
||||
percent partial_token_sort_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent partial_token_sort_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent token_set_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent token_set_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent partial_token_set_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent partial_token_set_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent token_ratio(
|
||||
const Sentence<CharT>& s1,
|
||||
const Sentence<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent partial_token_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent partial_token_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t bitmap_distance(const Sentence<CharT>& s1, const Sentence<CharT>& s2);
|
||||
|
||||
template<typename CharT>
|
||||
percent bitmap_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent length_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent quick_lev_estimate(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
|
||||
|
||||
template<typename CharT>
|
||||
percent WRatio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
|
||||
}
|
||||
|
||||
#include "fuzz.txx"
|
||||
|
|
|
@ -7,7 +7,33 @@
|
|||
#include <tuple>
|
||||
#include <iterator>
|
||||
|
||||
percent fuzz::partial_ratio(boost::wstring_view s1, boost::wstring_view s2, percent score_cutoff)
|
||||
template<typename CharT>
|
||||
inline percent fuzz::ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
double result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
inline percent fuzz::ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return ratio(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
inline percent fuzz::partial_ratio(
|
||||
boost::basic_string_view<CharT> s1,
|
||||
boost::basic_string_view<CharT> s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
if (s1.empty() || s2.empty() || score_cutoff > 100) {
|
||||
return 0;
|
||||
|
@ -37,21 +63,104 @@ percent fuzz::partial_ratio(boost::wstring_view s1, boost::wstring_view s2, perc
|
|||
return utils::result_cutoff(max_ratio * 100, score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
|
||||
template<typename CharT>
|
||||
inline percent fuzz::partial_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
double result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
return partial_ratio(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff)
|
||||
template<typename CharT>
|
||||
percent _token_sort(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
bool partial,
|
||||
percent score_cutoff = 0.0)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1.sentence);
|
||||
string_view_vec<CharT> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2.sentence);
|
||||
string_view_vec<CharT> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
if (partial) {
|
||||
return fuzz::partial_ratio(
|
||||
utils::join(tokens_a),
|
||||
utils::join(tokens_b),
|
||||
score_cutoff);
|
||||
}
|
||||
else {
|
||||
double result = levenshtein::normalized_weighted_distance(
|
||||
utils::join(tokens_a),
|
||||
utils::join(tokens_b),
|
||||
score_cutoff / 100);
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::token_sort_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return _token_sort(s1, s2, false, score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::token_sort_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return _token_sort(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s2),
|
||||
false, score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::partial_token_sort_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return _token_sort(s1, s2, true, score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::partial_token_sort_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return _token_sort(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s2),
|
||||
true, score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::token_set_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
string_view_vec<CharT> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
string_view_vec<CharT> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
|
||||
|
@ -59,8 +168,124 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
|
|||
auto difference_ab = decomposition.difference_ab;
|
||||
auto difference_ba = decomposition.difference_ba;
|
||||
|
||||
std::wstring diff_ab_joined = utils::join(difference_ab);
|
||||
std::wstring diff_ba_joined = utils::join(difference_ba);
|
||||
std::basic_string<CharT> diff_ab_joined = utils::join(difference_ab);
|
||||
std::basic_string<CharT> diff_ba_joined = utils::join(difference_ba);
|
||||
|
||||
std::size_t ab_len = diff_ab_joined.length();
|
||||
std::size_t ba_len = diff_ba_joined.length();
|
||||
std::size_t sect_len = utils::joined_size(intersection);
|
||||
|
||||
// exit early since this will always result in a ratio of 1
|
||||
if (sect_len && (!ab_len || !ba_len)) {
|
||||
return 100;
|
||||
}
|
||||
|
||||
// string length sect+ab <-> sect and sect+ba <-> sect
|
||||
std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
|
||||
std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
|
||||
|
||||
std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
|
||||
double result = 0;
|
||||
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
|
||||
result = std::max(result, 1.0 - sect_distance / static_cast<double>(sect_ab_lensum + sect_ba_lensum));
|
||||
}
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
if (!sect_len) {
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
|
||||
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
|
||||
// would exit early after removing the prefix sect, so the distance can be directly calculated
|
||||
std::size_t sect_ab_distance = !!sect_len + ab_len;
|
||||
std::size_t sect_ba_distance = !!sect_len + ba_len;
|
||||
|
||||
result = std::max({ result,
|
||||
1.0 - sect_ab_distance / static_cast<double>(sect_len + sect_ab_lensum),
|
||||
1.0 - sect_ba_distance / static_cast<double>(sect_len + sect_ba_lensum) });
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::token_set_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return token_set_ratio(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::partial_token_set_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
string_view_vec<CharT> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
string_view_vec<CharT> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
|
||||
tokens_b.erase(std::unique(tokens_b.begin(), tokens_b.end()), tokens_b.end());
|
||||
|
||||
string_view_vec<CharT> difference_ab;
|
||||
string_view_vec<CharT> difference_ba;
|
||||
|
||||
std::set_difference(tokens_a.begin(), tokens_a.end(), tokens_b.begin(), tokens_b.end(),
|
||||
std::inserter(difference_ab, difference_ab.begin()));
|
||||
std::set_difference(tokens_b.begin(), tokens_b.end(), tokens_a.begin(), tokens_a.end(),
|
||||
std::inserter(difference_ba, difference_ba.begin()));
|
||||
|
||||
// exit early when there is a common word in both sequences
|
||||
if (difference_ab.size() < tokens_a.size()) {
|
||||
return 100;
|
||||
}
|
||||
|
||||
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::partial_token_set_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
return partial_token_set_ratio(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
percent fuzz::token_ratio(
|
||||
const Sentence<CharT>& s1,
|
||||
const Sentence<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
string_view_vec<CharT> tokens_a = utils::splitSV(s1.sentence);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
string_view_vec<CharT> tokens_b = utils::splitSV(s2.sentence);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
|
||||
auto intersection = decomposition.intersection;
|
||||
auto difference_ab = decomposition.difference_ab;
|
||||
auto difference_ba = decomposition.difference_ba;
|
||||
|
||||
std::basic_string<CharT> diff_ab_joined = utils::join(difference_ab);
|
||||
std::basic_string<CharT> diff_ba_joined = utils::join(difference_ba);
|
||||
|
||||
std::size_t ab_len = diff_ab_joined.length();
|
||||
std::size_t ba_len = diff_ba_joined.length();
|
||||
|
@ -83,8 +308,8 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
|
|||
std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
|
||||
std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
|
||||
|
||||
Sentence diff_ab{diff_ab_joined, bitmap_create(diff_ab_joined)};
|
||||
Sentence diff_ba{diff_ba_joined, bitmap_create(diff_ba_joined)};
|
||||
Sentence<CharT> diff_ab{diff_ab_joined, utils::bitmap_create(diff_ab_joined)};
|
||||
Sentence<CharT> diff_ba{diff_ba_joined, utils::bitmap_create(diff_ba_joined)};
|
||||
double bm_ratio = 1.0 - bitmap_distance(diff_ab, diff_ba) / static_cast<double>(sect_ab_lensum + sect_ba_lensum);
|
||||
if (bm_ratio >= score_cutoff) {
|
||||
std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
|
||||
|
@ -111,7 +336,11 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
|
|||
|
||||
// combines token_set and token_sort ratio from fuzzywuzzy so it is only required to
|
||||
// do a lot of operations once
|
||||
percent fuzz::partial_token_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
|
||||
template<typename CharT>
|
||||
percent fuzz::partial_token_ratio(
|
||||
const boost::basic_string_view<CharT>& s1,
|
||||
const boost::basic_string_view<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
|
@ -152,127 +381,21 @@ percent fuzz::partial_token_ratio(const boost::wstring_view& s1, const boost::ws
|
|||
partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff));
|
||||
}
|
||||
|
||||
percent _token_sort(const boost::wstring_view& s1, const boost::wstring_view& s2, bool partial, percent score_cutoff = 0.0)
|
||||
template<typename CharT>
|
||||
percent fuzz::partial_token_ratio(
|
||||
const std::basic_string<CharT>& s1,
|
||||
const std::basic_string<CharT>& s2,
|
||||
percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
if (partial) {
|
||||
return fuzz::partial_ratio(
|
||||
utils::join(tokens_a),
|
||||
utils::join(tokens_b),
|
||||
score_cutoff);
|
||||
}
|
||||
else {
|
||||
double result = levenshtein::normalized_weighted_distance(
|
||||
utils::join(tokens_a),
|
||||
utils::join(tokens_b),
|
||||
score_cutoff / 100);
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
return partial_token_ratio(
|
||||
boost::basic_string_view<CharT>(s1),
|
||||
boost::basic_string_view<CharT>(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff)
|
||||
template<typename CharT>
|
||||
std::size_t fuzz::bitmap_distance(const Sentence<CharT>& s1, const Sentence<CharT>& s2)
|
||||
{
|
||||
return _token_sort(a, b, false, score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::partial_token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff)
|
||||
{
|
||||
return _token_sort(a, b, true, score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
|
||||
auto intersection = decomposition.intersection;
|
||||
auto difference_ab = decomposition.difference_ab;
|
||||
auto difference_ba = decomposition.difference_ba;
|
||||
|
||||
std::wstring diff_ab_joined = utils::join(difference_ab);
|
||||
std::wstring diff_ba_joined = utils::join(difference_ba);
|
||||
|
||||
std::size_t ab_len = diff_ab_joined.length();
|
||||
std::size_t ba_len = diff_ba_joined.length();
|
||||
std::size_t sect_len = utils::joined_size(intersection);
|
||||
|
||||
// exit early since this will always result in a ratio of 1
|
||||
if (sect_len && (!ab_len || !ba_len)) {
|
||||
return 100;
|
||||
}
|
||||
|
||||
// string length sect+ab <-> sect and sect+ba <-> sect
|
||||
std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
|
||||
std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
|
||||
|
||||
std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
|
||||
double result = 0;
|
||||
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
|
||||
result = std::max(result, 1.0 - sect_distance / static_cast<double>(sect_ab_lensum + sect_ba_lensum));
|
||||
}
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
if (!sect_len) {
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
|
||||
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
|
||||
// would exit early after removing the prefix sect, so the distance can be directly calculated
|
||||
std::size_t sect_ab_distance = !!sect_len + ab_len;
|
||||
std::size_t sect_ba_distance = !!sect_len + ba_len;
|
||||
|
||||
result = std::max({ result,
|
||||
1.0 - sect_ab_distance / static_cast<double>(sect_len + sect_ab_lensum),
|
||||
1.0 - sect_ba_distance / static_cast<double>(sect_len + sect_ba_lensum) });
|
||||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::partial_token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
|
||||
tokens_b.erase(std::unique(tokens_b.begin(), tokens_b.end()), tokens_b.end());
|
||||
|
||||
std::vector<boost::wstring_view> difference_ab;
|
||||
std::vector<boost::wstring_view> difference_ba;
|
||||
|
||||
std::set_difference(tokens_a.begin(), tokens_a.end(), tokens_b.begin(), tokens_b.end(),
|
||||
std::inserter(difference_ab, difference_ab.begin()));
|
||||
std::set_difference(tokens_b.begin(), tokens_b.end(), tokens_a.begin(), tokens_a.end(),
|
||||
std::inserter(difference_ba, difference_ba.begin()));
|
||||
|
||||
// exit early when there is a common word in both sequences
|
||||
if (difference_ab.size() < tokens_a.size()) {
|
||||
return 100;
|
||||
}
|
||||
|
||||
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
|
||||
}
|
||||
|
||||
std::size_t fuzz::bitmap_distance(const Sentence& s1, const Sentence& s2) {
|
||||
uint64_t bitmap1 = s1.bitmap;
|
||||
uint64_t bitmap2 = s2.bitmap;
|
||||
|
||||
|
@ -287,7 +410,9 @@ std::size_t fuzz::bitmap_distance(const Sentence& s1, const Sentence& s2) {
|
|||
return distance;
|
||||
}
|
||||
|
||||
percent fuzz::bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
|
||||
template<typename CharT>
|
||||
percent fuzz::bitmap_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
|
||||
{
|
||||
std::size_t distance = bitmap_distance(s1, s2);
|
||||
std::size_t lensum = s1.sentence.length() + s2.sentence.length();
|
||||
percent result = 1.0 - static_cast<double>(distance) / lensum;
|
||||
|
@ -296,7 +421,9 @@ percent fuzz::bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score
|
|||
}
|
||||
|
||||
|
||||
percent fuzz::length_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
|
||||
template<typename CharT>
|
||||
percent fuzz::length_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
|
||||
{
|
||||
std::size_t s1_len = s1.sentence.length();
|
||||
std::size_t s2_len = s2.sentence.length();
|
||||
std::size_t distance = (s1_len > s2_len)
|
||||
|
@ -308,7 +435,9 @@ percent fuzz::length_ratio(const Sentence& s1, const Sentence& s2, percent score
|
|||
return utils::result_cutoff(result * 100, score_cutoff);
|
||||
}
|
||||
|
||||
percent fuzz::quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
|
||||
template<typename CharT>
|
||||
percent fuzz::quick_lev_estimate(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
|
||||
{
|
||||
if (s1.bitmap || s2.bitmap) {
|
||||
return bitmap_ratio(s1, s2, score_cutoff);
|
||||
} else {
|
||||
|
@ -316,7 +445,8 @@ percent fuzz::quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent
|
|||
}
|
||||
}
|
||||
|
||||
percent fuzz::WRatio(const Sentence& s1, const Sentence& s2, percent score_cutoff)
|
||||
template<typename CharT>
|
||||
percent fuzz::WRatio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
|
||||
{
|
||||
if (score_cutoff > 100) {
|
||||
return 0;
|
|
@ -19,17 +19,6 @@ enum EditType {
|
|||
EditDelete,
|
||||
};
|
||||
|
||||
struct EditOp {
|
||||
EditType op_type;
|
||||
std::size_t first_start;
|
||||
std::size_t second_start;
|
||||
EditOp(EditType op_type, std::size_t first_start, std::size_t second_start)
|
||||
: op_type(op_type)
|
||||
, first_start(first_start)
|
||||
, second_start(second_start)
|
||||
{}
|
||||
};
|
||||
|
||||
struct Matrix {
|
||||
std::size_t prefix_len;
|
||||
std::vector<std::size_t> matrix;
|
||||
|
@ -37,10 +26,6 @@ struct Matrix {
|
|||
std::size_t matrix_rows;
|
||||
};
|
||||
|
||||
Matrix matrix(boost::wstring_view sentence1, boost::wstring_view sentence2);
|
||||
|
||||
std::vector<EditOp> editops(boost::wstring_view sentence1, boost::wstring_view sentence2);
|
||||
|
||||
struct MatchingBlock {
|
||||
std::size_t first_start;
|
||||
std::size_t second_start;
|
||||
|
@ -52,11 +37,49 @@ struct MatchingBlock {
|
|||
{}
|
||||
};
|
||||
|
||||
std::vector<MatchingBlock> matching_blocks(boost::wstring_view sentence1, boost::wstring_view sentence2);
|
||||
|
||||
double normalized_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, double min_ratio = 0.0);
|
||||
template<typename CharT>
|
||||
Matrix matrix(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2);
|
||||
|
||||
template <typename CharT>
|
||||
Matrix matrix(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2);
|
||||
|
||||
template<typename CharT>
|
||||
std::vector<MatchingBlock> matching_blocks(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2);
|
||||
|
||||
template <typename CharT>
|
||||
std::vector<MatchingBlock> matching_blocks(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2);
|
||||
|
||||
template<typename CharT>
|
||||
double normalized_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2,
|
||||
double min_ratio = 0.0);
|
||||
|
||||
template <typename CharT>
|
||||
double normalized_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2,
|
||||
double min_ratio = 0.0);
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2);
|
||||
|
||||
template <typename CharT>
|
||||
std::size_t distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2);
|
||||
|
||||
std::size_t distance(boost::wstring_view sentence1, boost::wstring_view sentence2);
|
||||
|
||||
/**
|
||||
* Calculates the minimum number of insertions, deletions, and substitutions
|
||||
|
@ -74,13 +97,43 @@ std::size_t distance(boost::wstring_view sentence1, boost::wstring_view sentence
|
|||
* @param sentence2 second sentence to match (can be either a string type or a vector of strings)
|
||||
* @return weighted levenshtein distance
|
||||
*/
|
||||
std::size_t weighted_distance(boost::wstring_view sentence1, boost::wstring_view sentence2);
|
||||
template<typename CharT>
|
||||
std::size_t weighted_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2);
|
||||
|
||||
std::size_t generic_distance(boost::wstring_view source, boost::wstring_view target, WeightTable weights = { 1, 1, 1 });
|
||||
template <typename CharT>
|
||||
std::size_t weighted_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2);
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t generic_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2,
|
||||
WeightTable weights = { 1, 1, 1 });
|
||||
|
||||
template <typename CharT>
|
||||
std::size_t generic_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2,
|
||||
WeightTable weights = { 1, 1, 1 });
|
||||
|
||||
/**
|
||||
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
|
||||
* 1.0 (inclusive), where 1.0 means the sequences are the same.
|
||||
*/
|
||||
double normalized_weighted_distance(const boost::wstring_view& sentence1, const boost::wstring_view& sentence2, double min_ratio = 0.0);
|
||||
template<typename CharT>
|
||||
double normalized_weighted_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2,
|
||||
double min_ratio = 0.0);
|
||||
|
||||
template <typename CharT>
|
||||
double normalized_weighted_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2,
|
||||
double min_ratio = 0.0);
|
||||
|
||||
}
|
||||
#include "levenshtein.txx"
|
||||
|
|
|
@ -2,7 +2,11 @@
|
|||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
|
||||
levenshtein::Matrix levenshtein::matrix(boost::wstring_view sentence1, boost::wstring_view sentence2)
|
||||
|
||||
template<typename CharT>
|
||||
levenshtein::Matrix levenshtein::matrix(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2)
|
||||
{
|
||||
Affix affix = utils::remove_common_affix(sentence1, sentence2);
|
||||
|
||||
|
@ -42,20 +46,20 @@ levenshtein::Matrix levenshtein::matrix(boost::wstring_view sentence1, boost::ws
|
|||
};
|
||||
}
|
||||
|
||||
std::vector<levenshtein::EditOp> levenshtein::editops(boost::wstring_view sentence1, boost::wstring_view sentence2)
|
||||
template <typename CharT>
|
||||
levenshtein::Matrix levenshtein::matrix(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2)
|
||||
{
|
||||
auto m = matrix(sentence1, sentence2);
|
||||
std::size_t matrix_columns = m.matrix_columns;
|
||||
std::size_t matrix_rows = m.matrix_rows;
|
||||
std::size_t prefix_len = m.prefix_len;
|
||||
auto lev_matrix = m.matrix;
|
||||
return matrix(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2));
|
||||
}
|
||||
|
||||
std::vector<EditOp> ops;
|
||||
ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);
|
||||
|
||||
std::size_t i = matrix_columns - 1;
|
||||
std::size_t j = matrix_rows - 1;
|
||||
std::size_t position = matrix_columns * matrix_rows - 1;
|
||||
levenshtein::EditType get_EditType(levenshtein::Matrix matrix, std::size_t row, std::size_t column)
|
||||
{
|
||||
auto lev_matrix = matrix.matrix;
|
||||
std::size_t matrix_rows = matrix.matrix_rows;
|
||||
|
||||
auto is_replace = [=](std::size_t pos) {
|
||||
return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
|
||||
|
@ -70,58 +74,67 @@ std::vector<levenshtein::EditOp> levenshtein::editops(boost::wstring_view senten
|
|||
return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
|
||||
};
|
||||
|
||||
while (i > 0 || j > 0) {
|
||||
EditType op_type;
|
||||
std::size_t position = column*matrix_rows + row;
|
||||
|
||||
if (i && j && is_replace(position)) {
|
||||
op_type = EditType::EditReplace;
|
||||
--i;
|
||||
--j;
|
||||
position -= matrix_rows + 1;
|
||||
} else if (j && is_insert(position)) {
|
||||
op_type = EditType::EditInsert;
|
||||
--j;
|
||||
--position;
|
||||
} else if (i && is_delete(position)) {
|
||||
op_type = EditType::EditDelete;
|
||||
--i;
|
||||
position -= matrix_rows;
|
||||
} else if (is_keep(position)) {
|
||||
--i;
|
||||
--j;
|
||||
position -= matrix_rows + 1;
|
||||
// EditKeep does not has to be stored
|
||||
continue;
|
||||
} else {
|
||||
throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
|
||||
}
|
||||
|
||||
ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
|
||||
if (column && row && is_replace(position)) {
|
||||
return levenshtein::EditType::EditReplace;
|
||||
} else if (row && is_insert(position)) {
|
||||
return levenshtein::EditType::EditInsert;
|
||||
} else if (column && is_delete(position)) {
|
||||
return levenshtein::EditType::EditDelete;
|
||||
} else if (is_keep(position)) {
|
||||
return levenshtein::EditType::EditKeep;
|
||||
} else {
|
||||
throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
|
||||
}
|
||||
|
||||
std::reverse(ops.begin(), ops.end());
|
||||
return ops;
|
||||
}
|
||||
|
||||
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(boost::wstring_view sentence1, boost::wstring_view sentence2)
|
||||
template<typename CharT>
|
||||
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2)
|
||||
{
|
||||
auto edit_ops = editops(sentence1, sentence2);
|
||||
auto m = matrix(sentence1, sentence2);
|
||||
std::size_t prefix_len = m.prefix_len;
|
||||
|
||||
// current position in the levenshtein matrix
|
||||
std::size_t matrix_column = m.matrix_columns - 1;
|
||||
std::size_t matrix_row = m.matrix_rows - 1;
|
||||
|
||||
std::size_t first_start = 0;
|
||||
std::size_t second_start = 0;
|
||||
std::vector<MatchingBlock> mblocks;
|
||||
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
|
||||
|
||||
for (const auto& op : edit_ops) {
|
||||
if (op.op_type == EditType::EditKeep) {
|
||||
while (matrix_column > 0 || matrix_row > 0) {
|
||||
EditType op_type = get_EditType(m, matrix_row, matrix_column);
|
||||
|
||||
switch (op_type) {
|
||||
case EditType::EditReplace:
|
||||
--matrix_column;
|
||||
--matrix_row;
|
||||
break;
|
||||
case EditType::EditInsert:
|
||||
--matrix_row;
|
||||
break;
|
||||
case EditType::EditDelete:
|
||||
--matrix_column;
|
||||
break;
|
||||
case EditType::EditKeep:
|
||||
--matrix_column;
|
||||
--matrix_row;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (first_start < op.first_start || second_start < op.second_start) {
|
||||
mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
|
||||
first_start = op.first_start;
|
||||
second_start = op.second_start;
|
||||
std::size_t cur_first_start = matrix_column + prefix_len;
|
||||
std::size_t cur_second_start = matrix_row + prefix_len;
|
||||
if (first_start < cur_first_start || second_start < cur_second_start) {
|
||||
mblocks.emplace_back(first_start, second_start, cur_first_start - first_start);
|
||||
first_start = cur_first_start;
|
||||
second_start = cur_second_start;
|
||||
}
|
||||
|
||||
switch (op.op_type) {
|
||||
switch (op_type) {
|
||||
case EditType::EditReplace:
|
||||
first_start += 1;
|
||||
second_start += 1;
|
||||
|
@ -132,16 +145,29 @@ std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(boost::wstr
|
|||
case EditType::EditInsert:
|
||||
second_start += 1;
|
||||
break;
|
||||
case EditType::EditKeep:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
|
||||
std::reverse(mblocks.begin(), mblocks.end());
|
||||
return mblocks;
|
||||
}
|
||||
|
||||
std::size_t levenshtein::distance(boost::wstring_view sentence1, boost::wstring_view sentence2)
|
||||
template <typename CharT>
|
||||
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2)
|
||||
{
|
||||
return matching_blocks(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2));
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t levenshtein::weighted_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2)
|
||||
{
|
||||
utils::remove_common_affix(sentence1, sentence2);
|
||||
|
||||
|
@ -174,7 +200,20 @@ std::size_t levenshtein::distance(boost::wstring_view sentence1, boost::wstring_
|
|||
return cache.back();
|
||||
}
|
||||
|
||||
std::size_t levenshtein::weighted_distance(boost::wstring_view sentence1, boost::wstring_view sentence2)
|
||||
template <typename CharT>
|
||||
std::size_t levenshtein::weighted_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2)
|
||||
{
|
||||
return weighted_distance(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2));
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t levenshtein::distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2)
|
||||
{
|
||||
utils::remove_common_affix(sentence1, sentence2);
|
||||
|
||||
|
@ -214,7 +253,21 @@ std::size_t levenshtein::weighted_distance(boost::wstring_view sentence1, boost:
|
|||
return cache.back();
|
||||
}
|
||||
|
||||
std::size_t levenshtein::generic_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, WeightTable weights)
|
||||
template <typename CharT>
|
||||
std::size_t levenshtein::distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2)
|
||||
{
|
||||
return distance(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2));
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t levenshtein::generic_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2,
|
||||
WeightTable weights)
|
||||
{
|
||||
utils::remove_common_affix(sentence1, sentence2);
|
||||
if (sentence1.size() > sentence2.size()) {
|
||||
|
@ -248,7 +301,23 @@ std::size_t levenshtein::generic_distance(boost::wstring_view sentence1, boost::
|
|||
return cache.back();
|
||||
}
|
||||
|
||||
double levenshtein::normalized_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, double min_ratio)
|
||||
template <typename CharT>
|
||||
std::size_t levenshtein::generic_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2,
|
||||
WeightTable weights)
|
||||
{
|
||||
return generic_distance(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2),
|
||||
weights);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
double levenshtein::normalized_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2,
|
||||
double min_ratio)
|
||||
{
|
||||
if (sentence1.empty() || sentence2.empty()) {
|
||||
return sentence1.empty() && sentence2.empty();
|
||||
|
@ -275,7 +344,24 @@ double levenshtein::normalized_distance(boost::wstring_view sentence1, boost::ws
|
|||
return (ratio >= min_ratio) ? ratio : 0.0;
|
||||
}
|
||||
|
||||
double levenshtein::normalized_weighted_distance(const boost::wstring_view& sentence1, const boost::wstring_view& sentence2, double min_ratio)
|
||||
template <typename CharT>
|
||||
double levenshtein::normalized_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2,
|
||||
double min_ratio)
|
||||
{
|
||||
return normalized_distance(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2),
|
||||
min_ratio);
|
||||
}
|
||||
|
||||
|
||||
template<typename CharT>
|
||||
double levenshtein::normalized_weighted_distance(
|
||||
boost::basic_string_view<CharT> sentence1,
|
||||
boost::basic_string_view<CharT> sentence2,
|
||||
double min_ratio)
|
||||
{
|
||||
if (sentence1.empty() || sentence2.empty()) {
|
||||
return sentence1.empty() && sentence2.empty();
|
||||
|
@ -304,3 +390,15 @@ double levenshtein::normalized_weighted_distance(const boost::wstring_view& sent
|
|||
double ratio = 1.0 - static_cast<double>(dist) / lensum;
|
||||
return (ratio >= min_ratio) ? ratio : 0.0;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
double levenshtein::normalized_weighted_distance(
|
||||
const std::basic_string<CharT>& sentence1,
|
||||
const std::basic_string<CharT>& sentence2,
|
||||
double min_ratio)
|
||||
{
|
||||
return normalized_weighted_distance(
|
||||
boost::basic_string_view<CharT>(sentence1),
|
||||
boost::basic_string_view<CharT>(sentence2),
|
||||
min_ratio);
|
||||
}
|
|
@ -15,7 +15,7 @@ process::extract(const std::wstring& query, const std::vector<std::wstring>& cho
|
|||
for (const auto& choice : choices) {
|
||||
std::wstring b = (preprocess) ? utils::default_process(choice) : choice;
|
||||
|
||||
double score = fuzz::WRatio({query}, {choice}, score_cutoff);
|
||||
double score = fuzz::WRatio(Sentence<wchar_t>(query), Sentence<wchar_t>(choice), score_cutoff);
|
||||
if (score >= score_cutoff) {
|
||||
results.emplace_back(std::make_pair(choice, score));
|
||||
}
|
||||
|
@ -46,7 +46,7 @@ process::extractOne(const std::wstring& query, const std::vector<std::wstring>&
|
|||
for (const auto& choice : choices) {
|
||||
std::wstring b = (preprocess) ? utils::default_process(choice) : choice;
|
||||
|
||||
double score = fuzz::WRatio({a}, {b}, score_cutoff);
|
||||
double score = fuzz::WRatio(Sentence<wchar_t>(a), Sentence<wchar_t>(b), score_cutoff);
|
||||
if (score >= score_cutoff) {
|
||||
score_cutoff = score;
|
||||
match_found = true;
|
||||
|
|
|
@ -1,179 +0,0 @@
|
|||
#include "utils.hpp"
|
||||
#include <algorithm>
|
||||
#include <cwctype>
|
||||
|
||||
/**
|
||||
* Finds the longest common prefix between two ranges
|
||||
*/
|
||||
template <typename InputIterator1, typename InputIterator2>
|
||||
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
|
||||
InputIterator2 first2, InputIterator2 last2)
|
||||
{
|
||||
return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes common prefix of two string views
|
||||
*/
|
||||
std::size_t remove_common_prefix(boost::wstring_view& a, boost::wstring_view& b)
|
||||
{
|
||||
auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
|
||||
a.remove_prefix(prefix);
|
||||
b.remove_prefix(prefix);
|
||||
return prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes common suffix of two string views
|
||||
*/
|
||||
std::size_t remove_common_suffix(boost::wstring_view& a, boost::wstring_view& b)
|
||||
{
|
||||
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
|
||||
a.remove_suffix(suffix);
|
||||
b.remove_suffix(suffix);
|
||||
return suffix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes common affix of two string views
|
||||
*/
|
||||
Affix utils::remove_common_affix(boost::wstring_view& a, boost::wstring_view& b)
|
||||
{
|
||||
return Affix{
|
||||
remove_common_prefix(a, b),
|
||||
remove_common_suffix(a, b)
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void vec_remove_common_affix(T& a, T& b)
|
||||
{
|
||||
auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
|
||||
a.erase(a.begin(), prefix.first);
|
||||
b.erase(b.begin(), prefix.second);
|
||||
|
||||
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
|
||||
a.erase(a.end() - suffix, a.end());
|
||||
b.erase(b.end() - suffix, b.end());
|
||||
}
|
||||
|
||||
void utils::remove_common_affix(std::vector<boost::wstring_view>& a, std::vector<boost::wstring_view>& b)
|
||||
{
|
||||
vec_remove_common_affix(a, b);
|
||||
if (!a.empty() && !b.empty()) {
|
||||
remove_common_prefix(a.front(), b.front());
|
||||
remove_common_suffix(a.back(), b.back());
|
||||
}
|
||||
}
|
||||
|
||||
std::wstring utils::join(const std::vector<boost::wstring_view>& sentence)
|
||||
{
|
||||
if (sentence.empty()) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
auto sentence_iter = sentence.begin();
|
||||
std::wstring result{ *sentence_iter };
|
||||
const std::wstring whitespace{ 0x20 };
|
||||
++sentence_iter;
|
||||
for (; sentence_iter != sentence.end(); ++sentence_iter) {
|
||||
result.append(whitespace).append(std::wstring{ *sentence_iter });
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
percent utils::result_cutoff(double result, percent score_cutoff)
|
||||
{
|
||||
return (result >= score_cutoff) ? result : 0;
|
||||
}
|
||||
|
||||
// trim from start (in place)
|
||||
void ltrim(std::wstring& s)
|
||||
{
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](const wchar_t &ch) {
|
||||
return !std::iswspace(ch);
|
||||
}));
|
||||
}
|
||||
|
||||
// trim from end (in place)
|
||||
void rtrim(std::wstring& s)
|
||||
{
|
||||
s.erase(std::find_if(s.rbegin(), s.rend(), [](const wchar_t &ch) {
|
||||
return !std::iswspace(ch);
|
||||
}).base(), s.end());
|
||||
}
|
||||
|
||||
// trim from both ends (in place)
|
||||
void utils::trim(std::wstring& s)
|
||||
{
|
||||
ltrim(s);
|
||||
rtrim(s);
|
||||
}
|
||||
|
||||
void utils::lower_case(std::wstring& s)
|
||||
{
|
||||
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
|
||||
}
|
||||
|
||||
std::wstring utils::default_process(std::wstring s)
|
||||
{
|
||||
// replace embedded null terminators
|
||||
std::replace( s.begin(), s.end(), {'\x00'}, ' ');
|
||||
trim(s);
|
||||
lower_case(s);
|
||||
return s;
|
||||
}
|
||||
|
||||
DecomposedSet utils::set_decomposition(std::vector<boost::wstring_view> a, std::vector<boost::wstring_view> b)
|
||||
{
|
||||
std::vector<boost::wstring_view> intersection;
|
||||
std::vector<boost::wstring_view> difference_ab;
|
||||
a.erase(std::unique(a.begin(), a.end()), a.end());
|
||||
b.erase(std::unique(b.begin(), b.end()), b.end());
|
||||
|
||||
for (const auto& current_a : a) {
|
||||
auto element_b = std::find(b.begin(), b.end(), current_a);
|
||||
if (element_b != b.end()) {
|
||||
b.erase(element_b);
|
||||
intersection.emplace_back(current_a);
|
||||
} else {
|
||||
difference_ab.emplace_back(current_a);
|
||||
}
|
||||
}
|
||||
|
||||
return DecomposedSet{ intersection, difference_ab, b };
|
||||
}
|
||||
|
||||
std::size_t utils::joined_size(const std::vector<boost::wstring_view>& x)
|
||||
{
|
||||
if (x.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// there is a whitespace between each word
|
||||
std::size_t result = x.size() - 1;
|
||||
for (const auto& y : x) {
|
||||
result += y.size();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<boost::wstring_view> utils::splitSV(const boost::wstring_view& str)
|
||||
{
|
||||
std::vector<boost::wstring_view> output;
|
||||
// assume a word length of 6 + 1 whitespace
|
||||
output.reserve(str.size() / 7);
|
||||
|
||||
auto first = str.data(), second = str.data(), last = first + str.size();
|
||||
for (; second != last && first != last; first = second + 1) {
|
||||
// maybe use localisation
|
||||
second = std::find_if(first, last, [](const wchar_t &c) { return std::iswspace(c); });
|
||||
|
||||
if (first != second) {
|
||||
output.emplace_back(first, second - first);
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
|
@ -5,21 +5,29 @@
|
|||
/* 0.0% - 100.0% */
|
||||
using percent = double;
|
||||
|
||||
template<typename CharT>
|
||||
using string_view_vec = std::vector<boost::basic_string_view<CharT>>;
|
||||
|
||||
template<typename CharT>
|
||||
struct Sentence {
|
||||
boost::wstring_view sentence;
|
||||
boost::basic_string_view<CharT> sentence;
|
||||
uint64_t bitmap = 0;
|
||||
Sentence(boost::wstring_view sentence, uint64_t bitmap)
|
||||
Sentence(boost::basic_string_view<CharT> sentence, uint64_t bitmap)
|
||||
: sentence(sentence), bitmap(bitmap) {}
|
||||
Sentence(boost::wstring_view sentence)
|
||||
Sentence(boost::basic_string_view<CharT> sentence)
|
||||
: sentence(sentence), bitmap(0) {}
|
||||
Sentence(std::basic_string<CharT> sentence, uint64_t bitmap)
|
||||
: sentence(boost::basic_string_view<CharT>(sentence)), bitmap(bitmap) {}
|
||||
Sentence(std::basic_string<CharT> sentence)
|
||||
: sentence(boost::basic_string_view<CharT>(sentence)), bitmap(0) {}
|
||||
};
|
||||
|
||||
template<typename CharT>
|
||||
struct DecomposedSet {
|
||||
std::vector<boost::wstring_view> intersection;
|
||||
std::vector<boost::wstring_view> difference_ab;
|
||||
std::vector<boost::wstring_view> difference_ba;
|
||||
DecomposedSet(std::vector<boost::wstring_view> intersection, std::vector<boost::wstring_view> difference_ab, std::vector<boost::wstring_view> difference_ba)
|
||||
string_view_vec<CharT> intersection;
|
||||
string_view_vec<CharT> difference_ab;
|
||||
string_view_vec<CharT> difference_ba;
|
||||
DecomposedSet(string_view_vec<CharT> intersection, string_view_vec<CharT> difference_ab, string_view_vec<CharT> difference_ba)
|
||||
: intersection(std::move(intersection))
|
||||
, difference_ab(std::move(difference_ab))
|
||||
, difference_ba(std::move(difference_ba))
|
||||
|
@ -33,38 +41,40 @@ struct Affix {
|
|||
|
||||
namespace utils {
|
||||
|
||||
std::vector<boost::wstring_view> splitSV(const boost::wstring_view& str);
|
||||
template<typename CharT>
|
||||
string_view_vec<CharT> splitSV(const boost::basic_string_view<CharT>& str);
|
||||
|
||||
DecomposedSet set_decomposition(std::vector<boost::wstring_view> a, std::vector<boost::wstring_view> b);
|
||||
template<typename CharT>
|
||||
string_view_vec<CharT> splitSV(const std::basic_string<CharT>& str);
|
||||
|
||||
std::size_t joined_size(const std::vector<boost::wstring_view>& x);
|
||||
template<typename CharT>
|
||||
std::size_t joined_size(const string_view_vec<CharT>& x);
|
||||
|
||||
std::wstring join(const std::vector<boost::wstring_view>& sentence);
|
||||
template<typename CharT>
|
||||
std::basic_string<CharT> join(const string_view_vec<CharT>& sentence);
|
||||
|
||||
template<typename CharT>
|
||||
DecomposedSet<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);
|
||||
|
||||
template<typename CharT>
|
||||
Affix remove_common_affix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b);
|
||||
|
||||
template<typename CharT>
|
||||
void trim(std::basic_string<CharT>& s);
|
||||
|
||||
template<typename CharT>
|
||||
void lower_case(std::basic_string<CharT>& s);
|
||||
|
||||
template<typename CharT>
|
||||
std::basic_string<CharT> default_process(std::basic_string<CharT> s);
|
||||
|
||||
template<typename CharT>
|
||||
uint64_t bitmap_create(const boost::basic_string_view<CharT>& sentence);
|
||||
|
||||
template<typename CharT>
|
||||
uint64_t bitmap_create(const std::basic_string<CharT>& sentence);
|
||||
|
||||
percent result_cutoff(double result, percent score_cutoff);
|
||||
|
||||
void trim(std::wstring& s);
|
||||
|
||||
void lower_case(std::wstring& s);
|
||||
|
||||
std::wstring default_process(std::wstring s);
|
||||
|
||||
Affix remove_common_affix(boost::wstring_view& a, boost::wstring_view& b);
|
||||
|
||||
void remove_common_affix(std::vector<boost::wstring_view>& a, std::vector<boost::wstring_view>& b);
|
||||
}
|
||||
|
||||
inline uint64_t bitmap_create(const boost::wstring_view& sentence) {
|
||||
uint64_t bitmap = 0;
|
||||
for (const unsigned int& letter : sentence) {
|
||||
uint8_t shift = (letter % 16) * 4;
|
||||
|
||||
// make sure there is no overflow when more than 8 characters
|
||||
// with the same shift exist
|
||||
uint64_t bitmask = static_cast<uint64_t>(0b1111) << shift;
|
||||
if ((bitmap & bitmask) != bitmask) {
|
||||
bitmap += static_cast<uint64_t>(1) << shift;
|
||||
}
|
||||
}
|
||||
return bitmap;
|
||||
}
|
||||
#include "utils.txx"
|
|
@ -0,0 +1,195 @@
|
|||
#include "utils.hpp"
|
||||
#include <algorithm>
|
||||
#include <locale>
|
||||
|
||||
template<typename CharT>
|
||||
string_view_vec<CharT> utils::splitSV(const boost::basic_string_view<CharT>& str)
|
||||
{
|
||||
string_view_vec<CharT> output;
|
||||
|
||||
auto first = str.data(), second = str.data(), last = first + str.size();
|
||||
for (; second != last && first != last; first = second + 1) {
|
||||
// TODO: maybe use localisation
|
||||
second = std::find_if(first, last, [](const CharT& c) { return std::isspace(c); });
|
||||
|
||||
if (first != second) {
|
||||
output.emplace_back(first, second - first);
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
string_view_vec<CharT> splitSV(const std::basic_string<CharT>& str)
|
||||
{
|
||||
return splitSV(boost::basic_string_view<CharT>(str));
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
std::size_t utils::joined_size(const string_view_vec<CharT>& x)
|
||||
{
|
||||
if (x.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// there is a whitespace between each word
|
||||
std::size_t result = x.size() - 1;
|
||||
for (const auto& y : x) {
|
||||
result += y.size();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
std::basic_string<CharT> utils::join(const string_view_vec<CharT>& sentence)
|
||||
{
|
||||
if (sentence.empty()) {
|
||||
return std::basic_string<CharT>();
|
||||
}
|
||||
|
||||
auto sentence_iter = sentence.begin();
|
||||
std::basic_string<CharT> result{ *sentence_iter };
|
||||
const std::basic_string<CharT> whitespace{ 0x20 };
|
||||
++sentence_iter;
|
||||
for (; sentence_iter != sentence.end(); ++sentence_iter) {
|
||||
result.append(whitespace).append(std::basic_string<CharT>{ *sentence_iter });
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
DecomposedSet<CharT> utils::set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b)
|
||||
{
|
||||
string_view_vec<CharT> intersection;
|
||||
string_view_vec<CharT> difference_ab;
|
||||
a.erase(std::unique(a.begin(), a.end()), a.end());
|
||||
b.erase(std::unique(b.begin(), b.end()), b.end());
|
||||
|
||||
for (const auto& current_a : a) {
|
||||
auto element_b = std::find(b.begin(), b.end(), current_a);
|
||||
if (element_b != b.end()) {
|
||||
b.erase(element_b);
|
||||
intersection.emplace_back(current_a);
|
||||
} else {
|
||||
difference_ab.emplace_back(current_a);
|
||||
}
|
||||
}
|
||||
|
||||
return DecomposedSet<CharT>{ intersection, difference_ab, b };
|
||||
}
|
||||
|
||||
|
||||
/**
 * Counts how many leading elements two ranges have in common.
 *
 * Walks both ranges in lock-step until either one is exhausted or the
 * current elements differ, and returns the number of steps taken in the
 * first range.
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
                                 InputIterator2 first2, InputIterator2 last2)
{
    InputIterator1 cursor1 = first1;
    InputIterator2 cursor2 = first2;
    while (cursor1 != last1 && cursor2 != last2 && *cursor1 == *cursor2) {
        ++cursor1;
        ++cursor2;
    }
    return std::distance(first1, cursor1);
}
|
||||
|
||||
/**
|
||||
* Removes common prefix of two string views
|
||||
*/
|
||||
template<typename CharT>
|
||||
std::size_t remove_common_prefix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
|
||||
{
|
||||
auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
|
||||
a.remove_prefix(prefix);
|
||||
b.remove_prefix(prefix);
|
||||
return prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes common suffix of two string views
|
||||
*/
|
||||
template<typename CharT>
|
||||
std::size_t remove_common_suffix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
|
||||
{
|
||||
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
|
||||
a.remove_suffix(suffix);
|
||||
b.remove_suffix(suffix);
|
||||
return suffix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes common affix of two string views
|
||||
*/
|
||||
template<typename CharT>
|
||||
Affix utils::remove_common_affix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
|
||||
{
|
||||
return Affix{
|
||||
remove_common_prefix(a, b),
|
||||
remove_common_suffix(a, b)
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Removes leading whitespace from `s` in place.
 *
 * Whitespace is classified with the locale-aware std::isspace from <locale>.
 * The one-argument C function has undefined behaviour for character values
 * that do not fit into unsigned char (e.g. most wchar_t code points). The
 * locale overload needs a std::ctype<CharT> facet (the standard provides
 * one for char and wchar_t).
 */
template<typename CharT>
void ltrim(std::basic_string<CharT>& s)
{
    const std::locale loc;
    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [&loc](const CharT& ch) {
        return !std::isspace(ch, loc);
    }));
}
|
||||
|
||||
/**
 * Removes trailing whitespace from `s` in place.
 *
 * Whitespace is classified with the locale-aware std::isspace from <locale>.
 * The one-argument C function has undefined behaviour for character values
 * that do not fit into unsigned char (e.g. most wchar_t code points). The
 * locale overload needs a std::ctype<CharT> facet (the standard provides
 * one for char and wchar_t).
 */
template<typename CharT>
void rtrim(std::basic_string<CharT>& s)
{
    const std::locale loc;
    // search from the back; base() turns the reverse iterator of the last
    // non-space character into the forward iterator just past it
    s.erase(std::find_if(s.rbegin(), s.rend(), [&loc](const CharT& ch) {
        return !std::isspace(ch, loc);
    }).base(), s.end());
}
|
||||
|
||||
template<typename CharT>
|
||||
void utils::trim(std::basic_string<CharT>& s)
|
||||
{
|
||||
ltrim(s);
|
||||
rtrim(s);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
void utils::lower_case(std::basic_string<CharT>& s)
|
||||
{
|
||||
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
std::basic_string<CharT> utils::default_process(std::basic_string<CharT> s)
|
||||
{
|
||||
// replace embedded null terminators
|
||||
std::replace( s.begin(), s.end(), CharT{0}, CharT{0x20});
|
||||
trim(s);
|
||||
lower_case(s);
|
||||
return s;
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
uint64_t utils::bitmap_create(const boost::basic_string_view<CharT>& sentence)
|
||||
{
|
||||
uint64_t bitmap = 0;
|
||||
for (const unsigned int& letter : sentence) {
|
||||
uint8_t shift = (letter % 16) * 4;
|
||||
|
||||
// make sure there is no overflow when more than 8 characters
|
||||
// with the same shift exist
|
||||
uint64_t bitmask = static_cast<uint64_t>(0b1111) << shift;
|
||||
if ((bitmap & bitmask) != bitmask) {
|
||||
bitmap += static_cast<uint64_t>(1) << shift;
|
||||
}
|
||||
}
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
template<typename CharT>
|
||||
uint64_t utils::bitmap_create(const std::basic_string<CharT>& sentence)
|
||||
{
|
||||
return bitmap_create(boost::basic_string_view<CharT>(sentence));
|
||||
}
|
||||
|
||||
/**
 * Returns `result` when it reaches `score_cutoff`, otherwise 0.
 */
inline percent utils::result_cutoff(double result, percent score_cutoff)
{
    if (result >= score_cutoff) {
        return result;
    }
    return 0;
}
|
|
@ -368,13 +368,13 @@ static PyObject* token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
double result;
|
||||
if (preprocess) {
|
||||
result = fuzz::token_ratio(
|
||||
{s1},
|
||||
{s2},
|
||||
Sentence<wchar_t>(s1),
|
||||
Sentence<wchar_t>(s2),
|
||||
score_cutoff);
|
||||
} else {
|
||||
result = fuzz::token_ratio(
|
||||
{s1},
|
||||
{s2},
|
||||
Sentence<wchar_t>(s1),
|
||||
Sentence<wchar_t>(s2),
|
||||
score_cutoff);
|
||||
}
|
||||
|
||||
|
@ -493,8 +493,8 @@ static PyObject* WRatio(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
std::wstring s2 = PyObject_To_Wstring(py_s2, preprocess);
|
||||
|
||||
double result = fuzz::WRatio(
|
||||
{s1},
|
||||
{s2},
|
||||
Sentence<wchar_t>(s1),
|
||||
Sentence<wchar_t>(s2),
|
||||
score_cutoff);
|
||||
|
||||
return PyFloat_FromDouble(result);
|
||||
|
|
|
@ -28,11 +28,12 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
int preprocess = 1;
|
||||
static const char *kwlist[] = {"query", "choices", "score_cutoff", "preprocess", NULL};
|
||||
|
||||
if (!PyArg_ParseTupleAndKeywords(args, keywds, "UO|dp", const_cast<char **>(kwlist),
|
||||
if (!PyArg_ParseTupleAndKeywords(args, keywds, "UO|dh", const_cast<char **>(kwlist),
|
||||
&py_query, &py_choices, &score_cutoff, &preprocess)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
|
||||
if (!choices) {
|
||||
return NULL;
|
||||
|
@ -44,7 +45,7 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
}
|
||||
|
||||
std::wstring cleaned_query = PyObject_To_Wstring(py_query, preprocess);
|
||||
uint64_t query_bitmap = bitmap_create(cleaned_query);
|
||||
uint64_t query_bitmap = utils::bitmap_create(cleaned_query);
|
||||
|
||||
PyObject* results = PyList_New(0);
|
||||
|
||||
|
@ -62,12 +63,12 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
std::wstring choice(buffer, len);
|
||||
PyMem_Free(buffer);
|
||||
|
||||
boost::wstring_view cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
|
||||
uint64_t choice_bitmap = bitmap_create(cleaned_choice);
|
||||
std::wstring cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
|
||||
uint64_t choice_bitmap = utils::bitmap_create(cleaned_choice);
|
||||
|
||||
double score= fuzz::WRatio(
|
||||
{cleaned_query, query_bitmap},
|
||||
{cleaned_choice, choice_bitmap},
|
||||
Sentence<wchar_t>(cleaned_query, query_bitmap),
|
||||
Sentence<wchar_t>(cleaned_choice, choice_bitmap),
|
||||
score_cutoff);
|
||||
|
||||
if (score >= score_cutoff) {
|
||||
|
@ -117,7 +118,7 @@ PyObject* extractOne(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
}
|
||||
|
||||
std::wstring cleaned_query = PyObject_To_Wstring(py_query, preprocess);
|
||||
uint64_t query_bitmap = bitmap_create(cleaned_query);
|
||||
uint64_t query_bitmap = utils::bitmap_create(cleaned_query);
|
||||
|
||||
double end_score = 0;
|
||||
std::wstring result_choice;
|
||||
|
@ -136,12 +137,12 @@ PyObject* extractOne(PyObject *self, PyObject *args, PyObject *keywds) {
|
|||
std::wstring choice(buffer, len);
|
||||
PyMem_Free(buffer);
|
||||
|
||||
boost::wstring_view cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
|
||||
uint64_t choice_bitmap = bitmap_create(cleaned_choice);
|
||||
std::wstring cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
|
||||
uint64_t choice_bitmap = utils::bitmap_create(cleaned_choice);
|
||||
|
||||
double score = fuzz::WRatio(
|
||||
{cleaned_query, query_bitmap},
|
||||
{cleaned_choice, choice_bitmap},
|
||||
Sentence<wchar_t>(cleaned_query, query_bitmap),
|
||||
Sentence<wchar_t>(cleaned_choice, choice_bitmap),
|
||||
score_cutoff);
|
||||
|
||||
if (score >= score_cutoff) {
|
||||
|
|
6
setup.py
6
setup.py
|
@ -52,19 +52,19 @@ setup(
|
|||
ext_modules = [
|
||||
Extension(
|
||||
'rapidfuzz.levenshtein',
|
||||
['python/src/py_levenshtein.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
|
||||
['python/src/py_levenshtein.cpp'],
|
||||
include_dirs=["cpp/src", "cpp/extern"],
|
||||
language='c++',
|
||||
),
|
||||
Extension(
|
||||
'rapidfuzz.fuzz',
|
||||
['python/src/py_fuzz.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
|
||||
['python/src/py_fuzz.cpp'],
|
||||
include_dirs=["cpp/src", "cpp/extern"],
|
||||
language='c++',
|
||||
),
|
||||
Extension(
|
||||
'rapidfuzz._process',
|
||||
['python/src/py_process.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
|
||||
['python/src/py_process.cpp'],
|
||||
include_dirs=["cpp/src", "cpp/extern"],
|
||||
language='c++',
|
||||
),
|
||||
|
|
Loading…
Reference in New Issue