fix string view usage

This commit is contained in:
maxbachmann 2020-04-05 02:44:07 +02:00
parent cae67851e5
commit 4da4234f73
No known key found for this signature in database
GPG Key ID: 60334E83C23820B8
13 changed files with 858 additions and 463 deletions

View File

@ -1,6 +1,6 @@
include README.md
include VERSION
include LICENSE
recursive-include cpp/src *.hpp
recursive-include cpp/src *.hpp *.txx
recursive-include cpp/extern/boost *
recursive-include python/src *.hpp

View File

@ -1 +1 @@
0.6.3
0.6.4

View File

@ -3,23 +3,110 @@
#include "utils.hpp"
namespace fuzz {
percent ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
percent partial_ratio(boost::wstring_view s1, boost::wstring_view s2, percent score_cutoff = 0);
template<typename CharT>
percent ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff = 0);
percent token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff = 0);
percent partial_token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff = 0);
template<typename CharT>
percent ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
percent token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
percent partial_token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
template<typename CharT>
percent partial_ratio(
boost::basic_string_view<CharT> s1,
boost::basic_string_view<CharT> s2,
percent score_cutoff = 0);
percent token_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
percent partial_token_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff = 0);
template<typename CharT>
percent partial_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent token_sort_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff = 0);
std::size_t bitmap_distance(const Sentence& s1, const Sentence& s2);
percent bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
percent length_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
percent quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
template<typename CharT>
percent token_sort_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
percent WRatio(const Sentence& s1, const Sentence& s2, percent score_cutoff = 0);
template<typename CharT>
percent partial_token_sort_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent partial_token_sort_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent token_set_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent token_set_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent partial_token_set_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent partial_token_set_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent token_ratio(
const Sentence<CharT>& s1,
const Sentence<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent partial_token_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
percent partial_token_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff = 0);
template<typename CharT>
std::size_t bitmap_distance(const Sentence<CharT>& s1, const Sentence<CharT>& s2);
template<typename CharT>
percent bitmap_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
template<typename CharT>
percent length_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
template<typename CharT>
percent quick_lev_estimate(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
template<typename CharT>
percent WRatio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff = 0);
}
#include "fuzz.txx"

View File

@ -7,7 +7,33 @@
#include <tuple>
#include <iterator>
percent fuzz::partial_ratio(boost::wstring_view s1, boost::wstring_view s2, percent score_cutoff)
/**
 * Similarity of two string views expressed as a percentage.
 *
 * The cutoff is rescaled from the 0-100 range to the 0-1 range before it is
 * handed to levenshtein::normalized_weighted_distance, and the normalized
 * score that comes back is rescaled to 0-100 again.
 *
 * @param s1 first string view to compare
 * @param s2 second string view to compare
 * @param score_cutoff cutoff on the 0-100 scale; also passed to
 *        utils::result_cutoff together with the scaled score
 * @return whatever utils::result_cutoff yields for the scaled score
 */
template<typename CharT>
inline percent fuzz::ratio(
    const boost::basic_string_view<CharT>& s1,
    const boost::basic_string_view<CharT>& s2,
    percent score_cutoff)
{
    const auto normalized_cutoff = score_cutoff / 100;
    const double similarity =
        levenshtein::normalized_weighted_distance(s1, s2, normalized_cutoff);

    return utils::result_cutoff(similarity * 100, score_cutoff);
}
/**
 * Overload of ratio for std::basic_string arguments.
 *
 * Both strings are wrapped in boost::basic_string_view and the call is
 * forwarded to the string_view overload.
 *
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @param score_cutoff forwarded unchanged
 * @return result of the string_view overload
 */
template<typename CharT>
inline percent fuzz::ratio(
    const std::basic_string<CharT>& s1,
    const std::basic_string<CharT>& s2,
    percent score_cutoff)
{
    using view_type = boost::basic_string_view<CharT>;

    const view_type lhs(s1);
    const view_type rhs(s2);
    return ratio(lhs, rhs, score_cutoff);
}
template<typename CharT>
inline percent fuzz::partial_ratio(
boost::basic_string_view<CharT> s1,
boost::basic_string_view<CharT> s2,
percent score_cutoff)
{
if (s1.empty() || s2.empty() || score_cutoff > 100) {
return 0;
@ -37,21 +63,104 @@ percent fuzz::partial_ratio(boost::wstring_view s1, boost::wstring_view s2, perc
return utils::result_cutoff(max_ratio * 100, score_cutoff);
}
percent fuzz::ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
template<typename CharT>
inline percent fuzz::partial_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff)
{
double result = levenshtein::normalized_weighted_distance(s1, s2, score_cutoff / 100);
return utils::result_cutoff(result * 100, score_cutoff);
return partial_ratio(
boost::basic_string_view<CharT>(s1),
boost::basic_string_view<CharT>(s2),
score_cutoff);
}
percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff)
template<typename CharT>
percent _token_sort(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
bool partial,
percent score_cutoff = 0.0)
{
if (score_cutoff > 100) {
return 0;
}
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1.sentence);
string_view_vec<CharT> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2.sentence);
string_view_vec<CharT> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
if (partial) {
return fuzz::partial_ratio(
utils::join(tokens_a),
utils::join(tokens_b),
score_cutoff);
}
else {
double result = levenshtein::normalized_weighted_distance(
utils::join(tokens_a),
utils::join(tokens_b),
score_cutoff / 100);
return utils::result_cutoff(result * 100, score_cutoff);
}
}
/**
 * Token-sort ratio of two string views.
 *
 * Delegates to _token_sort with the partial flag switched off.
 *
 * @param s1 first string view to compare
 * @param s2 second string view to compare
 * @param score_cutoff forwarded unchanged to _token_sort
 * @return result of _token_sort
 */
template<typename CharT>
percent fuzz::token_sort_ratio(
    const boost::basic_string_view<CharT>& s1,
    const boost::basic_string_view<CharT>& s2,
    percent score_cutoff)
{
    const bool use_partial = false;
    return _token_sort(s1, s2, use_partial, score_cutoff);
}
/**
 * Overload of token_sort_ratio for std::basic_string arguments.
 *
 * Wraps both strings in boost::basic_string_view and calls _token_sort with
 * the partial flag switched off.
 *
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @param score_cutoff forwarded unchanged to _token_sort
 * @return result of _token_sort
 */
template<typename CharT>
percent fuzz::token_sort_ratio(
    const std::basic_string<CharT>& s1,
    const std::basic_string<CharT>& s2,
    percent score_cutoff)
{
    using view_type = boost::basic_string_view<CharT>;

    const view_type lhs(s1);
    const view_type rhs(s2);
    const bool use_partial = false;
    return _token_sort(lhs, rhs, use_partial, score_cutoff);
}
/**
 * Partial token-sort ratio of two string views.
 *
 * Delegates to _token_sort with the partial flag switched on.
 *
 * @param s1 first string view to compare
 * @param s2 second string view to compare
 * @param score_cutoff forwarded unchanged to _token_sort
 * @return result of _token_sort
 */
template<typename CharT>
percent fuzz::partial_token_sort_ratio(
    const boost::basic_string_view<CharT>& s1,
    const boost::basic_string_view<CharT>& s2,
    percent score_cutoff)
{
    const bool use_partial = true;
    return _token_sort(s1, s2, use_partial, score_cutoff);
}
/**
 * Overload of partial_token_sort_ratio for std::basic_string arguments.
 *
 * Wraps both strings in boost::basic_string_view and calls _token_sort with
 * the partial flag switched on.
 *
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @param score_cutoff forwarded unchanged to _token_sort
 * @return result of _token_sort
 */
template<typename CharT>
percent fuzz::partial_token_sort_ratio(
    const std::basic_string<CharT>& s1,
    const std::basic_string<CharT>& s2,
    percent score_cutoff)
{
    using view_type = boost::basic_string_view<CharT>;

    const view_type lhs(s1);
    const view_type rhs(s2);
    const bool use_partial = true;
    return _token_sort(lhs, rhs, use_partial, score_cutoff);
}
template<typename CharT>
percent fuzz::token_set_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;
}
string_view_vec<CharT> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
string_view_vec<CharT> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
@ -59,8 +168,124 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
auto difference_ab = decomposition.difference_ab;
auto difference_ba = decomposition.difference_ba;
std::wstring diff_ab_joined = utils::join(difference_ab);
std::wstring diff_ba_joined = utils::join(difference_ba);
std::basic_string<CharT> diff_ab_joined = utils::join(difference_ab);
std::basic_string<CharT> diff_ba_joined = utils::join(difference_ba);
std::size_t ab_len = diff_ab_joined.length();
std::size_t ba_len = diff_ba_joined.length();
std::size_t sect_len = utils::joined_size(intersection);
// exit early since this will always result in a ratio of 1
if (sect_len && (!ab_len || !ba_len)) {
return 100;
}
// string length sect+ab <-> sect and sect+ba <-> sect
std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
double result = 0;
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
result = std::max(result, 1.0 - sect_distance / static_cast<double>(sect_ab_lensum + sect_ba_lensum));
}
// exit early since the other ratios are 0
if (!sect_len) {
return utils::result_cutoff(result * 100, score_cutoff);
}
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
// would exit early after removing the prefix sect, so the distance can be directly calculated
std::size_t sect_ab_distance = !!sect_len + ab_len;
std::size_t sect_ba_distance = !!sect_len + ba_len;
result = std::max({ result,
1.0 - sect_ab_distance / static_cast<double>(sect_len + sect_ab_lensum),
1.0 - sect_ba_distance / static_cast<double>(sect_len + sect_ba_lensum) });
return utils::result_cutoff(result * 100, score_cutoff);
}
/**
 * Overload of token_set_ratio for std::basic_string arguments.
 *
 * Wraps both strings in boost::basic_string_view and forwards to the
 * string_view overload.
 *
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @param score_cutoff forwarded unchanged to the string_view overload
 * @return result of the string_view overload for (s1, s2)
 */
template<typename CharT>
percent fuzz::token_set_ratio(
    const std::basic_string<CharT>& s1,
    const std::basic_string<CharT>& s2,
    percent score_cutoff)
{
    // The second view has to wrap s2: wrapping s1 twice would compare s1
    // with itself and ignore s2 entirely.
    return token_set_ratio(
        boost::basic_string_view<CharT>(s1),
        boost::basic_string_view<CharT>(s2),
        score_cutoff);
}
/**
 * Partial token-set ratio of two string views.
 *
 * Each input is split with utils::splitSV, and the resulting tokens are
 * sorted and deduplicated. If the two token sets have at least one token in
 * common the result is 100; otherwise the tokens unique to each side are
 * joined with utils::join and compared with partial_ratio.
 *
 * @param s1 first string view to compare
 * @param s2 second string view to compare
 * @param score_cutoff a value above 100 yields 0 immediately; otherwise it
 *        is forwarded to partial_ratio
 * @return 0, 100, or the result of partial_ratio as described above
 */
template<typename CharT>
percent fuzz::partial_token_set_ratio(
    const boost::basic_string_view<CharT>& s1,
    const boost::basic_string_view<CharT>& s2,
    percent score_cutoff)
{
    if (score_cutoff > 100) {
        return 0;
    }

    // std::unique and std::set_difference both require sorted ranges.
    string_view_vec<CharT> words_a = utils::splitSV(s1);
    std::sort(words_a.begin(), words_a.end());
    words_a.erase(std::unique(words_a.begin(), words_a.end()), words_a.end());

    string_view_vec<CharT> words_b = utils::splitSV(s2);
    std::sort(words_b.begin(), words_b.end());
    words_b.erase(std::unique(words_b.begin(), words_b.end()), words_b.end());

    string_view_vec<CharT> only_in_a;
    std::set_difference(
        words_a.begin(), words_a.end(),
        words_b.begin(), words_b.end(),
        std::inserter(only_in_a, only_in_a.begin()));

    // If the difference is smaller than the full token set, at least one
    // token of the first input also occurs in the second one.
    if (only_in_a.size() < words_a.size()) {
        return 100;
    }

    string_view_vec<CharT> only_in_b;
    std::set_difference(
        words_b.begin(), words_b.end(),
        words_a.begin(), words_a.end(),
        std::inserter(only_in_b, only_in_b.begin()));

    return partial_ratio(utils::join(only_in_a), utils::join(only_in_b), score_cutoff);
}
/**
 * Overload of partial_token_set_ratio for std::basic_string arguments.
 *
 * Wraps both strings in boost::basic_string_view and forwards to the
 * string_view overload.
 *
 * @param s1 first string to compare
 * @param s2 second string to compare
 * @param score_cutoff forwarded unchanged to the string_view overload
 * @return result of the string_view overload for (s1, s2)
 */
template<typename CharT>
percent fuzz::partial_token_set_ratio(
    const std::basic_string<CharT>& s1,
    const std::basic_string<CharT>& s2,
    percent score_cutoff)
{
    // The second view has to wrap s2: wrapping s1 twice would compare s1
    // with itself and ignore s2 entirely.
    return partial_token_set_ratio(
        boost::basic_string_view<CharT>(s1),
        boost::basic_string_view<CharT>(s2),
        score_cutoff);
}
template<typename CharT>
percent fuzz::token_ratio(
const Sentence<CharT>& s1,
const Sentence<CharT>& s2,
percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;
}
string_view_vec<CharT> tokens_a = utils::splitSV(s1.sentence);
std::sort(tokens_a.begin(), tokens_a.end());
string_view_vec<CharT> tokens_b = utils::splitSV(s2.sentence);
std::sort(tokens_b.begin(), tokens_b.end());
auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
auto intersection = decomposition.intersection;
auto difference_ab = decomposition.difference_ab;
auto difference_ba = decomposition.difference_ba;
std::basic_string<CharT> diff_ab_joined = utils::join(difference_ab);
std::basic_string<CharT> diff_ba_joined = utils::join(difference_ba);
std::size_t ab_len = diff_ab_joined.length();
std::size_t ba_len = diff_ba_joined.length();
@ -83,8 +308,8 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
Sentence diff_ab{diff_ab_joined, bitmap_create(diff_ab_joined)};
Sentence diff_ba{diff_ba_joined, bitmap_create(diff_ba_joined)};
Sentence<CharT> diff_ab{diff_ab_joined, utils::bitmap_create(diff_ab_joined)};
Sentence<CharT> diff_ba{diff_ba_joined, utils::bitmap_create(diff_ba_joined)};
double bm_ratio = 1.0 - bitmap_distance(diff_ab, diff_ba) / static_cast<double>(sect_ab_lensum + sect_ba_lensum);
if (bm_ratio >= score_cutoff) {
std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
@ -111,7 +336,11 @@ percent fuzz::token_ratio(const Sentence& s1, const Sentence& s2, percent score_
// combines token_set and token_sort ratio from fuzzywuzzy so it is only required to
// do a lot of operations once
percent fuzz::partial_token_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
template<typename CharT>
percent fuzz::partial_token_ratio(
const boost::basic_string_view<CharT>& s1,
const boost::basic_string_view<CharT>& s2,
percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;
@ -152,127 +381,21 @@ percent fuzz::partial_token_ratio(const boost::wstring_view& s1, const boost::ws
partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff));
}
percent _token_sort(const boost::wstring_view& s1, const boost::wstring_view& s2, bool partial, percent score_cutoff = 0.0)
template<typename CharT>
percent fuzz::partial_token_ratio(
const std::basic_string<CharT>& s1,
const std::basic_string<CharT>& s2,
percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;
}
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
if (partial) {
return fuzz::partial_ratio(
utils::join(tokens_a),
utils::join(tokens_b),
score_cutoff);
}
else {
double result = levenshtein::normalized_weighted_distance(
utils::join(tokens_a),
utils::join(tokens_b),
score_cutoff / 100);
return utils::result_cutoff(result * 100, score_cutoff);
}
return partial_token_ratio(
boost::basic_string_view<CharT>(s1),
boost::basic_string_view<CharT>(s2),
score_cutoff);
}
percent fuzz::token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff)
template<typename CharT>
std::size_t fuzz::bitmap_distance(const Sentence<CharT>& s1, const Sentence<CharT>& s2)
{
return _token_sort(a, b, false, score_cutoff);
}
percent fuzz::partial_token_sort_ratio(const boost::wstring_view& a, const boost::wstring_view& b, percent score_cutoff)
{
return _token_sort(a, b, true, score_cutoff);
}
percent fuzz::token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;
}
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
auto decomposition = utils::set_decomposition(tokens_a, tokens_b);
auto intersection = decomposition.intersection;
auto difference_ab = decomposition.difference_ab;
auto difference_ba = decomposition.difference_ba;
std::wstring diff_ab_joined = utils::join(difference_ab);
std::wstring diff_ba_joined = utils::join(difference_ba);
std::size_t ab_len = diff_ab_joined.length();
std::size_t ba_len = diff_ba_joined.length();
std::size_t sect_len = utils::joined_size(intersection);
// exit early since this will always result in a ratio of 1
if (sect_len && (!ab_len || !ba_len)) {
return 100;
}
// string length sect+ab <-> sect and sect+ba <-> sect
std::size_t sect_ab_lensum = sect_len + !!sect_len + ab_len;
std::size_t sect_ba_lensum = sect_len + !!sect_len + ba_len;
std::size_t sect_distance = levenshtein::weighted_distance(diff_ab_joined, diff_ba_joined);
double result = 0;
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
result = std::max(result, 1.0 - sect_distance / static_cast<double>(sect_ab_lensum + sect_ba_lensum));
}
// exit early since the other ratios are 0
if (!sect_len) {
return utils::result_cutoff(result * 100, score_cutoff);
}
// levenshtein distance sect+ab <-> sect and sect+ba <-> sect
// would exit early after removing the prefix sect, so the distance can be directly calculated
std::size_t sect_ab_distance = !!sect_len + ab_len;
std::size_t sect_ba_distance = !!sect_len + ba_len;
result = std::max({ result,
1.0 - sect_ab_distance / static_cast<double>(sect_len + sect_ab_lensum),
1.0 - sect_ba_distance / static_cast<double>(sect_len + sect_ba_lensum) });
return utils::result_cutoff(result * 100, score_cutoff);
}
percent fuzz::partial_token_set_ratio(const boost::wstring_view& s1, const boost::wstring_view& s2, percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;
}
std::vector<boost::wstring_view> tokens_a = utils::splitSV(s1);
std::sort(tokens_a.begin(), tokens_a.end());
std::vector<boost::wstring_view> tokens_b = utils::splitSV(s2);
std::sort(tokens_b.begin(), tokens_b.end());
tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
tokens_b.erase(std::unique(tokens_b.begin(), tokens_b.end()), tokens_b.end());
std::vector<boost::wstring_view> difference_ab;
std::vector<boost::wstring_view> difference_ba;
std::set_difference(tokens_a.begin(), tokens_a.end(), tokens_b.begin(), tokens_b.end(),
std::inserter(difference_ab, difference_ab.begin()));
std::set_difference(tokens_b.begin(), tokens_b.end(), tokens_a.begin(), tokens_a.end(),
std::inserter(difference_ba, difference_ba.begin()));
// exit early when there is a common word in both sequences
if (difference_ab.size() < tokens_a.size()) {
return 100;
}
return partial_ratio(utils::join(difference_ab), utils::join(difference_ba), score_cutoff);
}
std::size_t fuzz::bitmap_distance(const Sentence& s1, const Sentence& s2) {
uint64_t bitmap1 = s1.bitmap;
uint64_t bitmap2 = s2.bitmap;
@ -287,7 +410,9 @@ std::size_t fuzz::bitmap_distance(const Sentence& s1, const Sentence& s2) {
return distance;
}
percent fuzz::bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
template<typename CharT>
percent fuzz::bitmap_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
{
std::size_t distance = bitmap_distance(s1, s2);
std::size_t lensum = s1.sentence.length() + s2.sentence.length();
percent result = 1.0 - static_cast<double>(distance) / lensum;
@ -296,7 +421,9 @@ percent fuzz::bitmap_ratio(const Sentence& s1, const Sentence& s2, percent score
}
percent fuzz::length_ratio(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
template<typename CharT>
percent fuzz::length_ratio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
{
std::size_t s1_len = s1.sentence.length();
std::size_t s2_len = s2.sentence.length();
std::size_t distance = (s1_len > s2_len)
@ -308,7 +435,9 @@ percent fuzz::length_ratio(const Sentence& s1, const Sentence& s2, percent score
return utils::result_cutoff(result * 100, score_cutoff);
}
percent fuzz::quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent score_cutoff) {
template<typename CharT>
percent fuzz::quick_lev_estimate(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
{
if (s1.bitmap || s2.bitmap) {
return bitmap_ratio(s1, s2, score_cutoff);
} else {
@ -316,7 +445,8 @@ percent fuzz::quick_lev_estimate(const Sentence& s1, const Sentence& s2, percent
}
}
percent fuzz::WRatio(const Sentence& s1, const Sentence& s2, percent score_cutoff)
template<typename CharT>
percent fuzz::WRatio(const Sentence<CharT>& s1, const Sentence<CharT>& s2, percent score_cutoff)
{
if (score_cutoff > 100) {
return 0;

View File

@ -19,17 +19,6 @@ enum EditType {
EditDelete,
};
struct EditOp {
EditType op_type;
std::size_t first_start;
std::size_t second_start;
EditOp(EditType op_type, std::size_t first_start, std::size_t second_start)
: op_type(op_type)
, first_start(first_start)
, second_start(second_start)
{}
};
struct Matrix {
std::size_t prefix_len;
std::vector<std::size_t> matrix;
@ -37,10 +26,6 @@ struct Matrix {
std::size_t matrix_rows;
};
Matrix matrix(boost::wstring_view sentence1, boost::wstring_view sentence2);
std::vector<EditOp> editops(boost::wstring_view sentence1, boost::wstring_view sentence2);
struct MatchingBlock {
std::size_t first_start;
std::size_t second_start;
@ -52,11 +37,49 @@ struct MatchingBlock {
{}
};
std::vector<MatchingBlock> matching_blocks(boost::wstring_view sentence1, boost::wstring_view sentence2);
double normalized_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, double min_ratio = 0.0);
/**
 * Builds a Matrix for the two sentences (string_view overload).
 * The views are taken by value.
 */
template<typename CharT>
Matrix matrix(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2);
/**
 * Overload of matrix accepting std::basic_string arguments.
 */
template <typename CharT>
Matrix matrix(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2);
/**
 * Returns a list of MatchingBlock entries for the two sentences
 * (string_view overload).
 */
template<typename CharT>
std::vector<MatchingBlock> matching_blocks(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2);
/**
 * Overload of matching_blocks accepting std::basic_string arguments.
 */
template <typename CharT>
std::vector<MatchingBlock> matching_blocks(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2);
/**
 * Normalized distance between the two sentences (string_view overload).
 * min_ratio defaults to 0.0.
 */
template<typename CharT>
double normalized_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2,
double min_ratio = 0.0);
/**
 * Overload of normalized_distance accepting std::basic_string arguments.
 * min_ratio defaults to 0.0.
 */
template <typename CharT>
double normalized_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2,
double min_ratio = 0.0);
/**
 * Distance between the two sentences as a std::size_t
 * (string_view overload).
 */
template<typename CharT>
std::size_t distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2);
/**
 * Overload of distance accepting std::basic_string arguments.
 */
template <typename CharT>
std::size_t distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2);
std::size_t distance(boost::wstring_view sentence1, boost::wstring_view sentence2);
/**
* Calculates the minimum number of insertions, deletions, and substitutions
@ -74,13 +97,43 @@ std::size_t distance(boost::wstring_view sentence1, boost::wstring_view sentence
* @param sentence2 second sentence to match (can be either a string type or a vector of strings)
* @return weighted levenshtein distance
*/
std::size_t weighted_distance(boost::wstring_view sentence1, boost::wstring_view sentence2);
template<typename CharT>
std::size_t weighted_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2);
std::size_t generic_distance(boost::wstring_view source, boost::wstring_view target, WeightTable weights = { 1, 1, 1 });
template <typename CharT>
std::size_t weighted_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2);
template<typename CharT>
std::size_t generic_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2,
WeightTable weights = { 1, 1, 1 });
template <typename CharT>
std::size_t generic_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2,
WeightTable weights = { 1, 1, 1 });
/**
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
* 1.0 (inclusive), where 1.0 means the sequences are the same.
*/
double normalized_weighted_distance(const boost::wstring_view& sentence1, const boost::wstring_view& sentence2, double min_ratio = 0.0);
template<typename CharT>
double normalized_weighted_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2,
double min_ratio = 0.0);
template <typename CharT>
double normalized_weighted_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2,
double min_ratio = 0.0);
}
#include "levenshtein.txx"

View File

@ -2,7 +2,11 @@
#include <algorithm>
#include <stdexcept>
levenshtein::Matrix levenshtein::matrix(boost::wstring_view sentence1, boost::wstring_view sentence2)
template<typename CharT>
levenshtein::Matrix levenshtein::matrix(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2)
{
Affix affix = utils::remove_common_affix(sentence1, sentence2);
@ -42,20 +46,20 @@ levenshtein::Matrix levenshtein::matrix(boost::wstring_view sentence1, boost::ws
};
}
std::vector<levenshtein::EditOp> levenshtein::editops(boost::wstring_view sentence1, boost::wstring_view sentence2)
template <typename CharT>
levenshtein::Matrix levenshtein::matrix(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2)
{
auto m = matrix(sentence1, sentence2);
std::size_t matrix_columns = m.matrix_columns;
std::size_t matrix_rows = m.matrix_rows;
std::size_t prefix_len = m.prefix_len;
auto lev_matrix = m.matrix;
return matrix(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2));
}
std::vector<EditOp> ops;
ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);
std::size_t i = matrix_columns - 1;
std::size_t j = matrix_rows - 1;
std::size_t position = matrix_columns * matrix_rows - 1;
levenshtein::EditType get_EditType(levenshtein::Matrix matrix, std::size_t row, std::size_t column)
{
auto lev_matrix = matrix.matrix;
std::size_t matrix_rows = matrix.matrix_rows;
auto is_replace = [=](std::size_t pos) {
return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
@ -70,58 +74,67 @@ std::vector<levenshtein::EditOp> levenshtein::editops(boost::wstring_view senten
return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
};
while (i > 0 || j > 0) {
EditType op_type;
std::size_t position = column*matrix_rows + row;
if (i && j && is_replace(position)) {
op_type = EditType::EditReplace;
--i;
--j;
position -= matrix_rows + 1;
} else if (j && is_insert(position)) {
op_type = EditType::EditInsert;
--j;
--position;
} else if (i && is_delete(position)) {
op_type = EditType::EditDelete;
--i;
position -= matrix_rows;
} else if (is_keep(position)) {
--i;
--j;
position -= matrix_rows + 1;
// EditKeep does not has to be stored
continue;
} else {
throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
}
ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
if (column && row && is_replace(position)) {
return levenshtein::EditType::EditReplace;
} else if (row && is_insert(position)) {
return levenshtein::EditType::EditInsert;
} else if (column && is_delete(position)) {
return levenshtein::EditType::EditDelete;
} else if (is_keep(position)) {
return levenshtein::EditType::EditKeep;
} else {
throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
}
std::reverse(ops.begin(), ops.end());
return ops;
}
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(boost::wstring_view sentence1, boost::wstring_view sentence2)
template<typename CharT>
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2)
{
auto edit_ops = editops(sentence1, sentence2);
auto m = matrix(sentence1, sentence2);
std::size_t prefix_len = m.prefix_len;
// current position in the levenshtein matrix
std::size_t matrix_column = m.matrix_columns - 1;
std::size_t matrix_row = m.matrix_rows - 1;
std::size_t first_start = 0;
std::size_t second_start = 0;
std::vector<MatchingBlock> mblocks;
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
for (const auto& op : edit_ops) {
if (op.op_type == EditType::EditKeep) {
while (matrix_column > 0 || matrix_row > 0) {
EditType op_type = get_EditType(m, matrix_row, matrix_column);
switch (op_type) {
case EditType::EditReplace:
--matrix_column;
--matrix_row;
break;
case EditType::EditInsert:
--matrix_row;
break;
case EditType::EditDelete:
--matrix_column;
break;
case EditType::EditKeep:
--matrix_column;
--matrix_row;
continue;
}
if (first_start < op.first_start || second_start < op.second_start) {
mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
first_start = op.first_start;
second_start = op.second_start;
std::size_t cur_first_start = matrix_column + prefix_len;
std::size_t cur_second_start = matrix_row + prefix_len;
if (first_start < cur_first_start || second_start < cur_second_start) {
mblocks.emplace_back(first_start, second_start, cur_first_start - first_start);
first_start = cur_first_start;
second_start = cur_second_start;
}
switch (op.op_type) {
switch (op_type) {
case EditType::EditReplace:
first_start += 1;
second_start += 1;
@ -132,16 +145,29 @@ std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(boost::wstr
case EditType::EditInsert:
second_start += 1;
break;
case EditType::EditKeep:
default:
break;
}
}
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
std::reverse(mblocks.begin(), mblocks.end());
return mblocks;
}
std::size_t levenshtein::distance(boost::wstring_view sentence1, boost::wstring_view sentence2)
template <typename CharT>
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2)
{
return matching_blocks(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2));
}
template<typename CharT>
std::size_t levenshtein::weighted_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2)
{
utils::remove_common_affix(sentence1, sentence2);
@ -174,7 +200,20 @@ std::size_t levenshtein::distance(boost::wstring_view sentence1, boost::wstring_
return cache.back();
}
std::size_t levenshtein::weighted_distance(boost::wstring_view sentence1, boost::wstring_view sentence2)
template <typename CharT>
std::size_t levenshtein::weighted_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2)
{
return weighted_distance(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2));
}
template<typename CharT>
std::size_t levenshtein::distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2)
{
utils::remove_common_affix(sentence1, sentence2);
@ -214,7 +253,21 @@ std::size_t levenshtein::weighted_distance(boost::wstring_view sentence1, boost:
return cache.back();
}
std::size_t levenshtein::generic_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, WeightTable weights)
template <typename CharT>
std::size_t levenshtein::distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2)
{
return distance(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2));
}
template<typename CharT>
std::size_t levenshtein::generic_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2,
WeightTable weights)
{
utils::remove_common_affix(sentence1, sentence2);
if (sentence1.size() > sentence2.size()) {
@ -248,7 +301,23 @@ std::size_t levenshtein::generic_distance(boost::wstring_view sentence1, boost::
return cache.back();
}
double levenshtein::normalized_distance(boost::wstring_view sentence1, boost::wstring_view sentence2, double min_ratio)
template <typename CharT>
std::size_t levenshtein::generic_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2,
WeightTable weights)
{
return generic_distance(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2),
weights);
}
template<typename CharT>
double levenshtein::normalized_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2,
double min_ratio)
{
if (sentence1.empty() || sentence2.empty()) {
return sentence1.empty() && sentence2.empty();
@ -275,7 +344,24 @@ double levenshtein::normalized_distance(boost::wstring_view sentence1, boost::ws
return (ratio >= min_ratio) ? ratio : 0.0;
}
double levenshtein::normalized_weighted_distance(const boost::wstring_view& sentence1, const boost::wstring_view& sentence2, double min_ratio)
template <typename CharT>
double levenshtein::normalized_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2,
double min_ratio)
{
return normalized_distance(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2),
min_ratio);
}
template<typename CharT>
double levenshtein::normalized_weighted_distance(
boost::basic_string_view<CharT> sentence1,
boost::basic_string_view<CharT> sentence2,
double min_ratio)
{
if (sentence1.empty() || sentence2.empty()) {
return sentence1.empty() && sentence2.empty();
@ -304,3 +390,15 @@ double levenshtein::normalized_weighted_distance(const boost::wstring_view& sent
double ratio = 1.0 - static_cast<double>(dist) / lensum;
return (ratio >= min_ratio) ? ratio : 0.0;
}
template <typename CharT>
double levenshtein::normalized_weighted_distance(
const std::basic_string<CharT>& sentence1,
const std::basic_string<CharT>& sentence2,
double min_ratio)
{
return normalized_weighted_distance(
boost::basic_string_view<CharT>(sentence1),
boost::basic_string_view<CharT>(sentence2),
min_ratio);
}

View File

@ -15,7 +15,7 @@ process::extract(const std::wstring& query, const std::vector<std::wstring>& cho
for (const auto& choice : choices) {
std::wstring b = (preprocess) ? utils::default_process(choice) : choice;
double score = fuzz::WRatio({query}, {choice}, score_cutoff);
double score = fuzz::WRatio(Sentence<wchar_t>(query), Sentence<wchar_t>(choice), score_cutoff);
if (score >= score_cutoff) {
results.emplace_back(std::make_pair(choice, score));
}
@ -46,7 +46,7 @@ process::extractOne(const std::wstring& query, const std::vector<std::wstring>&
for (const auto& choice : choices) {
std::wstring b = (preprocess) ? utils::default_process(choice) : choice;
double score = fuzz::WRatio({a}, {b}, score_cutoff);
double score = fuzz::WRatio(Sentence<wchar_t>(a), Sentence<wchar_t>(b), score_cutoff);
if (score >= score_cutoff) {
score_cutoff = score;
match_found = true;

View File

@ -1,179 +0,0 @@
#include "utils.hpp"
#include <algorithm>
#include <cwctype>
/**
 * Counts how many leading elements the two ranges have in common, i.e. the
 * distance from first1 to the first position where the ranges differ.
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
    InputIterator2 first2, InputIterator2 last2)
{
    const auto first_difference = std::mismatch(first1, last1, first2, last2);
    return std::distance(first1, first_difference.first);
}
/**
 * Drops the prefix shared by both views from each of them and returns the
 * number of characters that were dropped.
 */
std::size_t remove_common_prefix(boost::wstring_view& a, boost::wstring_view& b)
{
    const auto shared_len = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());

    a.remove_prefix(shared_len);
    b.remove_prefix(shared_len);

    return shared_len;
}
/**
 * Drops the suffix shared by both views from each of them and returns the
 * number of characters that were dropped.
 */
std::size_t remove_common_suffix(boost::wstring_view& a, boost::wstring_view& b)
{
    // Walking both views backwards turns the common suffix into a common prefix.
    const auto shared_len = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());

    a.remove_suffix(shared_len);
    b.remove_suffix(shared_len);

    return shared_len;
}
/**
 * Strips the common prefix and then the common suffix from both views.
 * Returns the two removed lengths packed into an Affix (prefix first).
 */
Affix utils::remove_common_affix(boost::wstring_view& a, boost::wstring_view& b)
{
    // The prefix has to be removed first so the suffix scan cannot overlap it.
    const std::size_t prefix_len = remove_common_prefix(a, b);
    const std::size_t suffix_len = remove_common_suffix(a, b);
    return Affix{ prefix_len, suffix_len };
}
/**
 * Erases the leading and trailing elements that both containers share.
 * Both containers are modified in place.
 */
template <typename T>
void vec_remove_common_affix(T& a, T& b)
{
    // Leading run of equal elements.
    const auto first_difference = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
    a.erase(a.begin(), first_difference.first);
    b.erase(b.begin(), first_difference.second);

    // Trailing run of equal elements, measured on what is left.
    const auto shared_tail = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
    a.erase(a.end() - shared_tail, a.end());
    b.erase(b.end() - shared_tail, b.end());
}
/**
 * Removes the common affix of two word lists: first whole words shared at
 * the front and back, then the shared prefix of the first remaining words
 * and the shared suffix of the last remaining words.
 */
void utils::remove_common_affix(std::vector<boost::wstring_view>& a, std::vector<boost::wstring_view>& b)
{
    vec_remove_common_affix(a, b);

    // Nothing left to compare on a character level if either list is empty.
    if (a.empty() || b.empty()) {
        return;
    }

    remove_common_prefix(a.front(), b.front());
    remove_common_suffix(a.back(), b.back());
}
/**
 * Concatenates the words of a sentence, separated by a single space (0x20).
 * An empty word list yields an empty string.
 */
std::wstring utils::join(const std::vector<boost::wstring_view>& sentence)
{
    std::wstring joined;
    bool is_first_word = true;
    for (const auto& word : sentence) {
        if (!is_first_word) {
            joined.push_back(L' ');
        }
        joined.append(word.begin(), word.end());
        is_first_word = false;
    }
    return joined;
}
/**
 * Returns result unchanged when it reaches score_cutoff, otherwise 0.
 */
percent utils::result_cutoff(double result, percent score_cutoff)
{
    if (result >= score_cutoff) {
        return result;
    }
    return 0;
}
// remove leading whitespace (in place)
void ltrim(std::wstring& s)
{
    const auto is_blank = [](const wchar_t& ch) { return std::iswspace(ch) != 0; };
    const auto first_kept = std::find_if_not(s.begin(), s.end(), is_blank);
    s.erase(s.begin(), first_kept);
}
// remove trailing whitespace (in place)
void rtrim(std::wstring& s)
{
    const auto is_blank = [](const wchar_t& ch) { return std::iswspace(ch) != 0; };
    // Search backwards; base() converts the hit to the position just after it.
    const auto last_kept = std::find_if_not(s.rbegin(), s.rend(), is_blank);
    s.erase(last_kept.base(), s.end());
}
// strip whitespace from both ends (in place); the two steps are independent,
// so the order in which they run does not affect the result
void utils::trim(std::wstring& s)
{
    rtrim(s);
    ltrim(s);
}
/**
 * Converts every character of s to lower case (in place).
 *
 * Uses the wide-character std::towlower. The narrow ::tolower is only
 * defined for values representable as unsigned char (or EOF), so it must
 * not be applied to arbitrary wchar_t values.
 */
void utils::lower_case(std::wstring& s)
{
    std::transform(s.begin(), s.end(), s.begin(), [](wchar_t ch) {
        return static_cast<wchar_t>(std::towlower(ch));
    });
}
/**
 * Default preprocessing: embedded null characters become spaces, then the
 * string is trimmed and lower-cased. Works on a copy and returns it.
 */
std::wstring utils::default_process(std::wstring s)
{
    // replace embedded null terminators
    for (auto& ch : s) {
        if (ch == L'\0') {
            ch = L' ';
        }
    }

    trim(s);
    lower_case(s);
    return s;
}
/**
 * Splits two word lists into the words they share and the words that only
 * occur in one of them.
 *
 * @param a first word list (taken by value, modified locally)
 * @param b second word list (taken by value, modified locally)
 * @return DecomposedSet{ intersection, words only in a, words only in b }
 *
 * NOTE(review): std::unique only collapses adjacent duplicates, so complete
 * de-duplication presumably relies on callers passing sorted lists - confirm.
 */
DecomposedSet utils::set_decomposition(std::vector<boost::wstring_view> a, std::vector<boost::wstring_view> b)
{
    std::vector<boost::wstring_view> intersection;
    std::vector<boost::wstring_view> difference_ab;

    a.erase(std::unique(a.begin(), a.end()), a.end());
    b.erase(std::unique(b.begin(), b.end()), b.end());

    for (const auto& current_a : a) {
        auto element_b = std::find(b.begin(), b.end(), current_a);
        if (element_b != b.end()) {
            // matched words are removed from b, so what remains is b \ a
            b.erase(element_b);
            intersection.emplace_back(current_a);
        } else {
            difference_ab.emplace_back(current_a);
        }
    }

    // The locals are not used afterwards: move them instead of copying
    // three vectors into the by-value constructor parameters.
    return DecomposedSet{ std::move(intersection), std::move(difference_ab), std::move(b) };
}
/**
 * Length of the string utils::join would produce for x: the sum of all
 * word lengths plus one separating whitespace between neighbouring words.
 */
std::size_t utils::joined_size(const std::vector<boost::wstring_view>& x)
{
    std::size_t total = 0;
    for (const auto& word : x) {
        total += word.size();
    }
    // n words are separated by n - 1 whitespaces
    return x.empty() ? total : total + (x.size() - 1);
}
/**
 * Splits str at whitespace into views of its words. Empty words (caused by
 * consecutive whitespace) are skipped. The returned views point into the
 * buffer referenced by str.
 */
std::vector<boost::wstring_view> utils::splitSV(const boost::wstring_view& str)
{
    std::vector<boost::wstring_view> output;
    // assume a word length of 6 + 1 whitespace
    output.reserve(str.size() / 7);

    const wchar_t* word_begin = str.data();
    const wchar_t* const last = word_begin + str.size();

    while (word_begin != last) {
        // maybe use localisation
        const wchar_t* word_end = std::find_if(word_begin, last,
            [](const wchar_t& c) { return std::iswspace(c); });

        if (word_begin != word_end) {
            output.emplace_back(word_begin, word_end - word_begin);
        }

        // Stop before stepping over the separator when the word reached the
        // end: forming `last + 1` would be undefined pointer arithmetic.
        if (word_end == last) {
            break;
        }
        word_begin = word_end + 1;
    }

    return output;
}

View File

@ -5,21 +5,29 @@
/* 0.0% - 100.0% */
using percent = double;
template<typename CharT>
using string_view_vec = std::vector<boost::basic_string_view<CharT>>;
template<typename CharT>
struct Sentence {
boost::wstring_view sentence;
boost::basic_string_view<CharT> sentence;
uint64_t bitmap = 0;
Sentence(boost::wstring_view sentence, uint64_t bitmap)
Sentence(boost::basic_string_view<CharT> sentence, uint64_t bitmap)
: sentence(sentence), bitmap(bitmap) {}
Sentence(boost::wstring_view sentence)
Sentence(boost::basic_string_view<CharT> sentence)
: sentence(sentence), bitmap(0) {}
// Take the string by const reference: the stored view has to refer to the
// caller's buffer. A by-value parameter is destroyed when the constructor
// returns, which would leave `sentence` dangling.
// The referenced string must outlive this Sentence.
Sentence(const std::basic_string<CharT>& sentence, uint64_t bitmap)
    : sentence(boost::basic_string_view<CharT>(sentence)), bitmap(bitmap) {}
Sentence(const std::basic_string<CharT>& sentence)
    : sentence(boost::basic_string_view<CharT>(sentence)), bitmap(0) {}
};
template<typename CharT>
struct DecomposedSet {
std::vector<boost::wstring_view> intersection;
std::vector<boost::wstring_view> difference_ab;
std::vector<boost::wstring_view> difference_ba;
DecomposedSet(std::vector<boost::wstring_view> intersection, std::vector<boost::wstring_view> difference_ab, std::vector<boost::wstring_view> difference_ba)
string_view_vec<CharT> intersection;
string_view_vec<CharT> difference_ab;
string_view_vec<CharT> difference_ba;
DecomposedSet(string_view_vec<CharT> intersection, string_view_vec<CharT> difference_ab, string_view_vec<CharT> difference_ba)
: intersection(std::move(intersection))
, difference_ab(std::move(difference_ab))
, difference_ba(std::move(difference_ba))
@ -33,38 +41,40 @@ struct Affix {
namespace utils {
std::vector<boost::wstring_view> splitSV(const boost::wstring_view& str);
template<typename CharT>
string_view_vec<CharT> splitSV(const boost::basic_string_view<CharT>& str);
DecomposedSet set_decomposition(std::vector<boost::wstring_view> a, std::vector<boost::wstring_view> b);
template<typename CharT>
string_view_vec<CharT> splitSV(const std::basic_string<CharT>& str);
std::size_t joined_size(const std::vector<boost::wstring_view>& x);
template<typename CharT>
std::size_t joined_size(const string_view_vec<CharT>& x);
std::wstring join(const std::vector<boost::wstring_view>& sentence);
template<typename CharT>
std::basic_string<CharT> join(const string_view_vec<CharT>& sentence);
template<typename CharT>
DecomposedSet<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);
template<typename CharT>
Affix remove_common_affix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b);
template<typename CharT>
void trim(std::basic_string<CharT>& s);
template<typename CharT>
void lower_case(std::basic_string<CharT>& s);
template<typename CharT>
std::basic_string<CharT> default_process(std::basic_string<CharT> s);
template<typename CharT>
uint64_t bitmap_create(const boost::basic_string_view<CharT>& sentence);
template<typename CharT>
uint64_t bitmap_create(const std::basic_string<CharT>& sentence);
percent result_cutoff(double result, percent score_cutoff);
void trim(std::wstring& s);
void lower_case(std::wstring& s);
std::wstring default_process(std::wstring s);
Affix remove_common_affix(boost::wstring_view& a, boost::wstring_view& b);
void remove_common_affix(std::vector<boost::wstring_view>& a, std::vector<boost::wstring_view>& b);
}
/**
 * Builds a 64-bit character histogram of sentence.
 *
 * The result consists of 16 four-bit counters. Each character increments
 * the counter selected by its code modulo 16; a counter that has reached
 * 15 is left unchanged so it cannot overflow into its neighbour.
 */
inline uint64_t bitmap_create(const boost::wstring_view& sentence) {
    uint64_t bitmap = 0;
    for (const unsigned int letter : sentence) {
        const unsigned int counter_shift = (letter & 15u) * 4u;
        const uint64_t counter_mask = uint64_t{0xF} << counter_shift;
        const bool saturated = (bitmap & counter_mask) == counter_mask;
        if (!saturated) {
            bitmap += uint64_t{1} << counter_shift;
        }
    }
    return bitmap;
}
#include "utils.txx"

195
cpp/src/utils.txx Normal file
View File

@ -0,0 +1,195 @@
#include "utils.hpp"
#include <algorithm>
#include <locale>
/**
 * Splits str at whitespace into views of its words. Empty words (caused by
 * consecutive whitespace) are skipped. The returned views point into the
 * buffer referenced by str.
 *
 * Whitespace is classified with the std::ctype<CharT> facet of the global
 * C++ locale. The single-argument std::isspace(int) is only defined for
 * values representable as unsigned char (or EOF) and therefore must not be
 * used for arbitrary CharT values.
 * NOTE(review): std::use_facet throws std::bad_cast for character types
 * without a std::ctype facet - confirm only char/wchar_t are instantiated.
 */
template<typename CharT>
string_view_vec<CharT> utils::splitSV(const boost::basic_string_view<CharT>& str)
{
    string_view_vec<CharT> output;

    const std::locale loc;
    const auto& classifier = std::use_facet<std::ctype<CharT>>(loc);

    const CharT* word_begin = str.data();
    const CharT* const last = word_begin + str.size();

    while (word_begin != last) {
        const CharT* word_end = std::find_if(word_begin, last, [&classifier](const CharT& c) {
            return classifier.is(std::ctype_base::space, c);
        });

        if (word_begin != word_end) {
            output.emplace_back(word_begin, word_end - word_begin);
        }

        // Stop before stepping over the separator when the word reached the
        // end: forming `last + 1` would be undefined pointer arithmetic.
        if (word_end == last) {
            break;
        }
        word_begin = word_end + 1;
    }

    return output;
}
/**
 * Overload of splitSV for owning strings. The returned views point into
 * str, so str has to outlive them.
 *
 * The definition has to be qualified with utils:: - otherwise it defines an
 * unrelated global function, leaves the declared utils::splitSV overload
 * undefined and cannot see the string_view overload it forwards to.
 */
template<typename CharT>
string_view_vec<CharT> utils::splitSV(const std::basic_string<CharT>& str)
{
    return splitSV(boost::basic_string_view<CharT>(str));
}
/**
 * Length of the string utils::join would produce for x: the sum of all
 * word lengths plus one separating whitespace between neighbouring words.
 */
template<typename CharT>
std::size_t utils::joined_size(const string_view_vec<CharT>& x)
{
    std::size_t total = 0;
    for (const auto& word : x) {
        total += word.size();
    }
    // n words are separated by n - 1 whitespaces
    return x.empty() ? total : total + (x.size() - 1);
}
/**
 * Concatenates the words of a sentence, separated by a single space (0x20).
 * An empty word list yields an empty string.
 */
template<typename CharT>
std::basic_string<CharT> utils::join(const string_view_vec<CharT>& sentence)
{
    std::basic_string<CharT> joined;
    bool is_first_word = true;
    for (const auto& word : sentence) {
        if (!is_first_word) {
            joined.push_back(CharT{ 0x20 });
        }
        joined.append(word.begin(), word.end());
        is_first_word = false;
    }
    return joined;
}
/**
 * Splits two word lists into the words they share and the words that only
 * occur in one of them.
 *
 * @param a first word list (taken by value, modified locally)
 * @param b second word list (taken by value, modified locally)
 * @return DecomposedSet{ intersection, words only in a, words only in b }
 *
 * NOTE(review): std::unique only collapses adjacent duplicates, so complete
 * de-duplication presumably relies on callers passing sorted lists - confirm.
 */
template<typename CharT>
DecomposedSet<CharT> utils::set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b)
{
    string_view_vec<CharT> intersection;
    string_view_vec<CharT> difference_ab;

    a.erase(std::unique(a.begin(), a.end()), a.end());
    b.erase(std::unique(b.begin(), b.end()), b.end());

    for (const auto& current_a : a) {
        auto element_b = std::find(b.begin(), b.end(), current_a);
        if (element_b != b.end()) {
            // matched words are removed from b, so what remains is b \ a
            b.erase(element_b);
            intersection.emplace_back(current_a);
        } else {
            difference_ab.emplace_back(current_a);
        }
    }

    // The locals are not used afterwards: move them instead of copying
    // three vectors into the by-value constructor parameters.
    return DecomposedSet<CharT>{ std::move(intersection), std::move(difference_ab), std::move(b) };
}
/**
 * Counts how many leading elements the two ranges have in common, i.e. the
 * distance from first1 to the first position where the ranges differ.
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
    InputIterator2 first2, InputIterator2 last2)
{
    const auto mismatch_in_first = std::mismatch(first1, last1, first2, last2).first;
    return std::distance(first1, mismatch_in_first);
}
/**
 * Drops the prefix shared by both views from each of them and returns the
 * number of characters that were dropped.
 */
template<typename CharT>
std::size_t remove_common_prefix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
{
    const auto shared_len = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());

    a.remove_prefix(shared_len);
    b.remove_prefix(shared_len);

    return shared_len;
}
/**
 * Drops the suffix shared by both views from each of them and returns the
 * number of characters that were dropped.
 */
template<typename CharT>
std::size_t remove_common_suffix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
{
    // Walking both views backwards turns the common suffix into a common prefix.
    const auto shared_len = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());

    a.remove_suffix(shared_len);
    b.remove_suffix(shared_len);

    return shared_len;
}
/**
 * Strips the common prefix and then the common suffix from both views.
 * Returns the two removed lengths packed into an Affix (prefix first).
 */
template<typename CharT>
Affix utils::remove_common_affix(boost::basic_string_view<CharT>& a, boost::basic_string_view<CharT>& b)
{
    // The prefix has to be removed first so the suffix scan cannot overlap it.
    const std::size_t prefix_len = remove_common_prefix(a, b);
    const std::size_t suffix_len = remove_common_suffix(a, b);
    return Affix{ prefix_len, suffix_len };
}
/**
 * Removes leading whitespace from s (in place).
 *
 * Whitespace is classified with the std::ctype<CharT> facet of the global
 * C++ locale. The single-argument std::isspace(int) is only defined for
 * values representable as unsigned char (or EOF) and therefore must not be
 * used for arbitrary CharT values.
 * NOTE(review): std::use_facet throws std::bad_cast for character types
 * without a std::ctype facet - confirm only char/wchar_t are instantiated.
 */
template<typename CharT>
void ltrim(std::basic_string<CharT>& s)
{
    const std::locale loc;
    const auto& classifier = std::use_facet<std::ctype<CharT>>(loc);

    const auto first_kept = std::find_if(s.begin(), s.end(), [&classifier](const CharT& ch) {
        return !classifier.is(std::ctype_base::space, ch);
    });
    s.erase(s.begin(), first_kept);
}
/**
 * Removes trailing whitespace from s (in place).
 *
 * Whitespace is classified with the std::ctype<CharT> facet of the global
 * C++ locale. The single-argument std::isspace(int) is only defined for
 * values representable as unsigned char (or EOF) and therefore must not be
 * used for arbitrary CharT values.
 * NOTE(review): std::use_facet throws std::bad_cast for character types
 * without a std::ctype facet - confirm only char/wchar_t are instantiated.
 */
template<typename CharT>
void rtrim(std::basic_string<CharT>& s)
{
    const std::locale loc;
    const auto& classifier = std::use_facet<std::ctype<CharT>>(loc);

    // Search backwards; base() converts the hit to the position just after it.
    const auto last_kept = std::find_if(s.rbegin(), s.rend(), [&classifier](const CharT& ch) {
        return !classifier.is(std::ctype_base::space, ch);
    });
    s.erase(last_kept.base(), s.end());
}
/**
 * Strips whitespace from both ends of s (in place). The two steps are
 * independent, so the order in which they run does not affect the result.
 */
template<typename CharT>
void utils::trim(std::basic_string<CharT>& s)
{
    rtrim(s);
    ltrim(s);
}
/**
 * Converts every character of s to lower case (in place).
 *
 * Uses the std::ctype<CharT> facet of the global C++ locale. The C function
 * ::tolower(int) is only defined for values representable as unsigned char
 * (or EOF) and therefore must not be applied to arbitrary CharT values.
 * NOTE(review): std::use_facet throws std::bad_cast for character types
 * without a std::ctype facet - confirm only char/wchar_t are instantiated.
 */
template<typename CharT>
void utils::lower_case(std::basic_string<CharT>& s)
{
    const std::locale loc;
    const auto& converter = std::use_facet<std::ctype<CharT>>(loc);

    std::transform(s.begin(), s.end(), s.begin(), [&converter](const CharT& ch) {
        return converter.tolower(ch);
    });
}
/**
 * Default preprocessing: embedded null characters become spaces, then the
 * string is trimmed and lower-cased. Works on a copy and returns it.
 */
template<typename CharT>
std::basic_string<CharT> utils::default_process(std::basic_string<CharT> s)
{
    // replace embedded null terminators
    for (CharT& ch : s) {
        if (ch == CharT{0}) {
            ch = CharT{0x20};
        }
    }

    trim(s);
    lower_case(s);
    return s;
}
/**
 * Builds a 64-bit character histogram of sentence.
 *
 * The result consists of 16 four-bit counters. Each character increments
 * the counter selected by its code modulo 16; a counter that has reached
 * 15 is left unchanged so it cannot overflow into its neighbour.
 */
template<typename CharT>
uint64_t utils::bitmap_create(const boost::basic_string_view<CharT>& sentence)
{
    uint64_t bitmap = 0;
    for (const unsigned int letter : sentence) {
        const unsigned int counter_shift = (letter & 15u) * 4u;
        const uint64_t counter_mask = uint64_t{0xF} << counter_shift;
        const bool saturated = (bitmap & counter_mask) == counter_mask;
        if (!saturated) {
            bitmap += uint64_t{1} << counter_shift;
        }
    }
    return bitmap;
}
template<typename CharT>
uint64_t utils::bitmap_create(const std::basic_string<CharT>& sentence)
{
return bitmap_create(boost::basic_string_view<CharT>(sentence));
}
/**
 * Returns result unchanged when it reaches score_cutoff, otherwise 0.
 */
inline percent utils::result_cutoff(double result, percent score_cutoff)
{
    if (result >= score_cutoff) {
        return result;
    }
    return 0;
}

View File

@ -368,13 +368,13 @@ static PyObject* token_ratio(PyObject *self, PyObject *args, PyObject *keywds) {
double result;
if (preprocess) {
result = fuzz::token_ratio(
{s1},
{s2},
Sentence<wchar_t>(s1),
Sentence<wchar_t>(s2),
score_cutoff);
} else {
result = fuzz::token_ratio(
{s1},
{s2},
Sentence<wchar_t>(s1),
Sentence<wchar_t>(s2),
score_cutoff);
}
@ -493,8 +493,8 @@ static PyObject* WRatio(PyObject *self, PyObject *args, PyObject *keywds) {
std::wstring s2 = PyObject_To_Wstring(py_s2, preprocess);
double result = fuzz::WRatio(
{s1},
{s2},
Sentence<wchar_t>(s1),
Sentence<wchar_t>(s2),
score_cutoff);
return PyFloat_FromDouble(result);

View File

@ -28,11 +28,12 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
int preprocess = 1;
static const char *kwlist[] = {"query", "choices", "score_cutoff", "preprocess", NULL};
if (!PyArg_ParseTupleAndKeywords(args, keywds, "UO|dp", const_cast<char **>(kwlist),
if (!PyArg_ParseTupleAndKeywords(args, keywds, "UO|dh", const_cast<char **>(kwlist),
&py_query, &py_choices, &score_cutoff, &preprocess)) {
return NULL;
}
PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
if (!choices) {
return NULL;
@ -44,7 +45,7 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
}
std::wstring cleaned_query = PyObject_To_Wstring(py_query, preprocess);
uint64_t query_bitmap = bitmap_create(cleaned_query);
uint64_t query_bitmap = utils::bitmap_create(cleaned_query);
PyObject* results = PyList_New(0);
@ -62,12 +63,12 @@ PyObject* extract(PyObject *self, PyObject *args, PyObject *keywds) {
std::wstring choice(buffer, len);
PyMem_Free(buffer);
boost::wstring_view cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
uint64_t choice_bitmap = bitmap_create(cleaned_choice);
std::wstring cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
uint64_t choice_bitmap = utils::bitmap_create(cleaned_choice);
double score= fuzz::WRatio(
{cleaned_query, query_bitmap},
{cleaned_choice, choice_bitmap},
Sentence<wchar_t>(cleaned_query, query_bitmap),
Sentence<wchar_t>(cleaned_choice, choice_bitmap),
score_cutoff);
if (score >= score_cutoff) {
@ -117,7 +118,7 @@ PyObject* extractOne(PyObject *self, PyObject *args, PyObject *keywds) {
}
std::wstring cleaned_query = PyObject_To_Wstring(py_query, preprocess);
uint64_t query_bitmap = bitmap_create(cleaned_query);
uint64_t query_bitmap = utils::bitmap_create(cleaned_query);
double end_score = 0;
std::wstring result_choice;
@ -136,12 +137,12 @@ PyObject* extractOne(PyObject *self, PyObject *args, PyObject *keywds) {
std::wstring choice(buffer, len);
PyMem_Free(buffer);
boost::wstring_view cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
uint64_t choice_bitmap = bitmap_create(cleaned_choice);
std::wstring cleaned_choice = (preprocess) ? utils::default_process(choice) : choice;
uint64_t choice_bitmap = utils::bitmap_create(cleaned_choice);
double score = fuzz::WRatio(
{cleaned_query, query_bitmap},
{cleaned_choice, choice_bitmap},
Sentence<wchar_t>(cleaned_query, query_bitmap),
Sentence<wchar_t>(cleaned_choice, choice_bitmap),
score_cutoff);
if (score >= score_cutoff) {

View File

@ -52,19 +52,19 @@ setup(
ext_modules = [
Extension(
'rapidfuzz.levenshtein',
['python/src/py_levenshtein.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
['python/src/py_levenshtein.cpp'],
include_dirs=["cpp/src", "cpp/extern"],
language='c++',
),
Extension(
'rapidfuzz.fuzz',
['python/src/py_fuzz.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
['python/src/py_fuzz.cpp'],
include_dirs=["cpp/src", "cpp/extern"],
language='c++',
),
Extension(
'rapidfuzz._process',
['python/src/py_process.cpp', 'cpp/src/fuzz.cpp', 'cpp/src/levenshtein.cpp', 'cpp/src/utils.cpp'],
['python/src/py_process.cpp'],
include_dirs=["cpp/src", "cpp/extern"],
language='c++',
),