make levenshtein work with string_view and wstring_view
This commit is contained in:
parent
4d2eb0cc98
commit
097365692a
|
@ -61,9 +61,9 @@ percent _token_ratio(const std::wstring &a, const std::wstring &b, percent score
|
||||||
|
|
||||||
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
|
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
|
||||||
|
|
||||||
std::size_t ab_len = utils::joined_size(difference_ab, 1);
|
std::size_t ab_len = utils::joined_size(difference_ab);
|
||||||
std::size_t ba_len = utils::joined_size(difference_ba, 1);
|
std::size_t ba_len = utils::joined_size(difference_ba);
|
||||||
std::size_t double_prefix = 2 * utils::joined_size(intersection, 1);
|
std::size_t double_prefix = 2 * utils::joined_size(intersection);
|
||||||
|
|
||||||
// fuzzywuzzy joined sect and ab/ba for comparisons
|
// fuzzywuzzy joined sect and ab/ba for comparisons
|
||||||
// this is not done here as an optimisation, so the lengths get incremented by 1
|
// this is not done here as an optimisation, so the lengths get incremented by 1
|
||||||
|
@ -76,11 +76,11 @@ percent _token_ratio(const std::wstring &a, const std::wstring &b, percent score
|
||||||
++ba_len;
|
++ba_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100, L" ");
|
float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100);
|
||||||
|
|
||||||
// TODO: could add score cutoff as well, but would need to copy most things from normalized_score_cutoff
|
// TODO: could add score cutoff as well, but would need to copy most things from normalized_score_cutoff
|
||||||
// as an alternative add another utility function to levenshtein for this case
|
// as an alternative add another utility function to levenshtein for this case
|
||||||
std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba, L" ");
|
std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba);
|
||||||
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
|
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
|
||||||
std::size_t lensum = ab_len + ba_len + double_prefix;
|
std::size_t lensum = ab_len + ba_len + double_prefix;
|
||||||
result = std::max(result, (float)1.0 - sect_distance / (float)lensum);
|
result = std::max(result, (float)1.0 - sect_distance / (float)lensum);
|
||||||
|
@ -184,9 +184,9 @@ percent fuzz::token_set_ratio(const std::wstring &a, const std::wstring &b, perc
|
||||||
|
|
||||||
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
|
auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
|
||||||
|
|
||||||
std::size_t ab_len = utils::joined_size(difference_ab, 1);
|
std::size_t ab_len = utils::joined_size(difference_ab);
|
||||||
std::size_t ba_len = utils::joined_size(difference_ba, 1);
|
std::size_t ba_len = utils::joined_size(difference_ba);
|
||||||
std::size_t double_prefix = 2 * utils::joined_size(intersection, 1);
|
std::size_t double_prefix = 2 * utils::joined_size(intersection);
|
||||||
|
|
||||||
// fuzzywuzzy joined sect and ab/ba for comparisons
|
// fuzzywuzzy joined sect and ab/ba for comparisons
|
||||||
// this is not done here as an optimisation, so the lengths get incremented by 1
|
// this is not done here as an optimisation, so the lengths get incremented by 1
|
||||||
|
@ -201,7 +201,7 @@ percent fuzz::token_set_ratio(const std::wstring &a, const std::wstring &b, perc
|
||||||
|
|
||||||
// TODO: could add score cutoff as well, but would need to copy most things from normalized_score_cutoff
|
// TODO: could add score cutoff as well, but would need to copy most things from normalized_score_cutoff
|
||||||
// as an alternative add another utility function to levenshtein for this case
|
// as an alternative add another utility function to levenshtein for this case
|
||||||
std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba, L" ");
|
std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba);
|
||||||
float result = 0;
|
float result = 0;
|
||||||
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
|
if (sect_distance != std::numeric_limits<std::size_t>::max()) {
|
||||||
std::size_t lensum = ab_len + ba_len + double_prefix;
|
std::size_t lensum = ab_len + ba_len + double_prefix;
|
||||||
|
|
|
@ -1,234 +0,0 @@
|
||||||
#include "levenshtein.hpp"
|
|
||||||
#include <numeric>
|
|
||||||
#include <optional>
|
|
||||||
|
|
||||||
|
|
||||||
template<typename MinDistanceCalc=std::false_type, typename CharT, typename Delimiter=std::nullopt_t>
|
|
||||||
auto levenshtein_word_cmp(const char &letter_cmp, const string_view_vec<CharT> &words,
|
|
||||||
std::vector<std::size_t> &cache, std::size_t current_cache, Delimiter delimiter=std::nullopt)
|
|
||||||
{
|
|
||||||
std::size_t result = current_cache + 1;
|
|
||||||
auto cache_iter = cache.begin();
|
|
||||||
auto word_iter = words.begin();
|
|
||||||
auto min_distance = std::numeric_limits<std::size_t>::max();
|
|
||||||
|
|
||||||
auto charCmp = [&] (const char &char2) {
|
|
||||||
if (letter_cmp == char2) { result = current_cache; }
|
|
||||||
else { ++result; }
|
|
||||||
|
|
||||||
current_cache = *cache_iter;
|
|
||||||
if (result > current_cache + 1) {
|
|
||||||
result = current_cache + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if constexpr(!std::is_same<std::false_type, MinDistanceCalc>::value) {
|
|
||||||
if (current_cache < min_distance) {
|
|
||||||
min_distance = current_cache;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*cache_iter = result;
|
|
||||||
++cache_iter;
|
|
||||||
};
|
|
||||||
|
|
||||||
// no delimiter should be added in front of the first word
|
|
||||||
for (const auto &letter : *word_iter) {
|
|
||||||
charCmp(letter);
|
|
||||||
}
|
|
||||||
++word_iter;
|
|
||||||
|
|
||||||
for (; word_iter != words.end(); ++word_iter) {
|
|
||||||
// between every word there should be a delimiter if one exists
|
|
||||||
if constexpr(!std::is_same<std::nullopt_t, Delimiter>::value) {
|
|
||||||
for (const auto &letter : delimiter) {
|
|
||||||
charCmp(letter);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// check following word
|
|
||||||
for (const auto &letter : *word_iter) {
|
|
||||||
charCmp(letter);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if constexpr(!std::is_same<std::false_type, MinDistanceCalc>::value) {
|
|
||||||
return min_distance;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::size_t levenshtein::weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, std::wstring_view delimiter) {
|
|
||||||
remove_common_affix(sentence1, sentence2);
|
|
||||||
std::size_t sentence1_len = utils::joined_size(sentence1, delimiter);
|
|
||||||
std::size_t sentence2_len = utils::joined_size(sentence2, delimiter);
|
|
||||||
|
|
||||||
if (sentence2_len > sentence1_len) {
|
|
||||||
std::swap(sentence1, sentence2);
|
|
||||||
std::swap(sentence1_len, sentence2_len);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sentence2_len) {
|
|
||||||
return sentence1_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::size_t> cache(sentence2_len);
|
|
||||||
std::iota(cache.begin(), cache.end(), 1);
|
|
||||||
|
|
||||||
std::size_t range1_pos = 0;
|
|
||||||
auto word_iter = sentence1.begin();
|
|
||||||
|
|
||||||
// no delimiter in front of first word
|
|
||||||
for (const auto &letter : *word_iter) {
|
|
||||||
levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
|
|
||||||
++range1_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
++word_iter;
|
|
||||||
for (; word_iter != sentence1.end(); ++word_iter) {
|
|
||||||
// delimiter between words
|
|
||||||
for (const auto &letter : delimiter) {
|
|
||||||
levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
|
|
||||||
++range1_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto &letter : *word_iter) {
|
|
||||||
levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
|
|
||||||
++range1_pos;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return cache.back();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::size_t levenshtein::weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, std::size_t max_distance, std::wstring_view delimiter) {
|
|
||||||
remove_common_affix(sentence1, sentence2);
|
|
||||||
std::size_t sentence1_len = utils::joined_size(sentence1, delimiter);
|
|
||||||
std::size_t sentence2_len = utils::joined_size(sentence2, delimiter);
|
|
||||||
|
|
||||||
if (sentence2_len > sentence1_len) {
|
|
||||||
std::swap(sentence1, sentence2);
|
|
||||||
std::swap(sentence1_len, sentence2_len);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sentence2_len) {
|
|
||||||
return sentence1_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::size_t> cache(sentence2_len);
|
|
||||||
std::iota(cache.begin(), cache.end(), 1);
|
|
||||||
|
|
||||||
std::size_t range1_pos = 0;
|
|
||||||
auto word_iter = sentence1.begin();
|
|
||||||
|
|
||||||
// no delimiter in front of first word
|
|
||||||
for (const auto &letter : *word_iter) {
|
|
||||||
auto min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos, delimiter);
|
|
||||||
if (min_distance > max_distance) {
|
|
||||||
return std::numeric_limits<std::size_t>::max();
|
|
||||||
}
|
|
||||||
++range1_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
++word_iter;
|
|
||||||
for (; word_iter != sentence1.end(); ++word_iter) {
|
|
||||||
// delimiter between words
|
|
||||||
for (const auto &letter : delimiter) {
|
|
||||||
auto min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos, delimiter);
|
|
||||||
if (min_distance > max_distance) {
|
|
||||||
return std::numeric_limits<std::size_t>::max();
|
|
||||||
}
|
|
||||||
++range1_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto &letter : *word_iter) {
|
|
||||||
auto min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos, delimiter);
|
|
||||||
if (min_distance > max_distance) {
|
|
||||||
return std::numeric_limits<std::size_t>::max();
|
|
||||||
}
|
|
||||||
++range1_pos;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return cache.back();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, std::wstring_view delimiter) {
|
|
||||||
remove_common_affix(sentence1, sentence2);
|
|
||||||
|
|
||||||
if (sentence2.length() > sentence1.length()) std::swap(sentence1, sentence2);
|
|
||||||
|
|
||||||
if (sentence2.empty()) {
|
|
||||||
return sentence1.length();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::size_t> cache(sentence2.length());
|
|
||||||
std::iota(cache.begin(), cache.end(), 1);
|
|
||||||
|
|
||||||
std::size_t sentence1_pos = 0;
|
|
||||||
for (const auto &char1 : sentence1) {
|
|
||||||
auto cache_iter = cache.begin();
|
|
||||||
std::size_t current_cache = sentence1_pos;
|
|
||||||
std::size_t result = sentence1_pos + 1;
|
|
||||||
for (const auto &char2 : sentence2) {
|
|
||||||
if (char1 == char2) {
|
|
||||||
result = current_cache;
|
|
||||||
} else {
|
|
||||||
++result;
|
|
||||||
}
|
|
||||||
current_cache = *cache_iter;
|
|
||||||
if (result > current_cache + 1) {
|
|
||||||
result = current_cache + 1;
|
|
||||||
}
|
|
||||||
*cache_iter = result;
|
|
||||||
++cache_iter;
|
|
||||||
}
|
|
||||||
++sentence1_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
return cache.back();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, std::size_t max_distance, std::wstring_view delimiter) {
|
|
||||||
remove_common_affix(sentence1, sentence2);
|
|
||||||
|
|
||||||
if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);
|
|
||||||
|
|
||||||
if (sentence2.empty()) {
|
|
||||||
return sentence1.length();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::size_t> cache(sentence2.length());
|
|
||||||
std::iota(cache.begin(), cache.end(), 1);
|
|
||||||
|
|
||||||
std::size_t sentence1_pos = 0;
|
|
||||||
for (const auto &char1 : sentence1) {
|
|
||||||
auto cache_iter = cache.begin();
|
|
||||||
std::size_t current_cache = sentence1_pos;
|
|
||||||
std::size_t result = sentence1_pos+1;
|
|
||||||
auto min_distance = std::numeric_limits<std::size_t>::max();
|
|
||||||
for (const auto &char2 : sentence2) {
|
|
||||||
if (char1 == char2) {
|
|
||||||
result = current_cache;
|
|
||||||
} else {
|
|
||||||
++result;
|
|
||||||
}
|
|
||||||
current_cache = *cache_iter;
|
|
||||||
if (result > current_cache + 1) {
|
|
||||||
result = current_cache + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (current_cache < min_distance) {
|
|
||||||
min_distance = current_cache;
|
|
||||||
}
|
|
||||||
*cache_iter = result;
|
|
||||||
++cache_iter;
|
|
||||||
}
|
|
||||||
if (min_distance > max_distance) {
|
|
||||||
return std::numeric_limits<std::size_t>::max();
|
|
||||||
}
|
|
||||||
++sentence1_pos;
|
|
||||||
}
|
|
||||||
return cache.back();
|
|
||||||
}
|
|
|
@ -3,9 +3,10 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
#include <optional>
|
||||||
|
#include <numeric>
|
||||||
#include "utils.hpp"
|
#include "utils.hpp"
|
||||||
|
|
||||||
|
|
||||||
namespace levenshtein {
|
namespace levenshtein {
|
||||||
enum EditType {
|
enum EditType {
|
||||||
EditKeep,
|
EditKeep,
|
||||||
|
@ -47,6 +48,10 @@ namespace levenshtein {
|
||||||
std::vector<MatchingBlock> matching_blocks(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
|
std::vector<MatchingBlock> matching_blocks(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
|
||||||
|
|
||||||
|
|
||||||
|
template<typename MinDistanceCalc=std::false_type, typename CharT>
|
||||||
|
auto levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
|
||||||
|
std::vector<std::size_t> &cache, std::size_t current_cache);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculates the minimum number of insertions, deletions, and substitutions
|
* Calculates the minimum number of insertions, deletions, and substitutions
|
||||||
* required to change one sequence into the other according to Levenshtein.
|
* required to change one sequence into the other according to Levenshtein.
|
||||||
|
@ -58,68 +63,31 @@ namespace levenshtein {
|
||||||
* Insert | 1
|
* Insert | 1
|
||||||
* Remove | 1
|
* Remove | 1
|
||||||
* Replace | 2
|
* Replace | 2
|
||||||
|
*
|
||||||
|
* @param sentence1 first sentence to match (can be either a string type or a vector of strings)
|
||||||
|
* @param sentence2 second sentence to match (can be either a string type or a vector of strings)
|
||||||
|
* @param max_distance maximum distance to exit early. When using this the calculation is about 20% slower
|
||||||
|
* so when it can not exit early it should not be used
|
||||||
|
* @return weighted levenshtein distance
|
||||||
*/
|
*/
|
||||||
std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2,
|
template<typename CharT, typename MinDistance=std::nullopt_t>
|
||||||
std::wstring_view delimiter=L"");
|
std::size_t weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MinDistance max_distance=std::nullopt);
|
||||||
std::size_t weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2,
|
|
||||||
std::wstring_view delimiter=L"");
|
|
||||||
|
|
||||||
|
template<typename MinDistance=std::nullopt_t>
|
||||||
|
std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MinDistance max_distance=std::nullopt);
|
||||||
|
|
||||||
/**
|
template<typename MinDistance=std::nullopt_t>
|
||||||
* These functions allow providing a max_distance parameter that can be used to exit early when the
|
std::size_t weighted_distance(std::string_view sentence1, std::string_view sentence2, MinDistance max_distance=std::nullopt);
|
||||||
* calculated levenshtein distance is at least as big as max_distance and will return the maximal
|
|
||||||
* possible value for std::size_t.
|
template<typename CharT, typename MinDistance=std::nullopt_t>
|
||||||
* This range check makes the levenshtein calculation about 20% slower, so it should be only used
|
std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MinDistance max_distance=std::nullopt);
|
||||||
* when it can usually exit early.
|
|
||||||
*/
|
|
||||||
std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2,
|
|
||||||
std::size_t max_distance, std::wstring_view delimiter=L"");
|
|
||||||
std::size_t weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2,
|
|
||||||
std::size_t max_distance, std::wstring_view delimiter=L"");
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
|
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
|
||||||
* 1.0 (inclusive), where 1.0 means the sequences are the same.
|
* 1.0 (inclusive), where 1.0 means the sequences are the same.
|
||||||
*/
|
*/
|
||||||
template<typename Sentence1, typename Sentence2>
|
template<typename Sentence1, typename Sentence2>
|
||||||
float normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2,
|
float normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio=0.0);
|
||||||
float min_ratio=0.0, std::wstring_view delimiter=L"")
|
|
||||||
{
|
|
||||||
if (sentence1.empty() && sentence2.empty()) {
|
|
||||||
return 1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sentence1.empty() || sentence1.empty()) {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::size_t sentence1_len = utils::joined_size(sentence1, delimiter);
|
|
||||||
std::size_t sentence2_len = utils::joined_size(sentence2, delimiter);
|
|
||||||
std::size_t lensum = sentence1_len + sentence2_len;
|
|
||||||
|
|
||||||
// constant time calculation to find a string ratio based on the string length
|
|
||||||
// so it can exit early without running any levenshtein calculations
|
|
||||||
std::size_t min_distance = (sentence1_len > sentence2_len)
|
|
||||||
? sentence1_len - sentence2_len
|
|
||||||
: sentence2_len - sentence1_len;
|
|
||||||
|
|
||||||
float len_ratio = 1.0 - (float)min_distance / (float)lensum;
|
|
||||||
if (len_ratio < min_ratio) {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: this needs more thoughts when to start using score cutoff, since it performs slower when it can not exit early
|
|
||||||
// -> just because it has a smaller ratio does not mean levenshtein can always exit early
|
|
||||||
// has to be tested with some more real examples
|
|
||||||
std::size_t distance = (min_ratio > 0.7)
|
|
||||||
? weighted_distance(sentence1, sentence2, std::ceil((float)lensum - min_ratio * lensum), delimiter)
|
|
||||||
: weighted_distance(sentence1, sentence2, delimiter);
|
|
||||||
|
|
||||||
if (distance == std::numeric_limits<std::size_t>::max()) {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
return 1.0 - (float)distance / (float)lensum;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -268,3 +236,223 @@ levenshtein::matching_blocks(std::basic_string_view<CharT> sentence1, std::basic
|
||||||
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
|
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
|
||||||
return mblocks;
|
return mblocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename MinDistanceCalc=std::false_type, typename CharT>
|
||||||
|
inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
|
||||||
|
std::vector<std::size_t> &cache, std::size_t current_cache)
|
||||||
|
{
|
||||||
|
std::size_t result = current_cache + 1;
|
||||||
|
auto cache_iter = cache.begin();
|
||||||
|
auto word_iter = words.begin();
|
||||||
|
auto min_distance = std::numeric_limits<std::size_t>::max();
|
||||||
|
|
||||||
|
auto charCmp = [&] (const CharT &char2) {
|
||||||
|
if (letter_cmp == char2) { result = current_cache; }
|
||||||
|
else { ++result; }
|
||||||
|
|
||||||
|
current_cache = *cache_iter;
|
||||||
|
if (result > current_cache + 1) {
|
||||||
|
result = current_cache + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr(!std::is_same_v<std::false_type, MinDistanceCalc>) {
|
||||||
|
if (current_cache < min_distance) {
|
||||||
|
min_distance = current_cache;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*cache_iter = result;
|
||||||
|
++cache_iter;
|
||||||
|
};
|
||||||
|
|
||||||
|
// no whitespace should be added in front of the first word
|
||||||
|
for (const auto &letter : *word_iter) {
|
||||||
|
charCmp(letter);
|
||||||
|
}
|
||||||
|
++word_iter;
|
||||||
|
|
||||||
|
for (; word_iter != words.end(); ++word_iter) {
|
||||||
|
// between every word there should be one whitespace
|
||||||
|
charCmp(0x20);
|
||||||
|
|
||||||
|
// check following word
|
||||||
|
for (const auto &letter : *word_iter) {
|
||||||
|
charCmp(letter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr(!std::is_same_v<std::false_type, MinDistanceCalc>) {
|
||||||
|
return min_distance;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename CharT, typename MinDistance=std::nullopt_t>
|
||||||
|
inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2,
|
||||||
|
MinDistance max_distance) {
|
||||||
|
remove_common_affix(sentence1, sentence2);
|
||||||
|
std::size_t sentence1_len = utils::joined_size(sentence1);
|
||||||
|
std::size_t sentence2_len = utils::joined_size(sentence2);
|
||||||
|
|
||||||
|
if (sentence2_len > sentence1_len) {
|
||||||
|
std::swap(sentence1, sentence2);
|
||||||
|
std::swap(sentence1_len, sentence2_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!sentence2_len) {
|
||||||
|
return sentence1_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::size_t> cache(sentence2_len);
|
||||||
|
std::iota(cache.begin(), cache.end(), 1);
|
||||||
|
|
||||||
|
std::size_t range1_pos = 0;
|
||||||
|
auto word_iter = sentence1.begin();
|
||||||
|
|
||||||
|
// no delimiter in front of first word
|
||||||
|
for (const auto &letter : *word_iter) {
|
||||||
|
if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
|
||||||
|
size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
|
||||||
|
if (min_distance > max_distance) {
|
||||||
|
return std::numeric_limits<std::size_t>::max();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
levenshtein_word_cmp(letter, sentence2, cache, range1_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
++range1_pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
++word_iter;
|
||||||
|
for (; word_iter != sentence1.end(); ++word_iter) {
|
||||||
|
// whitespace between words
|
||||||
|
if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
|
||||||
|
size_t min_distance = levenshtein_word_cmp<std::true_type>((CharT)0x20, sentence2, cache, range1_pos);
|
||||||
|
if (min_distance > max_distance) {
|
||||||
|
return std::numeric_limits<std::size_t>::max();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
levenshtein_word_cmp((CharT)0x20, sentence2, cache, range1_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
++range1_pos;
|
||||||
|
|
||||||
|
for (const auto &letter : *word_iter) {
|
||||||
|
if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
|
||||||
|
size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
|
||||||
|
if (min_distance > max_distance) {
|
||||||
|
return std::numeric_limits<std::size_t>::max();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
levenshtein_word_cmp(letter, sentence2, cache, range1_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
++range1_pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cache.back();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename MinDistance=std::nullopt_t>
|
||||||
|
inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MinDistance max_distance) {
|
||||||
|
return weighted_distance_impl(sentence1, sentence2, max_distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename MinDistance=std::nullopt_t>
|
||||||
|
inline std::size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, MinDistance max_distance) {
|
||||||
|
return weighted_distance_impl(sentence1, sentence2, max_distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename CharT, typename MinDistance=std::nullopt_t>
|
||||||
|
inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MinDistance max_distance) {
|
||||||
|
|
||||||
|
remove_common_affix(sentence1, sentence2);
|
||||||
|
|
||||||
|
if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);
|
||||||
|
|
||||||
|
if (sentence2.empty()) {
|
||||||
|
return sentence1.length();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::size_t> cache(sentence2.length());
|
||||||
|
std::iota(cache.begin(), cache.end(), 1);
|
||||||
|
|
||||||
|
std::size_t sentence1_pos = 0;
|
||||||
|
for (const auto &char1 : sentence1) {
|
||||||
|
auto cache_iter = cache.begin();
|
||||||
|
std::size_t current_cache = sentence1_pos;
|
||||||
|
std::size_t result = sentence1_pos+1;
|
||||||
|
auto min_distance = std::numeric_limits<std::size_t>::max();
|
||||||
|
for (const auto &char2 : sentence2) {
|
||||||
|
if (char1 == char2) {
|
||||||
|
result = current_cache;
|
||||||
|
} else {
|
||||||
|
++result;
|
||||||
|
}
|
||||||
|
current_cache = *cache_iter;
|
||||||
|
if (result > current_cache + 1) {
|
||||||
|
result = current_cache + 1;
|
||||||
|
}
|
||||||
|
if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
|
||||||
|
if (current_cache < min_distance) {
|
||||||
|
min_distance = current_cache;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*cache_iter = result;
|
||||||
|
++cache_iter;
|
||||||
|
}
|
||||||
|
if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
|
||||||
|
if (min_distance > max_distance) {
|
||||||
|
return std::numeric_limits<std::size_t>::max();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++sentence1_pos;
|
||||||
|
}
|
||||||
|
return cache.back();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<typename Sentence1, typename Sentence2>
|
||||||
|
inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio)
|
||||||
|
{
|
||||||
|
if (sentence1.empty() && sentence2.empty()) {
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sentence1.empty() || sentence1.empty()) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::size_t sentence1_len = utils::joined_size(sentence1);
|
||||||
|
std::size_t sentence2_len = utils::joined_size(sentence2);
|
||||||
|
std::size_t lensum = sentence1_len + sentence2_len;
|
||||||
|
|
||||||
|
// constant time calculation to find a string ratio based on the string length
|
||||||
|
// so it can exit early without running any levenshtein calculations
|
||||||
|
std::size_t min_distance = (sentence1_len > sentence2_len)
|
||||||
|
? sentence1_len - sentence2_len
|
||||||
|
: sentence2_len - sentence1_len;
|
||||||
|
|
||||||
|
float len_ratio = 1.0 - (float)min_distance / (float)lensum;
|
||||||
|
if (len_ratio < min_ratio) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: this needs more thoughts when to start using score cutoff, since it performs slower when it can not exit early
|
||||||
|
// -> just because it has a smaller ratio does not mean levenshtein can always exit early
|
||||||
|
// has to be tested with some more real examples
|
||||||
|
std::size_t distance = (min_ratio > 0.7)
|
||||||
|
? weighted_distance(sentence1, sentence2, std::ceil((float)lensum - min_ratio * lensum))
|
||||||
|
: weighted_distance(sentence1, sentence2);
|
||||||
|
|
||||||
|
if (distance == std::numeric_limits<std::size_t>::max()) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
return 1.0 - (float)distance / (float)lensum;
|
||||||
|
}
|
||||||
|
|
|
@ -39,11 +39,11 @@ namespace utils {
|
||||||
decomposed_set<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);
|
decomposed_set<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);
|
||||||
|
|
||||||
|
|
||||||
template<typename T, typename Delimiter=std::nullopt_t>
|
template<typename T>
|
||||||
inline std::size_t joined_size(const T &x, const Delimiter &delimiter=std::nullopt);
|
std::size_t joined_size(const T &x);
|
||||||
|
|
||||||
template<typename T, typename Delimiter=std::nullopt_t>
|
template<typename T>
|
||||||
inline std::size_t joined_size(const std::vector<T> &x, const Delimiter &delimiter=std::nullopt);
|
std::size_t joined_size(const std::vector<T> &x);
|
||||||
|
|
||||||
|
|
||||||
template<typename CharT>
|
template<typename CharT>
|
||||||
|
@ -170,31 +170,22 @@ inline void remove_common_affix(std::vector<T> &a, std::vector<T> &b)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename T, typename Delimiter=std::nullopt_t>
|
template<typename T>
|
||||||
inline std::size_t utils::joined_size(const T &x, const Delimiter &delimiter){
|
inline std::size_t utils::joined_size(const T &x){
|
||||||
return x.size();
|
return x.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, typename Delimiter=std::nullopt_t>
|
|
||||||
inline std::size_t utils::joined_size(const std::vector<T> &x, const Delimiter &delimiter){
|
template<typename T>
|
||||||
|
inline std::size_t utils::joined_size(const std::vector<T> &x){
|
||||||
if (x.empty()) {
|
if (x.empty()) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
std::size_t result;
|
|
||||||
|
|
||||||
if constexpr(!std::is_same<std::nullopt_t, Delimiter>::value) {
|
// there is a whitespace between each word
|
||||||
if constexpr(std::is_integral<Delimiter>::value) {
|
std::size_t result = x.size() - 1;
|
||||||
result = (x.size() - 1) * delimiter;
|
for (const auto &y: x) result += y.size();
|
||||||
} else {
|
|
||||||
result = (x.size() - 1) * delimiter.size();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
result = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto &y: x) {
|
|
||||||
result += joined_size(y, delimiter);
|
|
||||||
}
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue