reduce template usage to a minimum
This commit is contained in:
parent
d665e3b961
commit
028db547d1
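This change replaces the templated std::basic_string_view<CharT> interfaces with plain std::wstring_view (and std::vector<std::wstring_view>) overloads, so callers no longer instantiate the algorithms per character type. A minimal calling sketch, assuming the new levenshtein.hpp shown below is included and the accompanying .cpp files are compiled in (the exact file paths are not visible on this page):

#include "levenshtein.hpp"
#include <cstddef>
#include <iostream>

int main() {
  // both overloads now take std::wstring_view directly, no template arguments needed
  std::size_t dist = levenshtein::distance(L"lewenstein", L"levenshtein");
  float ratio = levenshtein::normalized_distance(L"lewenstein", L"levenshtein", 0.5f);
  std::wcout << dist << L" " << ratio << L"\n";
  return 0;
}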
@@ -1,6 +1,7 @@
.vscode/
__pycache__/
.idea/
build/
rapidfuzz.egg-info/
dist/
*.data
@@ -0,0 +1,241 @@
#include "levenshtein.hpp"

levenshtein::Matrix levenshtein::matrix(std::wstring_view sentence1, std::wstring_view sentence2) {
  Affix affix = utils::remove_common_affix(sentence1, sentence2);

  std::size_t matrix_columns = sentence1.length() + 1;
  std::size_t matrix_rows = sentence2.length() + 1;

  std::vector<std::size_t> cache_matrix(matrix_rows*matrix_columns, 0);

  for (std::size_t i = 0; i < matrix_rows; ++i) {
    cache_matrix[i] = i;
  }

  for (std::size_t i = 1; i < matrix_columns; ++i) {
    cache_matrix[matrix_rows*i] = i;
  }

  std::size_t sentence1_pos = 0;
  for (const auto &char1 : sentence1) {
    auto prev_cache = cache_matrix.begin() + sentence1_pos * matrix_rows;
    auto result_cache = cache_matrix.begin() + (sentence1_pos + 1) * matrix_rows + 1;
    std::size_t result = sentence1_pos + 1;
    for (const auto &char2 : sentence2) {
      result = std::min({
        result + 1,
        *prev_cache + (char1 != char2),
        *(++prev_cache) + 1
      });
      *result_cache = result;
      ++result_cache;
    }
    ++sentence1_pos;
  }

  return Matrix {
    affix.prefix_len,
    cache_matrix,
    matrix_columns,
    matrix_rows
  };
}
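// Layout note: cache_matrix stores the whole (sentence1.length()+1) x (sentence2.length()+1)
// DP table in one flat vector, one contiguous block of matrix_rows entries per sentence1
// position, so the cell for sentence1 prefix i and sentence2 prefix j lives at index
// i * matrix_rows + j. The first block and the first entry of every block hold the distances
// against the empty string, and prefix_len records how many common leading characters were
// stripped by remove_common_affix before the table was built.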

std::vector<levenshtein::EditOp> levenshtein::editops(std::wstring_view sentence1, std::wstring_view sentence2) {
  auto m = matrix(sentence1, sentence2);
  std::size_t matrix_columns = m.matrix_columns;
  std::size_t matrix_rows = m.matrix_rows;
  std::size_t prefix_len = m.prefix_len;
  auto lev_matrix = m.matrix;

  std::vector<EditOp> ops;
  ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);

  std::size_t i = matrix_columns - 1;
  std::size_t j = matrix_rows - 1;
  std::size_t position = matrix_columns * matrix_rows - 1;

  auto is_replace = [=](std::size_t pos) {
    return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
  };
  auto is_insert = [=](std::size_t pos) {
    return lev_matrix[pos - 1] < lev_matrix[pos];
  };
  auto is_delete = [=](std::size_t pos) {
    return lev_matrix[pos - matrix_rows] < lev_matrix[pos];
  };
  auto is_keep = [=](std::size_t pos) {
    return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
  };

  while (i > 0 || j > 0) {
    EditType op_type;

    if (i && j && is_replace(position)) {
      op_type = EditType::EditReplace;
      --i;
      --j;
      position -= matrix_rows + 1;
    } else if (j && is_insert(position)) {
      op_type = EditType::EditInsert;
      --j;
      --position;
    } else if (i && is_delete(position)) {
      op_type = EditType::EditDelete;
      --i;
      position -= matrix_rows;
    } else if (is_keep(position)) {
      --i;
      --j;
      position -= matrix_rows + 1;
      // EditKeep does not have to be stored
      continue;
    } else {
      throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
    }

    ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
  }

  std::reverse(ops.begin(), ops.end());
  return ops;
}


std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2) {
  auto edit_ops = editops(sentence1, sentence2);
  std::size_t first_start = 0;
  std::size_t second_start = 0;
  std::vector<MatchingBlock> mblocks;

  for (const auto &op : edit_ops) {
    if (op.op_type == EditType::EditKeep) {
      continue;
    }

    if (first_start < op.first_start || second_start < op.second_start) {
      mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
      first_start = op.first_start;
      second_start = op.second_start;
    }

    switch (op.op_type) {
    case EditType::EditReplace:
      first_start += 1;
      second_start += 1;
      break;
    case EditType::EditDelete:
      first_start += 1;
      break;
    case EditType::EditInsert:
      second_start += 1;
      break;
    case EditType::EditKeep:
      break;
    }
  }

  mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
  return mblocks;
}


float levenshtein::normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio) {
  if (sentence1.empty() || sentence2.empty()) {
    return sentence1.empty() && sentence2.empty();
  }

  std::size_t sentence1_len = utils::joined_size(sentence1);
  std::size_t sentence2_len = utils::joined_size(sentence2);
  std::size_t max_len = std::max(sentence1_len, sentence2_len);

  // constant time calculation to find a string ratio based on the string length
  // so it can exit early without running any levenshtein calculations
  std::size_t min_distance = (sentence1_len > sentence2_len)
    ? sentence1_len - sentence2_len
    : sentence2_len - sentence1_len;

  float len_ratio = 1.0 - (float)min_distance / (float)max_len;
  if (len_ratio < min_ratio) {
    return 0.0;
  }

  std::size_t dist = distance(sentence1, sentence2);

  float ratio = 1.0 - (float)dist / (float)max_len;
  return (ratio >= min_ratio) ? ratio : 0.0;
}
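// Worked example of the early exit above: for joined sizes 10 and 5, min_distance = 5 and
// max_len = 10, so len_ratio = 1 - 5/10 = 0.5. The edit distance can never be smaller than
// the length difference, so len_ratio is an upper bound on the final ratio; with
// min_ratio = 0.8 the function returns 0.0 here without running the O(len1*len2)
// distance() loop at all.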

std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) {

  utils::remove_common_affix(sentence1, sentence2);

  if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);

  if (sentence2.empty()) {
    return sentence1.length();
  }

  std::vector<std::size_t> cache(sentence2.length()+1);
  std::iota(cache.begin(), cache.end(), 0);

  for (const auto &char1 : sentence1) {
    auto cache_iter = cache.begin();
    std::size_t temp = *cache_iter;
    *cache_iter += 1;

    for (const auto& char2 : sentence2) {
      if (char1 != char2) {
        ++temp;
      }

      temp = std::min({
        *cache_iter + 1,
        *(++cache_iter) + 1,
        temp
      });
      std::swap(*cache_iter, temp);
    }
  }
  return cache.back();
}


std::size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2,
                                          std::size_t insert_cost, std::size_t delete_cost, std::size_t replace_cost)
{
  utils::remove_common_affix(sentence1, sentence2);
  if (sentence1.size() > sentence2.size()) {
    std::swap(sentence1, sentence2);
    std::swap(insert_cost, delete_cost);
  }

  std::vector<std::size_t> cache(sentence1.size() + 1);

  cache[0] = 0;
  for (std::size_t i = 1; i < cache.size(); ++i) {
    cache[i] = cache[i - 1] + delete_cost;
  }

  for (const auto &char2 : sentence2) {
    auto cache_iter = cache.begin();
    std::size_t temp = *cache_iter;
    *cache_iter += insert_cost;

    for (const auto &char1 : sentence1) {
      if (char1 != char2) {
        temp = std::min({
          *cache_iter + delete_cost,
          *(cache_iter+1) + insert_cost,
          temp + replace_cost
        });
      }
      ++cache_iter;
      std::swap(*cache_iter, temp);
    }
  }

  return cache.back();
}
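The edit operations and matching blocks defined above can be consumed directly; a short sketch, assuming the EditOp and MatchingBlock declarations from levenshtein.hpp further down in this diff (op_type, first_start, second_start and len are the fields used in this file):

#include "levenshtein.hpp"
#include <iostream>

int main() {
  // indices in the ops and blocks refer to the original inputs (prefix_len is added back)
  for (const auto &op : levenshtein::editops(L"kitten", L"sitting")) {
    std::wcout << static_cast<int>(op.op_type) << L" at "
               << op.first_start << L"," << op.second_start << L"\n";
  }
  for (const auto &block : levenshtein::matching_blocks(L"kitten", L"sitting")) {
    std::wcout << L"match of length " << block.len << L" at "
               << block.first_start << L"," << block.second_start << L"\n";
  }
  return 0;
}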
@@ -30,11 +30,9 @@ namespace levenshtein {
    std::size_t matrix_rows;
  };

  template<typename CharT>
  Matrix matrix(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
  Matrix matrix(std::wstring_view sentence1, std::wstring_view sentence2);

  template<typename CharT>
  std::vector<EditOp> editops(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
  std::vector<EditOp> editops(std::wstring_view sentence1, std::wstring_view sentence2);

  struct MatchingBlock {
    std::size_t first_start;
@@ -44,8 +42,7 @@ namespace levenshtein {
      : first_start(first_start), second_start(second_start), len(len) {}
  };

  template<typename CharT>
  std::vector<MatchingBlock> matching_blocks(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
  std::vector<MatchingBlock> matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2);


  float normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio=0.0);
@@ -53,8 +50,8 @@ namespace levenshtein {
  std::size_t distance(std::wstring_view sentence1, std::wstring_view sentence2);


  template<typename MaxDistanceCalc=std::false_type, typename CharT>
  auto levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
  template<typename MaxDistanceCalc=std::false_type>
  auto levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector<std::wstring_view> &words,
                            std::vector<std::size_t> &cache, std::size_t current_cache);

  /**
@@ -75,20 +72,14 @@ namespace levenshtein {
   * so when it can not exit early it should not be used
   * @return weighted levenshtein distance
   */
  template<typename CharT, typename MaxDistance=std::nullopt_t>
  std::size_t weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MaxDistance max_distance=std::nullopt);

  template<typename MaxDistance=std::nullopt_t>
  std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MaxDistance max_distance=std::nullopt);

  template<typename MaxDistance=std::nullopt_t>
  std::size_t weighted_distance(std::string_view sentence1, std::string_view sentence2, MaxDistance max_distance=std::nullopt);

  template<typename CharT, typename MaxDistance=std::nullopt_t>
  std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance=std::nullopt);
  std::size_t weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, MaxDistance max_distance=std::nullopt);


  size_t generic_distance(std::wstring_view source, std::wstring_view target, size_t insert_cost = 1, size_t delete_cost = 1, size_t replace_cost = 1);
  std::size_t generic_distance(std::wstring_view source, std::wstring_view target, std::size_t insert_cost = 1, std::size_t delete_cost = 1, std::size_t replace_cost = 1);

  /**
   * Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
@@ -100,214 +91,8 @@ namespace levenshtein {


template<typename CharT>
inline levenshtein::Matrix levenshtein::matrix(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2) {
  Affix affix = remove_common_affix(sentence1, sentence2);

  std::size_t matrix_columns = sentence1.length() + 1;
  std::size_t matrix_rows = sentence2.length() + 1;

  std::vector<std::size_t> cache_matrix(matrix_rows*matrix_columns, 0);

  for (std::size_t i = 0; i < matrix_rows; ++i) {
    cache_matrix[i] = i;
  }

  for (std::size_t i = 1; i < matrix_columns; ++i) {
    cache_matrix[matrix_rows*i] = i;
  }

  std::size_t sentence1_pos = 0;
  for (const auto &char1 : sentence1) {
    auto prev_cache = cache_matrix.begin() + sentence1_pos * matrix_rows;
    auto result_cache = cache_matrix.begin() + (sentence1_pos + 1) * matrix_rows + 1;
    std::size_t result = sentence1_pos + 1;
    for (const auto &char2 : sentence2) {
      result = std::min({
        result + 1,
        *prev_cache + (char1 != char2),
        *(++prev_cache) + 1
      });
      *result_cache = result;
      ++result_cache;
    }
    ++sentence1_pos;
  }

  return Matrix {
    affix.prefix_len,
    cache_matrix,
    matrix_columns,
    matrix_rows
  };
}


template<typename CharT>
inline std::vector<levenshtein::EditOp>
levenshtein::editops(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2) {
  auto m = matrix(sentence1, sentence2);
  std::size_t matrix_columns = m.matrix_columns;
  std::size_t matrix_rows = m.matrix_rows;
  std::size_t prefix_len = m.prefix_len;
  auto lev_matrix = m.matrix;

  std::vector<EditOp> ops;
  ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);

  std::size_t i = matrix_columns - 1;
  std::size_t j = matrix_rows - 1;
  std::size_t position = matrix_columns * matrix_rows - 1;

  auto is_replace = [=](std::size_t pos) {
    return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
  };
  auto is_insert = [=](std::size_t pos) {
    return lev_matrix[pos - 1] < lev_matrix[pos];
  };
  auto is_delete = [=](std::size_t pos) {
    return lev_matrix[pos - matrix_rows] < lev_matrix[pos];
  };
  auto is_keep = [=](std::size_t pos) {
    return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
  };

  while (i > 0 || j > 0) {
    EditType op_type;

    if (i && j && is_replace(position)) {
      op_type = EditType::EditReplace;
      --i;
      --j;
      position -= matrix_rows + 1;
    } else if (j && is_insert(position)) {
      op_type = EditType::EditInsert;
      --j;
      --position;
    } else if (i && is_delete(position)) {
      op_type = EditType::EditDelete;
      --i;
      position -= matrix_rows;
    } else if (is_keep(position)) {
      --i;
      --j;
      position -= matrix_rows + 1;
      // EditKeep does not have to be stored
      continue;
    } else {
      throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
    }

    ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
  }

  std::reverse(ops.begin(), ops.end());
  return ops;
}


template<typename CharT>
inline std::vector<levenshtein::MatchingBlock>
levenshtein::matching_blocks(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2) {
  auto edit_ops = editops(sentence1, sentence2);
  std::size_t first_start = 0;
  std::size_t second_start = 0;
  std::vector<MatchingBlock> mblocks;

  for (const auto &op : edit_ops) {
    if (op.op_type == EditType::EditKeep) {
      continue;
    }

    if (first_start < op.first_start || second_start < op.second_start) {
      mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
      first_start = op.first_start;
      second_start = op.second_start;
    }

    switch (op.op_type) {
    case EditType::EditReplace:
      first_start += 1;
      second_start += 1;
      break;
    case EditType::EditDelete:
      first_start += 1;
      break;
    case EditType::EditInsert:
      second_start += 1;
      break;
    case EditType::EditKeep:
      break;
    }
  }

  mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
  return mblocks;
}

inline float levenshtein::normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio) {
  if (sentence1.empty() || sentence2.empty()) {
    return sentence1.empty() && sentence2.empty();
  }

  std::size_t sentence1_len = utils::joined_size(sentence1);
  std::size_t sentence2_len = utils::joined_size(sentence2);
  std::size_t max_len = std::max(sentence1_len, sentence2_len);

  // constant time calculation to find a string ratio based on the string length
  // so it can exit early without running any levenshtein calculations
  std::size_t min_distance = (sentence1_len > sentence2_len)
    ? sentence1_len - sentence2_len
    : sentence2_len - sentence1_len;

  float len_ratio = 1.0 - (float)min_distance / (float)max_len;
  if (len_ratio < min_ratio) {
    return 0.0;
  }

  std::size_t dist = distance(sentence1, sentence2);

  float ratio = 1.0 - (float)dist / (float)max_len;
  return (ratio >= min_ratio) ? ratio : 0.0;
}

inline std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) {

  remove_common_affix(sentence1, sentence2);

  if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);

  if (sentence2.empty()) {
    return sentence1.length();
  }

  std::vector<std::size_t> cache(sentence2.length()+1);
  std::iota(cache.begin(), cache.end(), 0);

  for (const auto &char1 : sentence1) {
    auto cache_iter = cache.begin();
    size_t temp = *cache_iter;
    *cache_iter += 1;

    for (const auto& char2 : sentence2) {
      if (char1 != char2) {
        ++temp;
      }

      temp = std::min({
        *cache_iter + 1,
        *(++cache_iter) + 1,
        temp
      });
      std::swap(*cache_iter, temp);
    }
  }
  return cache.back();
}

template<typename MaxDistanceCalc, typename CharT>
inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
template<typename MaxDistanceCalc>
inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector<std::wstring_view> &words,
                                              std::vector<std::size_t> &cache, std::size_t current_cache)
{
  std::size_t result = current_cache + 1;
@@ -315,7 +100,7 @@ inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const str
  auto word_iter = words.begin();
  auto min_distance = std::numeric_limits<std::size_t>::max();

  auto charCmp = [&] (const CharT &char2) {
  auto charCmp = [&] (const wchar_t &char2) {
    if (letter_cmp == char2) { result = current_cache; }
    else { ++result; }

@@ -356,9 +141,9 @@ inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const str
}


template<typename CharT, typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance) {
  remove_common_affix(sentence1, sentence2);
template<typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, MaxDistance max_distance) {
  utils::remove_common_affix(sentence1, sentence2);
  std::size_t sentence1_len = utils::joined_size(sentence1);
  std::size_t sentence2_len = utils::joined_size(sentence2);

@@ -380,7 +165,7 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentenc
  // no delimiter in front of first word
  for (const auto &letter : *word_iter) {
    if constexpr(!std::is_same_v<MaxDistance, std::nullopt_t>) {
      size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
      std::size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
      if (min_distance > max_distance) {
        return std::numeric_limits<std::size_t>::max();
      }
@@ -395,19 +180,19 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentenc
  for (; word_iter != sentence1.end(); ++word_iter) {
    // whitespace between words
    if constexpr(!std::is_same_v<MaxDistance, std::nullopt_t>) {
      size_t min_distance = levenshtein_word_cmp<std::true_type>((CharT)0x20, sentence2, cache, range1_pos);
      std::size_t min_distance = levenshtein_word_cmp<std::true_type>((wchar_t)0x20, sentence2, cache, range1_pos);
      if (min_distance > max_distance) {
        return std::numeric_limits<std::size_t>::max();
      }
    } else {
      levenshtein_word_cmp((CharT)0x20, sentence2, cache, range1_pos);
      levenshtein_word_cmp((wchar_t)0x20, sentence2, cache, range1_pos);
    }

    ++range1_pos;

    for (const auto &letter : *word_iter) {
      if constexpr(!std::is_same_v<MaxDistance, std::nullopt_t>) {
        size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
        std::size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
        if (min_distance > max_distance) {
          return std::numeric_limits<std::size_t>::max();
        }
@@ -425,20 +210,7 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentenc

template<typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MaxDistance max_distance) {
  return weighted_distance_impl(sentence1, sentence2, max_distance);
}


template<typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, MaxDistance max_distance) {
  return weighted_distance_impl(sentence1, sentence2, max_distance);
}


template<typename CharT, typename MaxDistance>
inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MaxDistance max_distance) {

  remove_common_affix(sentence1, sentence2);
  utils::remove_common_affix(sentence1, sentence2);

  if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);

@@ -488,43 +260,7 @@ inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<Ch
}


inline size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2,
                                            size_t insert_cost, size_t delete_cost, size_t replace_cost)
{
  remove_common_affix(sentence1, sentence2);
  if (sentence1.size() > sentence2.size()) {
    std::swap(sentence1, sentence2);
    std::swap(insert_cost, delete_cost);
  }

  const size_t min_size = sentence1.size();
  std::vector<size_t> cache(sentence1.size() + 1);

  cache[0] = 0;
  for (size_t i = 1; i < cache.size(); ++i) {
    cache[i] = cache[i - 1] + delete_cost;
  }

  for (const auto &char2 : sentence2) {
    auto cache_iter = cache.begin();
    size_t temp = *cache_iter;
    *cache_iter += insert_cost;

    for (const auto &char1 : sentence1) {
      if (char1 != char2) {
        temp = std::min({
          *cache_iter + delete_cost,
          *(cache_iter+1) + insert_cost,
          temp + replace_cost
        });
      }
      ++cache_iter;
      std::swap(*cache_iter, temp);
    }
  }

  return cache.back();
}


template<typename Sentence1, typename Sentence2>
@@ -533,6 +269,7 @@ inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence
  if (sentence1.empty() || sentence2.empty()) {
    return sentence1.empty() && sentence2.empty();
  }
  return 1;

  std::size_t sentence1_len = utils::joined_size(sentence1);
  std::size_t sentence2_len = utils::joined_size(sentence2);
@@ -0,0 +1,151 @@
#include "utils.hpp"

/**
 * Finds the longest common prefix between two ranges
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
                                 InputIterator2 first2, InputIterator2 last2)
{
  return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
}

/**
 * Removes common prefix of two string views
 */
std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) {
  auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
  a.remove_prefix(prefix);
  b.remove_prefix(prefix);
  return prefix;
}

/**
 * Removes common suffix of two string views
 */
std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) {
  auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
  a.remove_suffix(suffix);
  b.remove_suffix(suffix);
  return suffix;
}

/**
 * Removes common affix of two string views
 */
Affix utils::remove_common_affix(std::wstring_view& a, std::wstring_view& b) {
  return Affix {
    remove_common_prefix(a, b),
    remove_common_suffix(a, b)
  };
}

template<typename T>
void vec_remove_common_affix(T &a, T &b) {
  auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
  a.erase(a.begin(), prefix.first);
  b.erase(b.begin(), prefix.second);

  auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
  a.erase(a.end()-suffix, a.end());
  b.erase(b.end()-suffix, b.end());
}

void utils::remove_common_affix(std::vector<std::wstring_view> &a, std::vector<std::wstring_view> &b)
{
  vec_remove_common_affix(a, b);
  if (!a.empty() && !b.empty()) {
    remove_common_prefix(a.front(), b.front());
    remove_common_suffix(a.back(), b.back());
  }
}

std::wstring utils::join(const std::vector<std::wstring_view> &sentence) {
  if (sentence.empty()) {
    return std::wstring();
  }

  auto sentence_iter = sentence.begin();
  std::wstring result {*sentence_iter};
  const std::wstring whitespace {0x20};
  ++sentence_iter;
  for (; sentence_iter != sentence.end(); ++sentence_iter) {
    result.append(whitespace).append(std::wstring {*sentence_iter});
  }
  return result;
}

percent utils::result_cutoff(float result, percent score_cutoff) {
  return (result >= score_cutoff) ? result : 0;
}

// trim from start (in place)
void ltrim(std::wstring &s) {
  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
    return !std::isspace(ch);
  }));
}


// trim from end (in place)
void rtrim(std::wstring &s) {
  s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
    return !std::isspace(ch);
  }).base(), s.end());
}


// trim from both ends (in place)
void utils::trim(std::wstring &s) {
  ltrim(s);
  rtrim(s);
}


void utils::lower_case(std::wstring &s) {
  std::for_each(s.begin(), s.end(), [](wchar_t & c){
    c = ::tolower(c);
  });
}

std::wstring utils::default_process(std::wstring s) {
  trim(s);
  lower_case(s);
  return s;
}

DecomposedSet utils::set_decomposition(std::vector<std::wstring_view> a, std::vector<std::wstring_view> b) {
  std::vector<std::wstring_view> intersection;
  std::vector<std::wstring_view> difference_ab;
  a.erase(std::unique(a.begin(), a.end()), a.end());
  b.erase(std::unique(b.begin(), b.end()), b.end());

  for (const auto &current_a : a) {
    auto element_b = std::find(b.begin(), b.end(), current_a);
    if (element_b != b.end()) {
      b.erase(element_b);
      intersection.emplace_back(current_a);
    } else {
      difference_ab.emplace_back(current_a);
    }
  }

  return DecomposedSet{intersection, difference_ab, b};
}

std::size_t utils::joined_size(const std::wstring_view &x){
  return x.size();
}


std::size_t utils::joined_size(const std::vector<std::wstring_view> &x){
  if (x.empty()) {
    return 0;
  }

  // there is a whitespace between each word
  std::size_t result = x.size() - 1;
  for (const auto &y: x) result += y.size();

  return result;
}
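The helpers above combine into the usual preprocessing pipeline; a small sketch, assuming utils.hpp is included and that splitSV tokenises on whitespace (its body is cut off in this view):

#include "utils.hpp"
#include <iostream>
#include <string>
#include <vector>

int main() {
  // trim + lowercase, then split into word views that point into `processed`
  std::wstring processed = utils::default_process(L"  Some Test Sentence ");
  std::vector<std::wstring_view> words = utils::splitSV(processed);
  std::wcout << utils::join(words) << L"\n";         // re-joined with single spaces
  std::wcout << utils::joined_size(words) << L"\n";  // total characters plus one space per gap
  return 0;
}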
@@ -7,54 +7,34 @@

using percent = float;

template<typename CharT>
using string_view_vec = std::vector<std::basic_string_view<CharT>>;


namespace detail {
  template<typename T>
  auto char_type(T const*) -> T;

  template<typename T, typename U = typename T::const_iterator>
  auto char_type(T const&) -> typename std::iterator_traits<U>::value_type;
}

template<typename T>
using char_type = decltype(detail::char_type(std::declval<T const&>()));


template<typename CharT>
struct DecomposedSet {
  string_view_vec<CharT> intersection;
  string_view_vec<CharT> difference_ab;
  string_view_vec<CharT> difference_ba;
  DecomposedSet(string_view_vec<CharT> intersection, string_view_vec<CharT> difference_ab, string_view_vec<CharT> difference_ba)
  std::vector<std::wstring_view> intersection;
  std::vector<std::wstring_view> difference_ab;
  std::vector<std::wstring_view> difference_ba;
  DecomposedSet(std::vector<std::wstring_view> intersection, std::vector<std::wstring_view> difference_ab, std::vector<std::wstring_view> difference_ba)
    : intersection(std::move(intersection)), difference_ab(std::move(difference_ab)), difference_ba(std::move(difference_ba)) {}
};


struct Affix {
  std::size_t prefix_len;
  std::size_t suffix_len;
};

namespace utils {

  template<
    typename T, typename CharT = char_type<T>,
    typename = std::enable_if_t<std::is_convertible<T const&, std::basic_string_view<CharT>>{}>
  >
  string_view_vec<CharT> splitSV(const T &str);


  template<typename CharT>
  DecomposedSet<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);


  template<typename T>
  std::size_t joined_size(const T &x);
  std::vector<std::wstring_view> splitSV(const T &str);

  template<typename T>
  std::size_t joined_size(const std::vector<T> &x);
  DecomposedSet set_decomposition(std::vector<std::wstring_view> a, std::vector<std::wstring_view> b);


  template<typename CharT>
  std::basic_string<CharT> join(const string_view_vec<CharT> &sentence);
  std::size_t joined_size(const std::wstring_view &x);

  std::size_t joined_size(const std::vector<std::wstring_view> &x);


  std::wstring join(const std::vector<std::wstring_view> &sentence);

  percent result_cutoff(float result, percent score_cutoff);

@@ -62,12 +42,16 @@ namespace utils {
  void lower_case(std::wstring &s);

  std::wstring default_process(std::wstring s);

  Affix remove_common_affix(std::wstring_view& a, std::wstring_view& b);

  void remove_common_affix(std::vector<std::wstring_view> &a, std::vector<std::wstring_view> &b);
}


template<typename T, typename CharT, typename>
string_view_vec<CharT> utils::splitSV(const T &str) {
  string_view_vec<CharT> output;
template<typename T>
inline std::vector<std::wstring_view> utils::splitSV(const T &str) {
  std::vector<std::wstring_view> output;
  // assume a word length of 6 + 1 whitespace
  output.reserve(str.size() / 7);

@@ -82,178 +66,3 @@ string_view_vec<CharT> utils::splitSV(const T &str) {

  return output;
}


template<typename CharT>
DecomposedSet<CharT> utils::set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b) {
  string_view_vec<CharT> intersection;
  string_view_vec<CharT> difference_ab;
  a.erase(std::unique(a.begin(), a.end()), a.end());
  b.erase(std::unique(b.begin(), b.end()), b.end());

  for (const auto &current_a : a) {
    auto element_b = std::find(b.begin(), b.end(), current_a);
    if (element_b != b.end()) {
      b.erase(element_b);
      intersection.emplace_back(current_a);
    } else {
      difference_ab.emplace_back(current_a);
    }
  }

  return DecomposedSet{intersection, difference_ab, b};
}


/**
 * Finds the longest common prefix between two ranges
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
                                 InputIterator2 first2, InputIterator2 last2)
{
  return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
}

/**
 * Removes common prefix of two string views
 */
template<typename CharT>
inline std::size_t remove_common_prefix(std::basic_string_view<CharT>& a, std::basic_string_view<CharT>& b) {
  auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
  a.remove_prefix(prefix);
  b.remove_prefix(prefix);
  return prefix;
}

/**
 * Removes common suffix of two string views
 */
template<typename CharT>
inline std::size_t remove_common_suffix(std::basic_string_view<CharT>& a, std::basic_string_view<CharT>& b) {
  auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
  a.remove_suffix(suffix);
  b.remove_suffix(suffix);
  return suffix;
}

struct Affix {
  std::size_t prefix_len;
  std::size_t suffix_len;
};

/**
 * Removes common affix of two string views
 */
template<typename CharT>
inline Affix remove_common_affix(std::basic_string_view<CharT>& a, std::basic_string_view<CharT>& b) {
  return Affix {
    remove_common_prefix(a, b),
    remove_common_suffix(a, b)
  };
}


template<typename T>
inline void vec_remove_common_affix(T &a, T &b) {
  auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
  a.erase(a.begin(), prefix.first);
  b.erase(b.begin(), prefix.second);

  auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
  a.erase(a.end()-suffix, a.end());
  b.erase(b.end()-suffix, b.end());
}

template<typename T>
inline void vec_common_affix(std::vector<T> &a, std::vector<T> &b) {
  iterable_remove_common_affix(a, b);
}

template<typename T>
inline void remove_common_affix(std::vector<T> &a, std::vector<T> &b)
{
  vec_remove_common_affix(a, b);
  if (!a.empty() && !b.empty()) {
    remove_common_prefix(a.front(), b.front());
    remove_common_suffix(a.back(), b.back());
  }
}


template<typename T>
inline std::size_t utils::joined_size(const T &x){
  return x.size();
}


template<typename T>
inline std::size_t utils::joined_size(const std::vector<T> &x){
  if (x.empty()) {
    return 0;
  }

  // there is a whitespace between each word
  std::size_t result = x.size() - 1;
  for (const auto &y: x) result += y.size();

  return result;
}


template<typename CharT>
std::basic_string<CharT> utils::join(const string_view_vec<CharT> &sentence) {
  if (sentence.empty()) {
    return std::basic_string<CharT>();
  }

  auto sentence_iter = sentence.begin();
  std::basic_string<CharT> result {*sentence_iter};
  const std::basic_string<CharT> whitespace {0x20};
  ++sentence_iter;
  for (; sentence_iter != sentence.end(); ++sentence_iter) {
    result.append(whitespace).append(std::basic_string<CharT> {*sentence_iter});
  }
  return result;
}


inline percent utils::result_cutoff(float result, percent score_cutoff) {
  return (result >= score_cutoff) ? result : 0;
}


// trim from start (in place)
inline void ltrim(std::wstring &s) {
  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
    return !std::isspace(ch);
  }));
}


// trim from end (in place)
inline void rtrim(std::wstring &s) {
  s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
    return !std::isspace(ch);
  }).base(), s.end());
}


// trim from both ends (in place)
inline void utils::trim(std::wstring &s) {
  ltrim(s);
  rtrim(s);
}


inline void utils::lower_case(std::wstring &s) {
  std::for_each(s.begin(), s.end(), [](wchar_t & c){
    c = ::tolower(c);
  });
}

inline std::wstring utils::default_process(std::wstring s) {
  trim(s);
  lower_case(s);
  return s;
}