reduce template usage to a minimum

This commit is contained in:
maxbachmann 2020-03-31 15:16:03 +02:00
parent d665e3b961
commit 028db547d1
No known key found for this signature in database
GPG Key ID: 60334E83C23820B8
6 changed files with 439 additions and 498 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
.vscode/
__pycache__/
.idea/
build/
rapidfuzz.egg-info/
dist/
*.data

241
cpp/src/levenshtein.cpp Normal file
View File

@ -0,0 +1,241 @@
#include "levenshtein.hpp"
/**
 * Builds the complete Levenshtein cost matrix for two strings.
 *
 * The common prefix and suffix of both views are stripped first (the views are
 * taken by value, so the caller's views are not modified); the matrix only
 * covers what is left. The matrix is stored flat: the cell for the first `i`
 * characters of sentence1 and the first `j` characters of sentence2 is found
 * at index `i * matrix_rows + j`.
 *
 * @param sentence1 first string
 * @param sentence2 second string
 * @return the length of the stripped prefix, the flat matrix, and its
 *         dimensions (matrix_columns = len(sentence1) + 1,
 *         matrix_rows = len(sentence2) + 1, both after stripping)
 */
levenshtein::Matrix levenshtein::matrix(std::wstring_view sentence1, std::wstring_view sentence2) {
    Affix affix = utils::remove_common_affix(sentence1, sentence2);

    const std::size_t matrix_columns = sentence1.length() + 1;
    const std::size_t matrix_rows = sentence2.length() + 1;

    std::vector<std::size_t> cache_matrix(matrix_rows * matrix_columns, 0);

    // border cells: transforming from / to the empty string costs one edit per character
    for (std::size_t j = 0; j < matrix_rows; ++j) {
        cache_matrix[j] = j;
    }
    for (std::size_t i = 1; i < matrix_columns; ++i) {
        cache_matrix[matrix_rows * i] = i;
    }

    for (std::size_t i = 0; i < sentence1.length(); ++i) {
        // offsets of the stripe that is already complete and of the one being filled
        const std::size_t prev_offset = i * matrix_rows;
        const std::size_t cur_offset = prev_offset + matrix_rows;

        for (std::size_t j = 0; j < sentence2.length(); ++j) {
            const std::size_t from_same_stripe = cache_matrix[cur_offset + j] + 1;
            const std::size_t from_diagonal = cache_matrix[prev_offset + j] + (sentence1[i] != sentence2[j]);
            const std::size_t from_prev_stripe = cache_matrix[prev_offset + j + 1] + 1;

            cache_matrix[cur_offset + j + 1] = std::min({
                from_same_stripe,
                from_diagonal,
                from_prev_stripe
            });
        }
    }

    return Matrix {
        affix.prefix_len,
        cache_matrix,
        matrix_columns,
        matrix_rows
    };
}
/**
 * Extracts the edit operations (replace / insert / delete) that transform
 * sentence1 into sentence2.
 *
 * The Levenshtein matrix is walked backwards from its last cell to its first
 * cell; the collected operations are reversed before returning, so they are
 * ordered from the start of the strings to the end. Positions stored in the
 * operations are offset by the length of the common prefix that matrix()
 * stripped. Cells where nothing changes (keep) are skipped and never stored.
 *
 * @param sentence1 source string
 * @param sentence2 target string
 * @return the edit operations in ascending position order
 * @throws std::logic_error if no valid predecessor cell is found while
 *         walking the matrix
 */
std::vector<levenshtein::EditOp> levenshtein::editops(std::wstring_view sentence1, std::wstring_view sentence2) {
    const auto m = matrix(sentence1, sentence2);
    const std::size_t matrix_columns = m.matrix_columns;
    const std::size_t matrix_rows = m.matrix_rows;
    const std::size_t prefix_len = m.prefix_len;
    // Bind by reference: the matrix holds matrix_columns * matrix_rows cells, and
    // copying it here and again into every lambda below would be pure overhead.
    const auto &lev_matrix = m.matrix;

    std::vector<EditOp> ops;
    // the value in the last cell is the edit distance, i.e. the number of operations
    ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);

    std::size_t i = matrix_columns - 1;
    std::size_t j = matrix_rows - 1;
    std::size_t position = matrix_columns * matrix_rows - 1;

    // The lambdas only live inside this function, so capturing by reference is safe.
    auto is_replace = [&](std::size_t pos) {
        return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
    };
    auto is_insert = [&](std::size_t pos) {
        return lev_matrix[pos - 1] < lev_matrix[pos];
    };
    auto is_delete = [&](std::size_t pos) {
        return lev_matrix[pos - matrix_rows] < lev_matrix[pos];
    };
    auto is_keep = [&](std::size_t pos) {
        return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
    };

    while (i > 0 || j > 0) {
        EditType op_type;

        if (i && j && is_replace(position)) {
            op_type = EditType::EditReplace;
            --i;
            --j;
            position -= matrix_rows + 1;
        } else if (j && is_insert(position)) {
            op_type = EditType::EditInsert;
            --j;
            --position;
        } else if (i && is_delete(position)) {
            op_type = EditType::EditDelete;
            --i;
            position -= matrix_rows;
        } else if (i && j && is_keep(position)) {
            // the diagonal cell only exists when both indices are non-zero;
            // without the guard the index computation would underflow
            --i;
            --j;
            position -= matrix_rows + 1;
            // EditKeep does not have to be stored
            continue;
        } else {
            throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
        }

        ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
    }

    std::reverse(ops.begin(), ops.end());
    return ops;
}
/**
 * Computes the blocks of characters that are identical in both strings.
 *
 * The blocks are derived from the edit operations: every gap between two
 * consecutive operations is a run of unchanged characters. The run that follows
 * the last operation (which includes any common suffix) is emitted as well.
 * The result always ends with a zero-length block positioned at
 * (len(sentence1), len(sentence2)).
 *
 * @param sentence1 first string
 * @param sentence2 second string
 * @return matching blocks as (start in sentence1, start in sentence2, length),
 *         terminated by the zero-length block
 */
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2) {
    const auto edit_ops = editops(sentence1, sentence2);

    // start of the not yet reported part in each string
    std::size_t first_start = 0;
    std::size_t second_start = 0;

    std::vector<MatchingBlock> mblocks;

    for (const auto &op : edit_ops) {
        if (op.op_type == EditType::EditKeep) {
            continue;
        }

        // everything between the previous operation and this one is unchanged
        if (first_start < op.first_start || second_start < op.second_start) {
            mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
            first_start = op.first_start;
            second_start = op.second_start;
        }

        // step over the characters consumed by the operation itself
        switch (op.op_type) {
        case EditType::EditReplace:
            first_start += 1;
            second_start += 1;
            break;
        case EditType::EditDelete:
            first_start += 1;
            break;
        case EditType::EditInsert:
            second_start += 1;
            break;
        case EditType::EditKeep:
            break;
        }
    }

    // Characters after the last edit operation are unchanged too. Previously this
    // trailing run was dropped, so e.g. identical strings produced no real block.
    if (first_start < sentence1.length() && second_start < sentence2.length()) {
        mblocks.emplace_back(
            first_start,
            second_start,
            std::min(sentence1.length() - first_start, sentence2.length() - second_start));
    }

    // terminating zero-length block
    mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
    return mblocks;
}
/**
 * Levenshtein distance scaled by the length of the longer string.
 *
 * @param sentence1 first string
 * @param sentence2 second string
 * @param min_ratio results below this value are reported as 0
 * @return 1 when both strings are empty, 0 when exactly one is empty,
 *         otherwise 1 - distance / max_len, or 0 when that is below min_ratio
 */
float levenshtein::normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio) {
    const bool first_is_empty = sentence1.empty();
    const bool second_is_empty = sentence2.empty();
    if (first_is_empty || second_is_empty) {
        return first_is_empty && second_is_empty;
    }

    const std::size_t sentence1_len = utils::joined_size(sentence1);
    const std::size_t sentence2_len = utils::joined_size(sentence2);

    // The length difference is a lower bound for the distance, so it yields an
    // upper bound for the ratio without running the actual algorithm.
    std::size_t max_len;
    std::size_t min_distance;
    if (sentence1_len > sentence2_len) {
        max_len = sentence1_len;
        min_distance = sentence1_len - sentence2_len;
    } else {
        max_len = sentence2_len;
        min_distance = sentence2_len - sentence1_len;
    }

    const float len_ratio = 1.0 - static_cast<float>(min_distance) / static_cast<float>(max_len);
    if (len_ratio < min_ratio) {
        return 0.0;
    }

    const std::size_t dist = distance(sentence1, sentence2);
    const float ratio = 1.0 - static_cast<float>(dist) / static_cast<float>(max_len);
    if (ratio >= min_ratio) {
        return ratio;
    }
    return 0.0;
}
/**
 * Levenshtein distance where insertion, deletion and substitution all cost 1.
 *
 * Only a single row of the cost matrix is kept in memory. The common prefix
 * and suffix are stripped before the calculation.
 *
 * @param sentence1 first string
 * @param sentence2 second string
 * @return number of single character edits between the two strings
 */
std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) {
    utils::remove_common_affix(sentence1, sentence2);

    // make sentence2 the shorter one, it determines the size of the cache row
    if (sentence1.size() < sentence2.size()) {
        std::swap(sentence1, sentence2);
    }

    if (sentence2.empty()) {
        return sentence1.length();
    }

    const std::size_t inner_len = sentence2.length();
    std::vector<std::size_t> cache(inner_len + 1);
    std::iota(cache.begin(), cache.end(), 0);

    for (const auto &char1 : sentence1) {
        // value of the cell diagonally before the one being computed
        std::size_t diagonal = cache[0];
        ++cache[0];

        for (std::size_t col = 0; col < inner_len; ++col) {
            const std::size_t substitution = diagonal + (char1 != sentence2[col]);
            const std::size_t best = std::min({
                cache[col] + 1,
                cache[col + 1] + 1,
                substitution
            });
            // the value about to be overwritten is the diagonal of the next cell
            diagonal = cache[col + 1];
            cache[col + 1] = best;
        }
    }

    return cache.back();
}
/**
 * Levenshtein distance with configurable costs per operation.
 *
 * The common prefix and suffix are stripped first. The shorter string is used
 * for the cache row; when the strings are swapped for that purpose, the
 * insertion and deletion costs are swapped along with them.
 *
 * @param sentence1 source string
 * @param sentence2 target string
 * @param insert_cost cost of inserting a character
 * @param delete_cost cost of deleting a character
 * @param replace_cost cost of replacing a character
 * @return the weighted edit distance
 */
std::size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2,
    std::size_t insert_cost, std::size_t delete_cost, std::size_t replace_cost)
{
    utils::remove_common_affix(sentence1, sentence2);

    if (sentence1.size() > sentence2.size()) {
        std::swap(sentence1, sentence2);
        std::swap(insert_cost, delete_cost);
    }

    const std::size_t inner_len = sentence1.size();

    // first row: removing the first i characters of sentence1
    std::vector<std::size_t> cache(inner_len + 1);
    cache[0] = 0;
    for (std::size_t i = 1; i <= inner_len; ++i) {
        cache[i] = cache[i - 1] + delete_cost;
    }

    for (const auto &char2 : sentence2) {
        // value of the cell diagonally before the one being computed
        std::size_t diagonal = cache[0];
        cache[0] += insert_cost;

        for (std::size_t i = 0; i < inner_len; ++i) {
            // equal characters take over the diagonal value unchanged
            std::size_t value = diagonal;
            if (sentence1[i] != char2) {
                value = std::min({
                    cache[i] + delete_cost,
                    cache[i + 1] + insert_cost,
                    diagonal + replace_cost
                });
            }
            diagonal = cache[i + 1];
            cache[i + 1] = value;
        }
    }

    return cache.back();
}

View File

@ -30,11 +30,9 @@ namespace levenshtein {
std::size_t matrix_rows;
};
template<typename CharT>
Matrix matrix(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
Matrix matrix(std::wstring_view sentence1, std::wstring_view sentence2);
template<typename CharT>
std::vector<EditOp> editops(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
std::vector<EditOp> editops(std::wstring_view sentence1, std::wstring_view sentence2);
struct MatchingBlock {
std::size_t first_start;
@ -44,8 +42,7 @@ namespace levenshtein {
: first_start(first_start), second_start(second_start), len(len) {}
};
template<typename CharT>
std::vector<MatchingBlock> matching_blocks(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2);
std::vector<MatchingBlock> matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2);
float normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio=0.0);
@ -53,8 +50,8 @@ namespace levenshtein {
std::size_t distance(std::wstring_view sentence1, std::wstring_view sentence2);
template<typename MaxDistanceCalc=std::false_type, typename CharT>
auto levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
template<typename MaxDistanceCalc=std::false_type>
auto levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector<std::wstring_view> &words,
std::vector<std::size_t> &cache, std::size_t current_cache);
/**
@ -75,20 +72,14 @@ namespace levenshtein {
* so when it can not exit early it should not be used
* @return weighted levenshtein distance
*/
template<typename CharT, typename MaxDistance=std::nullopt_t>
std::size_t weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MaxDistance max_distance=std::nullopt);
template<typename MaxDistance=std::nullopt_t>
std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MaxDistance max_distance=std::nullopt);
template<typename MaxDistance=std::nullopt_t>
std::size_t weighted_distance(std::string_view sentence1, std::string_view sentence2, MaxDistance max_distance=std::nullopt);
template<typename CharT, typename MaxDistance=std::nullopt_t>
std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance=std::nullopt);
std::size_t weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, MaxDistance max_distance=std::nullopt);
size_t generic_distance(std::wstring_view source, std::wstring_view target, size_t insert_cost = 1, size_t delete_cost = 1, size_t replace_cost = 1);
std::size_t generic_distance(std::wstring_view source, std::wstring_view target, std::size_t insert_cost = 1, std::size_t delete_cost = 1, std::size_t replace_cost = 1);
/**
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
@ -100,214 +91,8 @@ namespace levenshtein {
template<typename CharT>
inline levenshtein::Matrix levenshtein::matrix(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2) {
Affix affix = remove_common_affix(sentence1, sentence2);
std::size_t matrix_columns = sentence1.length() + 1;
std::size_t matrix_rows = sentence2.length() + 1;
std::vector<std::size_t> cache_matrix(matrix_rows*matrix_columns, 0);
for (std::size_t i = 0; i < matrix_rows; ++i) {
cache_matrix[i] = i;
}
for (std::size_t i = 1; i < matrix_columns; ++i) {
cache_matrix[matrix_rows*i] = i;
}
std::size_t sentence1_pos = 0;
for (const auto &char1 : sentence1) {
auto prev_cache = cache_matrix.begin() + sentence1_pos * matrix_rows;
auto result_cache = cache_matrix.begin() + (sentence1_pos + 1) * matrix_rows + 1;
std::size_t result = sentence1_pos + 1;
for (const auto &char2 : sentence2) {
result = std::min({
result + 1,
*prev_cache + (char1 != char2),
*(++prev_cache) + 1
});
*result_cache = result;
++result_cache;
}
++sentence1_pos;
}
return Matrix {
affix.prefix_len,
cache_matrix,
matrix_columns,
matrix_rows
};
}
template<typename CharT>
inline std::vector<levenshtein::EditOp>
levenshtein::editops(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2) {
auto m = matrix(sentence1, sentence2);
std::size_t matrix_columns = m.matrix_columns;
std::size_t matrix_rows = m.matrix_rows;
std::size_t prefix_len = m.prefix_len;
auto lev_matrix = m.matrix;
std::vector<EditOp> ops;
ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]);
std::size_t i = matrix_columns - 1;
std::size_t j = matrix_rows - 1;
std::size_t position = matrix_columns * matrix_rows - 1;
auto is_replace = [=](std::size_t pos) {
return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos];
};
auto is_insert = [=](std::size_t pos) {
return lev_matrix[pos - 1] < lev_matrix[pos];
};
auto is_delete = [=](std::size_t pos) {
return lev_matrix[pos - matrix_rows] < lev_matrix[pos];
};
auto is_keep = [=](std::size_t pos) {
return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos];
};
while (i > 0 || j > 0) {
EditType op_type;
if (i && j && is_replace(position)) {
op_type = EditType::EditReplace;
--i;
--j;
position -= matrix_rows + 1;
} else if (j && is_insert(position)) {
op_type = EditType::EditInsert;
--j;
--position;
} else if (i && is_delete(position)) {
op_type = EditType::EditDelete;
--i;
position -= matrix_rows;
} else if (is_keep(position)) {
--i;
--j;
position -= matrix_rows + 1;
// EditKeep does not has to be stored
continue;
} else {
throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
}
ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
}
std::reverse(ops.begin(), ops.end());
return ops;
}
template<typename CharT>
inline std::vector<levenshtein::MatchingBlock>
levenshtein::matching_blocks(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2) {
auto edit_ops = editops(sentence1, sentence2);
std::size_t first_start = 0;
std::size_t second_start = 0;
std::vector<MatchingBlock> mblocks;
for (const auto &op : edit_ops) {
if (op.op_type == EditType::EditKeep) {
continue;
}
if (first_start < op.first_start || second_start < op.second_start) {
mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
first_start = op.first_start;
second_start = op.second_start;
}
switch (op.op_type) {
case EditType::EditReplace:
first_start += 1;
second_start += 1;
break;
case EditType::EditDelete:
first_start += 1;
break;
case EditType::EditInsert:
second_start += 1;
break;
case EditType::EditKeep:
break;
}
}
mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
return mblocks;
}
inline float levenshtein::normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio) {
if (sentence1.empty() || sentence2.empty()) {
return sentence1.empty() && sentence2.empty();
}
std::size_t sentence1_len = utils::joined_size(sentence1);
std::size_t sentence2_len = utils::joined_size(sentence2);
std::size_t max_len = std::max(sentence1_len, sentence2_len);
// constant time calculation to find a string ratio based on the string length
// so it can exit early without running any levenshtein calculations
std::size_t min_distance = (sentence1_len > sentence2_len)
? sentence1_len - sentence2_len
: sentence2_len - sentence1_len;
float len_ratio = 1.0 - (float)min_distance / (float)max_len;
if (len_ratio < min_ratio) {
return 0.0;
}
std::size_t dist = distance(sentence1, sentence2);
float ratio = 1.0 - (float)dist / (float)max_len;
return (ratio >= min_ratio) ? ratio : 0.0;
}
inline std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) {
remove_common_affix(sentence1, sentence2);
if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);
if (sentence2.empty()) {
return sentence1.length();
}
std::vector<std::size_t> cache(sentence2.length()+1);
std::iota(cache.begin(), cache.end(), 0);
for (const auto &char1 : sentence1) {
auto cache_iter = cache.begin();
size_t temp = *cache_iter;
*cache_iter += 1;
for (const auto& char2 : sentence2) {
if (char1 != char2) {
++temp;
}
temp = std::min({
*cache_iter + 1,
*(++cache_iter) + 1,
temp
});
std::swap(*cache_iter, temp);
}
}
return cache.back();
}
template<typename MaxDistanceCalc, typename CharT>
inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
template<typename MaxDistanceCalc>
inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector<std::wstring_view> &words,
std::vector<std::size_t> &cache, std::size_t current_cache)
{
std::size_t result = current_cache + 1;
@ -315,7 +100,7 @@ inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const str
auto word_iter = words.begin();
auto min_distance = std::numeric_limits<std::size_t>::max();
auto charCmp = [&] (const CharT &char2) {
auto charCmp = [&] (const wchar_t &char2) {
if (letter_cmp == char2) { result = current_cache; }
else { ++result; }
@ -356,9 +141,9 @@ inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const str
}
template<typename CharT, typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance) {
remove_common_affix(sentence1, sentence2);
template<typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, MaxDistance max_distance) {
utils::remove_common_affix(sentence1, sentence2);
std::size_t sentence1_len = utils::joined_size(sentence1);
std::size_t sentence2_len = utils::joined_size(sentence2);
@ -380,7 +165,7 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentenc
// no delimiter in front of first word
for (const auto &letter : *word_iter) {
if constexpr(!std::is_same_v<MaxDistance, std::nullopt_t>) {
size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
std::size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
if (min_distance > max_distance) {
return std::numeric_limits<std::size_t>::max();
}
@ -395,19 +180,19 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentenc
for (; word_iter != sentence1.end(); ++word_iter) {
// whitespace between words
if constexpr(!std::is_same_v<MaxDistance, std::nullopt_t>) {
size_t min_distance = levenshtein_word_cmp<std::true_type>((CharT)0x20, sentence2, cache, range1_pos);
std::size_t min_distance = levenshtein_word_cmp<std::true_type>((wchar_t)0x20, sentence2, cache, range1_pos);
if (min_distance > max_distance) {
return std::numeric_limits<std::size_t>::max();
}
} else {
levenshtein_word_cmp((CharT)0x20, sentence2, cache, range1_pos);
levenshtein_word_cmp((wchar_t)0x20, sentence2, cache, range1_pos);
}
++range1_pos;
for (const auto &letter : *word_iter) {
if constexpr(!std::is_same_v<MaxDistance, std::nullopt_t>) {
size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
std::size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
if (min_distance > max_distance) {
return std::numeric_limits<std::size_t>::max();
}
@ -425,20 +210,7 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentenc
template<typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MaxDistance max_distance) {
return weighted_distance_impl(sentence1, sentence2, max_distance);
}
template<typename MaxDistance>
inline std::size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, MaxDistance max_distance) {
return weighted_distance_impl(sentence1, sentence2, max_distance);
}
template<typename CharT, typename MaxDistance>
inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MaxDistance max_distance) {
remove_common_affix(sentence1, sentence2);
utils::remove_common_affix(sentence1, sentence2);
if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);
@ -488,43 +260,7 @@ inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<Ch
}
inline size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2,
size_t insert_cost, size_t delete_cost, size_t replace_cost)
{
remove_common_affix(sentence1, sentence2);
if (sentence1.size() > sentence2.size()) {
std::swap(sentence1, sentence2);
std::swap(insert_cost, delete_cost);
}
const size_t min_size = sentence1.size();
std::vector<size_t> cache(sentence1.size() + 1);
cache[0] = 0;
for (size_t i = 1; i < cache.size(); ++i) {
cache[i] = cache[i - 1] + delete_cost;
}
for (const auto &char2 : sentence2) {
auto cache_iter = cache.begin();
size_t temp = *cache_iter;
*cache_iter += insert_cost;
for (const auto &char1 : sentence1) {
if (char1 != char2) {
temp = std::min({
*cache_iter + delete_cost,
*(cache_iter+1) + insert_cost,
temp + replace_cost
});
}
++cache_iter;
std::swap(*cache_iter, temp);
}
}
return cache.back();
}
template<typename Sentence1, typename Sentence2>
@ -533,6 +269,7 @@ inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence
if (sentence1.empty() || sentence2.empty()) {
return sentence1.empty() && sentence2.empty();
}
return 1;
std::size_t sentence1_len = utils::joined_size(sentence1);
std::size_t sentence2_len = utils::joined_size(sentence2);

151
cpp/src/utils.cpp Normal file
View File

@ -0,0 +1,151 @@
#include "utils.hpp"

#include <cwctype>
/**
 * Counts the leading elements that two ranges have in common.
 *
 * @return the distance from first1 to the first position at which the ranges
 *         differ (or at which one of them ends)
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
                                 InputIterator2 first2, InputIterator2 last2)
{
    const auto first_difference = std::mismatch(first1, last1, first2, last2);
    return std::distance(first1, first_difference.first);
}
/**
 * Strips the prefix shared by both string views from each of them.
 *
 * @param a first view, shortened in place
 * @param b second view, shortened in place
 * @return number of characters removed from the front of each view
 */
std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) {
    const auto shared_len = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());

    a.remove_prefix(shared_len);
    b.remove_prefix(shared_len);

    return shared_len;
}
/**
 * Strips the suffix shared by both string views from each of them.
 *
 * @param a first view, shortened in place
 * @param b second view, shortened in place
 * @return number of characters removed from the back of each view
 */
std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) {
    // a common suffix is a common prefix of the reversed ranges
    const auto shared_len = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());

    a.remove_suffix(shared_len);
    b.remove_suffix(shared_len);

    return shared_len;
}
/**
 * Strips the shared prefix and then the shared suffix from both string views.
 *
 * @param a first view, shortened in place
 * @param b second view, shortened in place
 * @return the lengths of the removed prefix and suffix
 */
Affix utils::remove_common_affix(std::wstring_view& a, std::wstring_view& b) {
    // the prefix has to be removed first so that the suffix search
    // can not count the same characters a second time
    const std::size_t prefix_len = remove_common_prefix(a, b);
    const std::size_t suffix_len = remove_common_suffix(a, b);

    return Affix { prefix_len, suffix_len };
}
/**
 * Erases the leading and trailing elements that two containers have in common.
 *
 * @param a first container, modified in place
 * @param b second container, modified in place
 */
template<typename T>
void vec_remove_common_affix(T &a, T &b) {
    // shared leading elements
    const auto [diff_a, diff_b] = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
    a.erase(a.begin(), diff_a);
    b.erase(b.begin(), diff_b);

    // shared trailing elements of what is left
    const auto shared_tail = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
    a.erase(a.end() - shared_tail, a.end());
    b.erase(b.end() - shared_tail, b.end());
}
/**
 * Strips the common affix of two word lists.
 *
 * Whole words shared at the front and back are erased first. If both lists
 * still contain words afterwards, the shared leading characters of the two
 * first words and the shared trailing characters of the two last words are
 * removed as well.
 *
 * @param a first word list, modified in place
 * @param b second word list, modified in place
 */
void utils::remove_common_affix(std::vector<std::wstring_view> &a, std::vector<std::wstring_view> &b)
{
    vec_remove_common_affix(a, b);

    if (a.empty() || b.empty()) {
        return;
    }

    remove_common_prefix(a.front(), b.front());
    remove_common_suffix(a.back(), b.back());
}
/**
 * Concatenates the words of a sentence, separated by a single space (0x20).
 *
 * The final size is computed up front so the result is allocated once, and
 * the views are appended directly instead of going through a temporary
 * std::wstring per word.
 *
 * @param sentence the words to join
 * @return the joined string, empty when there are no words
 */
std::wstring utils::join(const std::vector<std::wstring_view> &sentence) {
    std::wstring result;
    if (sentence.empty()) {
        return result;
    }

    // one separator between each pair of neighbouring words
    std::size_t total_size = sentence.size() - 1;
    for (const auto &word : sentence) {
        total_size += word.size();
    }
    result.reserve(total_size);

    auto sentence_iter = sentence.begin();
    result.append(*sentence_iter);
    for (++sentence_iter; sentence_iter != sentence.end(); ++sentence_iter) {
        result.push_back(L' ');
        result.append(*sentence_iter);
    }
    return result;
}
/**
 * Applies a score cutoff to a result.
 *
 * @param result the score to check
 * @param score_cutoff minimum score that is still reported
 * @return result when it reaches score_cutoff, otherwise 0
 */
percent utils::result_cutoff(float result, percent score_cutoff) {
    if (result >= score_cutoff) {
        return result;
    }
    return 0;
}
// trim from start (in place)
// Uses the wide character classification function: passing a wchar_t value
// that does not fit into unsigned char to std::isspace is undefined behavior.
void ltrim(std::wstring &s) {
    const auto first_kept = std::find_if(s.begin(), s.end(), [](wchar_t ch) {
        return !std::iswspace(static_cast<std::wint_t>(ch));
    });
    s.erase(s.begin(), first_kept);
}
// trim from end (in place)
// Uses the wide character classification function: passing a wchar_t value
// that does not fit into unsigned char to std::isspace is undefined behavior.
void rtrim(std::wstring &s) {
    const auto last_kept = std::find_if(s.rbegin(), s.rend(), [](wchar_t ch) {
        return !std::iswspace(static_cast<std::wint_t>(ch));
    });
    // base() points one past the last character that is kept
    s.erase(last_kept.base(), s.end());
}
// strip whitespace from the back and the front of the string (in place)
void utils::trim(std::wstring &s) {
    // the two steps do not affect each other, so their order does not matter
    rtrim(s);
    ltrim(s);
}
/**
 * Converts every character of the string to lower case (in place).
 *
 * Uses the wide character conversion function: passing a wchar_t value that
 * does not fit into unsigned char to ::tolower is undefined behavior.
 *
 * @param s string to convert
 */
void utils::lower_case(std::wstring &s) {
    for (wchar_t &c : s) {
        c = static_cast<wchar_t>(std::towlower(static_cast<std::wint_t>(c)));
    }
}
/**
 * Default preprocessing of a string: trims it and converts it to lower case.
 *
 * @param s string to process (taken by value, the caller's string is untouched)
 * @return the processed copy
 */
std::wstring utils::default_process(std::wstring s) {
    utils::trim(s);
    utils::lower_case(s);
    return s;
}
/**
 * Splits two word lists into the words they share and the words that only
 * occur in one of them.
 *
 * Adjacent duplicates are removed from both lists first.
 * NOTE(review): std::unique only removes neighbouring duplicates, so this
 * presumably expects both lists to be sorted by the caller - confirm.
 *
 * @param a first word list (taken by value)
 * @param b second word list (taken by value)
 * @return the intersection, the words only in a, and the words only in b
 */
DecomposedSet utils::set_decomposition(std::vector<std::wstring_view> a, std::vector<std::wstring_view> b) {
    std::vector<std::wstring_view> intersection;
    std::vector<std::wstring_view> difference_ab;

    a.erase(std::unique(a.begin(), a.end()), a.end());
    b.erase(std::unique(b.begin(), b.end()), b.end());

    for (const auto &current_a : a) {
        auto element_b = std::find(b.begin(), b.end(), current_a);
        if (element_b != b.end()) {
            // shared word: take it out of b so that b ends up as the difference b \ a
            b.erase(element_b);
            intersection.emplace_back(current_a);
        } else {
            difference_ab.emplace_back(current_a);
        }
    }

    // the local vectors are not needed anymore, so hand them over instead of copying
    return DecomposedSet{std::move(intersection), std::move(difference_ab), std::move(b)};
}
/**
 * Size of a single string; counterpart of the overload for word lists.
 *
 * @param x the string
 * @return number of characters in x
 */
std::size_t utils::joined_size(const std::wstring_view &x){
    return x.length();
}
/**
 * Size a word list would have when joined with one whitespace between words.
 *
 * @param x the word list
 * @return sum of all word sizes plus one separator per gap, 0 for an empty list
 */
std::size_t utils::joined_size(const std::vector<std::wstring_view> &x){
    std::size_t character_count = 0;
    for (const auto &word : x) {
        character_count += word.size();
    }

    // n words are separated by n - 1 whitespaces
    return x.empty() ? 0 : character_count + (x.size() - 1);
}

View File

@ -7,54 +7,34 @@
using percent = float;
template<typename CharT>
using string_view_vec = std::vector<std::basic_string_view<CharT>>;
namespace detail {
template<typename T>
auto char_type(T const*) -> T;
template<typename T, typename U = typename T::const_iterator>
auto char_type(T const&) -> typename std::iterator_traits<U>::value_type;
}
template<typename T>
using char_type = decltype(detail::char_type(std::declval<T const&>()));
template<typename CharT>
struct DecomposedSet {
string_view_vec<CharT> intersection;
string_view_vec<CharT> difference_ab;
string_view_vec<CharT> difference_ba;
DecomposedSet(string_view_vec<CharT> intersection, string_view_vec<CharT> difference_ab, string_view_vec<CharT> difference_ba)
std::vector<std::wstring_view> intersection;
std::vector<std::wstring_view> difference_ab;
std::vector<std::wstring_view> difference_ba;
DecomposedSet(std::vector<std::wstring_view> intersection, std::vector<std::wstring_view> difference_ab, std::vector<std::wstring_view> difference_ba)
: intersection(std::move(intersection)), difference_ab(std::move(difference_ab)), difference_ba(std::move(difference_ba)) {}
};
struct Affix {
std::size_t prefix_len;
std::size_t suffix_len;
};
namespace utils {
template<
typename T, typename CharT = char_type<T>,
typename = std::enable_if_t<std::is_convertible<T const&, std::basic_string_view<CharT>>{}>
>
string_view_vec<CharT> splitSV(const T &str);
template<typename CharT>
DecomposedSet<CharT> set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b);
template<typename T>
std::size_t joined_size(const T &x);
std::vector<std::wstring_view> splitSV(const T &str);
template<typename T>
std::size_t joined_size(const std::vector<T> &x);
DecomposedSet set_decomposition(std::vector<std::wstring_view> a, std::vector<std::wstring_view> b);
template<typename CharT>
std::basic_string<CharT> join(const string_view_vec<CharT> &sentence);
std::size_t joined_size(const std::wstring_view &x);
std::size_t joined_size(const std::vector<std::wstring_view> &x);
std::wstring join(const std::vector<std::wstring_view> &sentence);
percent result_cutoff(float result, percent score_cutoff);
@ -62,12 +42,16 @@ namespace utils {
void lower_case(std::wstring &s);
std::wstring default_process(std::wstring s);
Affix remove_common_affix(std::wstring_view& a, std::wstring_view& b);
void remove_common_affix(std::vector<std::wstring_view> &a, std::vector<std::wstring_view> &b);
}
template<typename T, typename CharT, typename>
string_view_vec<CharT> utils::splitSV(const T &str) {
string_view_vec<CharT> output;
template<typename T>
inline std::vector<std::wstring_view> utils::splitSV(const T &str) {
std::vector<std::wstring_view> output;
// assume a word length of 6 + 1 whitespace
output.reserve(str.size() / 7);
@ -82,178 +66,3 @@ string_view_vec<CharT> utils::splitSV(const T &str) {
return output;
}
template<typename CharT>
DecomposedSet<CharT> utils::set_decomposition(string_view_vec<CharT> a, string_view_vec<CharT> b) {
string_view_vec<CharT> intersection;
string_view_vec<CharT> difference_ab;
a.erase(std::unique(a.begin(), a.end()), a.end());
b.erase(std::unique(b.begin(), b.end()), b.end());
for (const auto &current_a : a) {
auto element_b = std::find(b.begin(), b.end(), current_a);
if (element_b != b.end()) {
b.erase(element_b);
intersection.emplace_back(current_a);
} else {
difference_ab.emplace_back(current_a);
}
}
return DecomposedSet{intersection, difference_ab, b};
}
/**
* Finds the longest common prefix between two ranges
*/
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2)
{
return std::distance(first1, std::mismatch(first1, last1, first2, last2).first);
}
/**
* Removes common prefix of two string views
*/
template<typename CharT>
inline std::size_t remove_common_prefix(std::basic_string_view<CharT>& a, std::basic_string_view<CharT>& b) {
auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
a.remove_prefix(prefix);
b.remove_prefix(prefix);
return prefix;
}
/**
* Removes common suffix of two string views
*/
template<typename CharT>
inline std::size_t remove_common_suffix(std::basic_string_view<CharT>& a, std::basic_string_view<CharT>& b) {
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
a.remove_suffix(suffix);
b.remove_suffix(suffix);
return suffix;
}
struct Affix {
std::size_t prefix_len;
std::size_t suffix_len;
};
/**
* Removes common affix of two string views
*/
template<typename CharT>
inline Affix remove_common_affix(std::basic_string_view<CharT>& a, std::basic_string_view<CharT>& b) {
return Affix {
remove_common_prefix(a, b),
remove_common_suffix(a, b)
};
}
template<typename T>
inline void vec_remove_common_affix(T &a, T &b) {
auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
a.erase(a.begin(), prefix.first);
b.erase(b.begin(), prefix.second);
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
a.erase(a.end()-suffix, a.end());
b.erase(b.end()-suffix, b.end());
}
template<typename T>
inline void vec_common_affix(std::vector<T> &a, std::vector<T> &b) {
iterable_remove_common_affix(a, b);
}
template<typename T>
inline void remove_common_affix(std::vector<T> &a, std::vector<T> &b)
{
vec_remove_common_affix(a, b);
if (!a.empty() && !b.empty()) {
remove_common_prefix(a.front(), b.front());
remove_common_suffix(a.back(), b.back());
}
}
template<typename T>
inline std::size_t utils::joined_size(const T &x){
return x.size();
}
template<typename T>
inline std::size_t utils::joined_size(const std::vector<T> &x){
if (x.empty()) {
return 0;
}
// there is a whitespace between each word
std::size_t result = x.size() - 1;
for (const auto &y: x) result += y.size();
return result;
}
template<typename CharT>
std::basic_string<CharT> utils::join(const string_view_vec<CharT> &sentence) {
if (sentence.empty()) {
return std::basic_string<CharT>();
}
auto sentence_iter = sentence.begin();
std::basic_string<CharT> result {*sentence_iter};
const std::basic_string<CharT> whitespace {0x20};
++sentence_iter;
for (; sentence_iter != sentence.end(); ++sentence_iter) {
result.append(whitespace).append(std::basic_string<CharT> {*sentence_iter});
}
return result;
}
inline percent utils::result_cutoff(float result, percent score_cutoff) {
return (result >= score_cutoff) ? result : 0;
}
// trim from start (in place)
inline void ltrim(std::wstring &s) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
return !std::isspace(ch);
}));
}
// trim from end (in place)
inline void rtrim(std::wstring &s) {
s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
return !std::isspace(ch);
}).base(), s.end());
}
// trim from both ends (in place)
inline void utils::trim(std::wstring &s) {
ltrim(s);
rtrim(s);
}
inline void utils::lower_case(std::wstring &s) {
std::for_each(s.begin(), s.end(), [](wchar_t & c){
c = ::tolower(c);
});
}
inline std::wstring utils::default_process(std::wstring s) {
trim(s);
lower_case(s);
return s;
}

View File

@ -34,7 +34,9 @@ ext_modules = [
[
'python/src/rapidfuzz.cpp',
'cpp/src/fuzz.cpp',
'cpp/src/process.cpp'
'cpp/src/process.cpp',
'cpp/src/levenshtein.cpp',
'cpp/src/utils.cpp'
],
include_dirs=[
# Path to pybind11 headers