From 028db547d1f455af5251dfc305f6425359688199 Mon Sep 17 00:00:00 2001 From: maxbachmann Date: Tue, 31 Mar 2020 15:16:03 +0200 Subject: [PATCH] reduce template usage to a minimum --- .gitignore | 1 + cpp/src/levenshtein.cpp | 241 ++++++++++++++++++++++++++++++++ cpp/src/levenshtein.hpp | 301 +++------------------------------------- cpp/src/utils.cpp | 151 ++++++++++++++++++++ cpp/src/utils.hpp | 239 ++++--------------------------- setup.py | 4 +- 6 files changed, 439 insertions(+), 498 deletions(-) create mode 100644 cpp/src/levenshtein.cpp create mode 100644 cpp/src/utils.cpp diff --git a/.gitignore b/.gitignore index 0319093..6c82153 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .vscode/ __pycache__/ .idea/ +build/ rapidfuzz.egg-info/ dist/ *.data diff --git a/cpp/src/levenshtein.cpp b/cpp/src/levenshtein.cpp new file mode 100644 index 0000000..b6eff0f --- /dev/null +++ b/cpp/src/levenshtein.cpp @@ -0,0 +1,241 @@ +#include "levenshtein.hpp" + +levenshtein::Matrix levenshtein::matrix(std::wstring_view sentence1, std::wstring_view sentence2) { + Affix affix = utils::remove_common_affix(sentence1, sentence2); + + std::size_t matrix_columns = sentence1.length() + 1; + std::size_t matrix_rows = sentence2.length() + 1; + + std::vector cache_matrix(matrix_rows*matrix_columns, 0); + + for (std::size_t i = 0; i < matrix_rows; ++i) { + cache_matrix[i] = i; + } + + for (std::size_t i = 1; i < matrix_columns; ++i) { + cache_matrix[matrix_rows*i] = i; + } + + std::size_t sentence1_pos = 0; + for (const auto &char1 : sentence1) { + auto prev_cache = cache_matrix.begin() + sentence1_pos * matrix_rows; + auto result_cache = cache_matrix.begin() + (sentence1_pos + 1) * matrix_rows + 1; + std::size_t result = sentence1_pos + 1; + for (const auto &char2 : sentence2) { + result = std::min({ + result + 1, + *prev_cache + (char1 != char2), + *(++prev_cache) + 1 + }); + *result_cache = result; + ++result_cache; + } + ++sentence1_pos; + } + + return Matrix { + affix.prefix_len, + cache_matrix, + matrix_columns, + matrix_rows + }; +} + + +std::vector levenshtein::editops(std::wstring_view sentence1, std::wstring_view sentence2) { + auto m = matrix(sentence1, sentence2); + std::size_t matrix_columns = m.matrix_columns; + std::size_t matrix_rows = m.matrix_rows; + std::size_t prefix_len = m.prefix_len; + auto lev_matrix = m.matrix; + + std::vector ops; + ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]); + + std::size_t i = matrix_columns - 1; + std::size_t j = matrix_rows - 1; + std::size_t position = matrix_columns * matrix_rows - 1; + + auto is_replace = [=](std::size_t pos) { + return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos]; + }; + auto is_insert = [=](std::size_t pos) { + return lev_matrix[pos - 1] < lev_matrix[pos]; + }; + auto is_delete = [=](std::size_t pos) { + return lev_matrix[pos - matrix_rows] < lev_matrix[pos]; + }; + auto is_keep = [=](std::size_t pos) { + return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos]; + }; + + while (i > 0 || j > 0) { + EditType op_type; + + if (i && j && is_replace(position)) { + op_type = EditType::EditReplace; + --i; + --j; + position -= matrix_rows + 1; + } else if (j && is_insert(position)) { + op_type = EditType::EditInsert; + --j; + --position; + } else if (i && is_delete(position)) { + op_type = EditType::EditDelete; + --i; + position -= matrix_rows; + } else if (is_keep(position)) { + --i; + --j; + position -= matrix_rows + 1; + // EditKeep does not has to be stored + continue; + } else { + throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix"); + } + + ops.emplace_back(op_type, i + prefix_len, j + prefix_len); + } + + std::reverse(ops.begin(), ops.end()); + return ops; +} + + +std::vector levenshtein::matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2) { + auto edit_ops = editops(sentence1, sentence2); + std::size_t first_start = 0; + std::size_t second_start = 0; + std::vector mblocks; + + for (const auto &op : edit_ops) { + if (op.op_type == EditType::EditKeep) { + continue; + } + + if (first_start < op.first_start || second_start < op.second_start) { + mblocks.emplace_back(first_start, second_start, op.first_start - first_start); + first_start = op.first_start; + second_start = op.second_start; + } + + switch (op.op_type) { + case EditType::EditReplace: + first_start += 1; + second_start += 1; + break; + case EditType::EditDelete: + first_start += 1; + break; + case EditType::EditInsert: + second_start += 1; + break; + case EditType::EditKeep: + break; + } + } + + mblocks.emplace_back(sentence1.length(), sentence2.length(), 0); + return mblocks; +} + + +float levenshtein::normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio) { + if (sentence1.empty() || sentence2.empty()) { + return sentence1.empty() && sentence2.empty(); + } + + std::size_t sentence1_len = utils::joined_size(sentence1); + std::size_t sentence2_len = utils::joined_size(sentence2); + std::size_t max_len = std::max(sentence1_len, sentence2_len); + + // constant time calculation to find a string ratio based on the string length + // so it can exit early without running any levenshtein calculations + std::size_t min_distance = (sentence1_len > sentence2_len) + ? sentence1_len - sentence2_len + : sentence2_len - sentence1_len; + + float len_ratio = 1.0 - (float)min_distance / (float)max_len; + if (len_ratio < min_ratio) { + return 0.0; + } + + std::size_t dist = distance(sentence1, sentence2); + + float ratio = 1.0 - (float)dist / (float)max_len; + return (ratio >= min_ratio) ? ratio : 0.0; +} + + +std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) { + + utils::remove_common_affix(sentence1, sentence2); + + if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2); + + if (sentence2.empty()) { + return sentence1.length(); + } + + std::vector cache(sentence2.length()+1); + std::iota(cache.begin(), cache.end(), 0); + + for (const auto &char1 : sentence1) { + auto cache_iter = cache.begin(); + std::size_t temp = *cache_iter; + *cache_iter += 1; + + for (const auto& char2 : sentence2) { + if (char1 != char2) { + ++temp; + } + + temp = std::min({ + *cache_iter + 1, + *(++cache_iter) + 1, + temp + }); + std::swap(*cache_iter, temp); + } + } + return cache.back(); +} + + +std::size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2, + std::size_t insert_cost, std::size_t delete_cost, std::size_t replace_cost) +{ + utils::remove_common_affix(sentence1, sentence2); + if (sentence1.size() > sentence2.size()) { + std::swap(sentence1, sentence2); + std::swap(insert_cost, delete_cost); + } + + std::vector cache(sentence1.size() + 1); + + cache[0] = 0; + for (std::size_t i = 1; i < cache.size(); ++i) { + cache[i] = cache[i - 1] + delete_cost; + } + + for (const auto &char2 : sentence2) { + auto cache_iter = cache.begin(); + std::size_t temp = *cache_iter; + *cache_iter += insert_cost; + + for (const auto &char1 : sentence1) { + if (char1 != char2) { + temp = std::min({ + *cache_iter + delete_cost, + *(cache_iter+1) + insert_cost, + temp + replace_cost + }); + } + ++cache_iter; + std::swap(*cache_iter, temp); + } + } + + return cache.back(); +} diff --git a/cpp/src/levenshtein.hpp b/cpp/src/levenshtein.hpp index 5828e2a..658172a 100644 --- a/cpp/src/levenshtein.hpp +++ b/cpp/src/levenshtein.hpp @@ -30,11 +30,9 @@ namespace levenshtein { std::size_t matrix_rows; }; - template - Matrix matrix(std::basic_string_view sentence1, std::basic_string_view sentence2); + Matrix matrix(std::wstring_view sentence1, std::wstring_view sentence2); - template - std::vector editops(std::basic_string_view sentence1, std::basic_string_view sentence2); + std::vector editops(std::wstring_view sentence1, std::wstring_view sentence2); struct MatchingBlock { std::size_t first_start; @@ -44,8 +42,7 @@ namespace levenshtein { : first_start(first_start), second_start(second_start), len(len) {} }; - template - std::vector matching_blocks(std::basic_string_view sentence1, std::basic_string_view sentence2); + std::vector matching_blocks(std::wstring_view sentence1, std::wstring_view sentence2); float normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio=0.0); @@ -53,8 +50,8 @@ namespace levenshtein { std::size_t distance(std::wstring_view sentence1, std::wstring_view sentence2); - template - auto levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec &words, + template + auto levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector &words, std::vector &cache, std::size_t current_cache); /** @@ -75,20 +72,14 @@ namespace levenshtein { * so when it can not exit early it should not be used * @return weighted levenshtein distance */ - template - std::size_t weighted_distance_impl(std::basic_string_view sentence1, std::basic_string_view sentence2, MaxDistance max_distance=std::nullopt); - template std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MaxDistance max_distance=std::nullopt); template - std::size_t weighted_distance(std::string_view sentence1, std::string_view sentence2, MaxDistance max_distance=std::nullopt); - - template - std::size_t weighted_distance(string_view_vec sentence1, string_view_vec sentence2, MaxDistance max_distance=std::nullopt); + std::size_t weighted_distance(std::vector sentence1, std::vector sentence2, MaxDistance max_distance=std::nullopt); - size_t generic_distance(std::wstring_view source, std::wstring_view target, size_t insert_cost = 1, size_t delete_cost = 1, size_t replace_cost = 1); + std::size_t generic_distance(std::wstring_view source, std::wstring_view target, std::size_t insert_cost = 1, std::size_t delete_cost = 1, std::size_t replace_cost = 1); /** * Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and @@ -100,214 +91,8 @@ namespace levenshtein { -template -inline levenshtein::Matrix levenshtein::matrix(std::basic_string_view sentence1, std::basic_string_view sentence2) { - Affix affix = remove_common_affix(sentence1, sentence2); - - std::size_t matrix_columns = sentence1.length() + 1; - std::size_t matrix_rows = sentence2.length() + 1; - - std::vector cache_matrix(matrix_rows*matrix_columns, 0); - - for (std::size_t i = 0; i < matrix_rows; ++i) { - cache_matrix[i] = i; - } - - for (std::size_t i = 1; i < matrix_columns; ++i) { - cache_matrix[matrix_rows*i] = i; - } - - std::size_t sentence1_pos = 0; - for (const auto &char1 : sentence1) { - auto prev_cache = cache_matrix.begin() + sentence1_pos * matrix_rows; - auto result_cache = cache_matrix.begin() + (sentence1_pos + 1) * matrix_rows + 1; - std::size_t result = sentence1_pos + 1; - for (const auto &char2 : sentence2) { - result = std::min({ - result + 1, - *prev_cache + (char1 != char2), - *(++prev_cache) + 1 - }); - *result_cache = result; - ++result_cache; - } - ++sentence1_pos; - } - - return Matrix { - affix.prefix_len, - cache_matrix, - matrix_columns, - matrix_rows - }; -} - - - -template -inline std::vector -levenshtein::editops(std::basic_string_view sentence1, std::basic_string_view sentence2) { - auto m = matrix(sentence1, sentence2); - std::size_t matrix_columns = m.matrix_columns; - std::size_t matrix_rows = m.matrix_rows; - std::size_t prefix_len = m.prefix_len; - auto lev_matrix = m.matrix; - - std::vector ops; - ops.reserve(lev_matrix[matrix_columns * matrix_rows - 1]); - - std::size_t i = matrix_columns - 1; - std::size_t j = matrix_rows - 1; - std::size_t position = matrix_columns * matrix_rows - 1; - - auto is_replace = [=](std::size_t pos) { - return lev_matrix[pos - matrix_rows - 1] < lev_matrix[pos]; - }; - auto is_insert = [=](std::size_t pos) { - return lev_matrix[pos - 1] < lev_matrix[pos]; - }; - auto is_delete = [=](std::size_t pos) { - return lev_matrix[pos - matrix_rows] < lev_matrix[pos]; - }; - auto is_keep = [=](std::size_t pos) { - return lev_matrix[pos - matrix_rows - 1] == lev_matrix[pos]; - }; - - while (i > 0 || j > 0) { - EditType op_type; - - if (i && j && is_replace(position)) { - op_type = EditType::EditReplace; - --i; - --j; - position -= matrix_rows + 1; - } else if (j && is_insert(position)) { - op_type = EditType::EditInsert; - --j; - --position; - } else if (i && is_delete(position)) { - op_type = EditType::EditDelete; - --i; - position -= matrix_rows; - } else if (is_keep(position)) { - --i; - --j; - position -= matrix_rows + 1; - // EditKeep does not has to be stored - continue; - } else { - throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix"); - } - - ops.emplace_back(op_type, i + prefix_len, j + prefix_len); - } - - std::reverse(ops.begin(), ops.end()); - return ops; -} - - -template -inline std::vector -levenshtein::matching_blocks(std::basic_string_view sentence1, std::basic_string_view sentence2) { - auto edit_ops = editops(sentence1, sentence2); - std::size_t first_start = 0; - std::size_t second_start = 0; - std::vector mblocks; - - for (const auto &op : edit_ops) { - if (op.op_type == EditType::EditKeep) { - continue; - } - - if (first_start < op.first_start || second_start < op.second_start) { - mblocks.emplace_back(first_start, second_start, op.first_start - first_start); - first_start = op.first_start; - second_start = op.second_start; - } - - switch (op.op_type) { - case EditType::EditReplace: - first_start += 1; - second_start += 1; - break; - case EditType::EditDelete: - first_start += 1; - break; - case EditType::EditInsert: - second_start += 1; - break; - case EditType::EditKeep: - break; - } - } - - mblocks.emplace_back(sentence1.length(), sentence2.length(), 0); - return mblocks; -} - -inline float levenshtein::normalized_distance(std::wstring_view sentence1, std::wstring_view sentence2, float min_ratio) { - if (sentence1.empty() || sentence2.empty()) { - return sentence1.empty() && sentence2.empty(); - } - - std::size_t sentence1_len = utils::joined_size(sentence1); - std::size_t sentence2_len = utils::joined_size(sentence2); - std::size_t max_len = std::max(sentence1_len, sentence2_len); - - // constant time calculation to find a string ratio based on the string length - // so it can exit early without running any levenshtein calculations - std::size_t min_distance = (sentence1_len > sentence2_len) - ? sentence1_len - sentence2_len - : sentence2_len - sentence1_len; - - float len_ratio = 1.0 - (float)min_distance / (float)max_len; - if (len_ratio < min_ratio) { - return 0.0; - } - - std::size_t dist = distance(sentence1, sentence2); - - float ratio = 1.0 - (float)dist / (float)max_len; - return (ratio >= min_ratio) ? ratio : 0.0; -} - -inline std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) { - - remove_common_affix(sentence1, sentence2); - - if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2); - - if (sentence2.empty()) { - return sentence1.length(); - } - - std::vector cache(sentence2.length()+1); - std::iota(cache.begin(), cache.end(), 0); - - for (const auto &char1 : sentence1) { - auto cache_iter = cache.begin(); - size_t temp = *cache_iter; - *cache_iter += 1; - - for (const auto& char2 : sentence2) { - if (char1 != char2) { - ++temp; - } - - temp = std::min({ - *cache_iter + 1, - *(++cache_iter) + 1, - temp - }); - std::swap(*cache_iter, temp); - } - } - return cache.back(); -} - -template -inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec &words, +template +inline auto levenshtein::levenshtein_word_cmp(const wchar_t &letter_cmp, const std::vector &words, std::vector &cache, std::size_t current_cache) { std::size_t result = current_cache + 1; @@ -315,7 +100,7 @@ inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const str auto word_iter = words.begin(); auto min_distance = std::numeric_limits::max(); - auto charCmp = [&] (const CharT &char2) { + auto charCmp = [&] (const wchar_t &char2) { if (letter_cmp == char2) { result = current_cache; } else { ++result; } @@ -356,9 +141,9 @@ inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const str } -template -inline std::size_t levenshtein::weighted_distance(string_view_vec sentence1, string_view_vec sentence2, MaxDistance max_distance) { - remove_common_affix(sentence1, sentence2); +template +inline std::size_t levenshtein::weighted_distance(std::vector sentence1, std::vector sentence2, MaxDistance max_distance) { + utils::remove_common_affix(sentence1, sentence2); std::size_t sentence1_len = utils::joined_size(sentence1); std::size_t sentence2_len = utils::joined_size(sentence2); @@ -380,7 +165,7 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec sentenc // no delimiter in front of first word for (const auto &letter : *word_iter) { if constexpr(!std::is_same_v) { - size_t min_distance = levenshtein_word_cmp(letter, sentence2, cache, range1_pos); + std::size_t min_distance = levenshtein_word_cmp(letter, sentence2, cache, range1_pos); if (min_distance > max_distance) { return std::numeric_limits::max(); } @@ -395,19 +180,19 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec sentenc for (; word_iter != sentence1.end(); ++word_iter) { // whitespace between words if constexpr(!std::is_same_v) { - size_t min_distance = levenshtein_word_cmp((CharT)0x20, sentence2, cache, range1_pos); + std::size_t min_distance = levenshtein_word_cmp((wchar_t)0x20, sentence2, cache, range1_pos); if (min_distance > max_distance) { return std::numeric_limits::max(); } } else { - levenshtein_word_cmp((CharT)0x20, sentence2, cache, range1_pos); + levenshtein_word_cmp((wchar_t)0x20, sentence2, cache, range1_pos); } ++range1_pos; for (const auto &letter : *word_iter) { if constexpr(!std::is_same_v) { - size_t min_distance = levenshtein_word_cmp(letter, sentence2, cache, range1_pos); + std::size_t min_distance = levenshtein_word_cmp(letter, sentence2, cache, range1_pos); if (min_distance > max_distance) { return std::numeric_limits::max(); } @@ -425,20 +210,7 @@ inline std::size_t levenshtein::weighted_distance(string_view_vec sentenc template inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MaxDistance max_distance) { - return weighted_distance_impl(sentence1, sentence2, max_distance); -} - - -template -inline std::size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, MaxDistance max_distance) { - return weighted_distance_impl(sentence1, sentence2, max_distance); -} - - -template -inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view sentence1, std::basic_string_view sentence2, MaxDistance max_distance) { - - remove_common_affix(sentence1, sentence2); + utils::remove_common_affix(sentence1, sentence2); if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2); @@ -488,43 +260,7 @@ inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view sentence2.size()) { - std::swap(sentence1, sentence2); - std::swap(insert_cost, delete_cost); - } - const size_t min_size = sentence1.size(); - std::vector cache(sentence1.size() + 1); - - cache[0] = 0; - for (size_t i = 1; i < cache.size(); ++i) { - cache[i] = cache[i - 1] + delete_cost; - } - - for (const auto &char2 : sentence2) { - auto cache_iter = cache.begin(); - size_t temp = *cache_iter; - *cache_iter += insert_cost; - - for (const auto &char1 : sentence1) { - if (char1 != char2) { - temp = std::min({ - *cache_iter + delete_cost, - *(cache_iter+1) + insert_cost, - temp + replace_cost - }); - } - ++cache_iter; - std::swap(*cache_iter, temp); - } - } - - return cache.back(); -} template @@ -533,6 +269,7 @@ inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence if (sentence1.empty() || sentence2.empty()) { return sentence1.empty() && sentence2.empty(); } + return 1; std::size_t sentence1_len = utils::joined_size(sentence1); std::size_t sentence2_len = utils::joined_size(sentence2); diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp new file mode 100644 index 0000000..9835206 --- /dev/null +++ b/cpp/src/utils.cpp @@ -0,0 +1,151 @@ +#include "utils.hpp" + +/** + * Finds the longest common prefix between two ranges + */ +template +inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, InputIterator2 last2) +{ + return std::distance(first1, std::mismatch(first1, last1, first2, last2).first); +} + +/** + * Removes common prefix of two string views + */ +std::size_t remove_common_prefix(std::wstring_view& a, std::wstring_view& b) { + auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end()); + a.remove_prefix(prefix); + b.remove_prefix(prefix); + return prefix; +} + +/** + * Removes common suffix of two string views + */ +std::size_t remove_common_suffix(std::wstring_view& a, std::wstring_view& b) { + auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend()); + a.remove_suffix(suffix); + b.remove_suffix(suffix); + return suffix; +} + +/** + * Removes common affix of two string views + */ +Affix utils::remove_common_affix(std::wstring_view& a, std::wstring_view& b) { + return Affix { + remove_common_prefix(a, b), + remove_common_suffix(a, b) + }; +} + +template +void vec_remove_common_affix(T &a, T &b) { + auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end()); + a.erase(a.begin(), prefix.first); + b.erase(b.begin(), prefix.second); + + auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend()); + a.erase(a.end()-suffix, a.end()); + b.erase(b.end()-suffix, b.end()); +} + +void utils::remove_common_affix(std::vector &a, std::vector &b) +{ + vec_remove_common_affix(a, b); + if (!a.empty() && !b.empty()) { + remove_common_prefix(a.front(), b.front()); + remove_common_suffix(a.back(), b.back()); + } +} + +std::wstring utils::join(const std::vector &sentence) { + if (sentence.empty()) { + return std::wstring(); + } + + auto sentence_iter = sentence.begin(); + std::wstring result {*sentence_iter}; + const std::wstring whitespace {0x20}; + ++sentence_iter; + for (; sentence_iter != sentence.end(); ++sentence_iter) { + result.append(whitespace).append(std::wstring {*sentence_iter}); + } + return result; +} + +percent utils::result_cutoff(float result, percent score_cutoff) { + return (result >= score_cutoff) ? result : 0; +} + +// trim from start (in place) +void ltrim(std::wstring &s) { + s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { + return !std::isspace(ch); + })); +} + + +// trim from end (in place) +void rtrim(std::wstring &s) { + s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { + return !std::isspace(ch); + }).base(), s.end()); +} + + +// trim from both ends (in place) +void utils::trim(std::wstring &s) { + ltrim(s); + rtrim(s); +} + + +void utils::lower_case(std::wstring &s) { + std::for_each(s.begin(), s.end(), [](wchar_t & c){ + c = ::tolower(c); + }); +} + +std::wstring utils::default_process(std::wstring s) { + trim(s); + lower_case(s); + return s; +} + +DecomposedSet utils::set_decomposition(std::vector a, std::vector b) { + std::vector intersection; + std::vector difference_ab; + a.erase(std::unique(a.begin(), a.end()), a.end()); + b.erase(std::unique(b.begin(), b.end()), b.end()); + + for (const auto ¤t_a : a) { + auto element_b = std::find(b.begin(), b.end(), current_a); + if (element_b != b.end()) { + b.erase(element_b); + intersection.emplace_back(current_a); + } else { + difference_ab.emplace_back(current_a); + } + } + + return DecomposedSet{intersection, difference_ab, b}; +} + +std::size_t utils::joined_size(const std::wstring_view &x){ + return x.size(); +} + + +std::size_t utils::joined_size(const std::vector &x){ + if (x.empty()) { + return 0; + } + + // there is a whitespace between each word + std::size_t result = x.size() - 1; + for (const auto &y: x) result += y.size(); + + return result; +} \ No newline at end of file diff --git a/cpp/src/utils.hpp b/cpp/src/utils.hpp index 32609d2..3dd781a 100644 --- a/cpp/src/utils.hpp +++ b/cpp/src/utils.hpp @@ -7,54 +7,34 @@ using percent = float; -template -using string_view_vec = std::vector>; - - -namespace detail { - template - auto char_type(T const*) -> T; - - template - auto char_type(T const&) -> typename std::iterator_traits::value_type; -} - -template -using char_type = decltype(detail::char_type(std::declval())); - - -template struct DecomposedSet { - string_view_vec intersection; - string_view_vec difference_ab; - string_view_vec difference_ba; - DecomposedSet(string_view_vec intersection, string_view_vec difference_ab, string_view_vec difference_ba) + std::vector intersection; + std::vector difference_ab; + std::vector difference_ba; + DecomposedSet(std::vector intersection, std::vector difference_ab, std::vector difference_ba) : intersection(std::move(intersection)), difference_ab(std::move(difference_ab)), difference_ba(std::move(difference_ba)) {} }; +struct Affix { + std::size_t prefix_len; + std::size_t suffix_len; +}; + namespace utils { - template< - typename T, typename CharT = char_type, - typename = std::enable_if_t>{}> - > - string_view_vec splitSV(const T &str); - - - template - DecomposedSet set_decomposition(string_view_vec a, string_view_vec b); - - template - std::size_t joined_size(const T &x); + std::vector splitSV(const T &str); - template - std::size_t joined_size(const std::vector &x); + DecomposedSet set_decomposition(std::vector a, std::vector b); - template - std::basic_string join(const string_view_vec &sentence); + std::size_t joined_size(const std::wstring_view &x); + + std::size_t joined_size(const std::vector &x); + + + std::wstring join(const std::vector &sentence); percent result_cutoff(float result, percent score_cutoff); @@ -62,12 +42,16 @@ namespace utils { void lower_case(std::wstring &s); std::wstring default_process(std::wstring s); + + Affix remove_common_affix(std::wstring_view& a, std::wstring_view& b); + + void remove_common_affix(std::vector &a, std::vector &b); } -template -string_view_vec utils::splitSV(const T &str) { - string_view_vec output; +template +inline std::vector utils::splitSV(const T &str) { + std::vector output; // assume a word length of 6 + 1 whitespace output.reserve(str.size() / 7); @@ -82,178 +66,3 @@ string_view_vec utils::splitSV(const T &str) { return output; } - - -template -DecomposedSet utils::set_decomposition(string_view_vec a, string_view_vec b) { - string_view_vec intersection; - string_view_vec difference_ab; - a.erase(std::unique(a.begin(), a.end()), a.end()); - b.erase(std::unique(b.begin(), b.end()), b.end()); - - for (const auto ¤t_a : a) { - auto element_b = std::find(b.begin(), b.end(), current_a); - if (element_b != b.end()) { - b.erase(element_b); - intersection.emplace_back(current_a); - } else { - difference_ab.emplace_back(current_a); - } - } - - return DecomposedSet{intersection, difference_ab, b}; -} - - -/** - * Finds the longest common prefix between two ranges - */ -template -inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2) -{ - return std::distance(first1, std::mismatch(first1, last1, first2, last2).first); -} - -/** - * Removes common prefix of two string views - */ -template -inline std::size_t remove_common_prefix(std::basic_string_view& a, std::basic_string_view& b) { - auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end()); - a.remove_prefix(prefix); - b.remove_prefix(prefix); - return prefix; -} - -/** - * Removes common suffix of two string views - */ -template -inline std::size_t remove_common_suffix(std::basic_string_view& a, std::basic_string_view& b) { - auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend()); - a.remove_suffix(suffix); - b.remove_suffix(suffix); - return suffix; -} - -struct Affix { - std::size_t prefix_len; - std::size_t suffix_len; -}; - -/** - * Removes common affix of two string views - */ -template -inline Affix remove_common_affix(std::basic_string_view& a, std::basic_string_view& b) { - return Affix { - remove_common_prefix(a, b), - remove_common_suffix(a, b) - }; -} - - -template -inline void vec_remove_common_affix(T &a, T &b) { - auto prefix = std::mismatch(a.begin(), a.end(), b.begin(), b.end()); - a.erase(a.begin(), prefix.first); - b.erase(b.begin(), prefix.second); - - auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend()); - a.erase(a.end()-suffix, a.end()); - b.erase(b.end()-suffix, b.end()); -} - -template -inline void vec_common_affix(std::vector &a, std::vector &b) { - iterable_remove_common_affix(a, b); -} - -template -inline void remove_common_affix(std::vector &a, std::vector &b) -{ - vec_remove_common_affix(a, b); - if (!a.empty() && !b.empty()) { - remove_common_prefix(a.front(), b.front()); - remove_common_suffix(a.back(), b.back()); - } -} - - -template -inline std::size_t utils::joined_size(const T &x){ - return x.size(); -} - - -template -inline std::size_t utils::joined_size(const std::vector &x){ - if (x.empty()) { - return 0; - } - - // there is a whitespace between each word - std::size_t result = x.size() - 1; - for (const auto &y: x) result += y.size(); - - return result; -} - - -template -std::basic_string utils::join(const string_view_vec &sentence) { - if (sentence.empty()) { - return std::basic_string(); - } - - auto sentence_iter = sentence.begin(); - std::basic_string result {*sentence_iter}; - const std::basic_string whitespace {0x20}; - ++sentence_iter; - for (; sentence_iter != sentence.end(); ++sentence_iter) { - result.append(whitespace).append(std::basic_string {*sentence_iter}); - } - return result; -} - - -inline percent utils::result_cutoff(float result, percent score_cutoff) { - return (result >= score_cutoff) ? result : 0; -} - - -// trim from start (in place) -inline void ltrim(std::wstring &s) { - s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { - return !std::isspace(ch); - })); -} - - -// trim from end (in place) -inline void rtrim(std::wstring &s) { - s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { - return !std::isspace(ch); - }).base(), s.end()); -} - - -// trim from both ends (in place) -inline void utils::trim(std::wstring &s) { - ltrim(s); - rtrim(s); -} - - -inline void utils::lower_case(std::wstring &s) { - std::for_each(s.begin(), s.end(), [](wchar_t & c){ - c = ::tolower(c); - }); -} - -inline std::wstring utils::default_process(std::wstring s) { - trim(s); - lower_case(s); - return s; -} \ No newline at end of file diff --git a/setup.py b/setup.py index da22cc9..4d29ed1 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,9 @@ ext_modules = [ [ 'python/src/rapidfuzz.cpp', 'cpp/src/fuzz.cpp', - 'cpp/src/process.cpp' + 'cpp/src/process.cpp', + 'cpp/src/levenshtein.cpp', + 'cpp/src/utils.cpp' ], include_dirs=[ # Path to pybind11 headers