From 097365692aecd9e86f0995d9434d794b9d66a56d Mon Sep 17 00:00:00 2001
From: maxbachmann
Date: Sat, 21 Mar 2020 19:39:33 +0100
Subject: [PATCH] make levenshtein work with string_view and wstring_view

---
 cpp/src/fuzz.cpp        |  18 +--
 cpp/src/levenshtein.cpp | 234 ------------------------------
 cpp/src/levenshtein.hpp | 298 ++++++++++++++++++++++++++++++++--------
 cpp/src/utils.hpp       |  33 ++---
 setup.py                |   1 -
 5 files changed, 264 insertions(+), 320 deletions(-)
 delete mode 100644 cpp/src/levenshtein.cpp

diff --git a/cpp/src/fuzz.cpp b/cpp/src/fuzz.cpp
index 109b421..876a60c 100644
--- a/cpp/src/fuzz.cpp
+++ b/cpp/src/fuzz.cpp
@@ -61,9 +61,9 @@ percent _token_ratio(const std::wstring &a, const std::wstring &b, percent score
     auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
 
-    std::size_t ab_len = utils::joined_size(difference_ab, 1);
-    std::size_t ba_len = utils::joined_size(difference_ba, 1);
-    std::size_t double_prefix = 2 * utils::joined_size(intersection, 1);
+    std::size_t ab_len = utils::joined_size(difference_ab);
+    std::size_t ba_len = utils::joined_size(difference_ba);
+    std::size_t double_prefix = 2 * utils::joined_size(intersection);
 
     // fuzzywuzzy joined sect and ab/ba for comparisons
     // this is not done here as an optimisation, so the lengths get incremented by 1
@@ -76,11 +76,11 @@ percent _token_ratio(const std::wstring &a, const std::wstring &b, percent score
         ++ba_len;
     }
 
-    float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100, L" ");
+    float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff / 100);
 
    // TODO: could add a score cutoff as well, but that would need a copy of most things from normalized_score_cutoff
    // as an alternative, add another utility function to levenshtein for this case
-    std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba, L" ");
+    std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba);
     if (sect_distance != std::numeric_limits<std::size_t>::max()) {
         std::size_t lensum = ab_len + ba_len + double_prefix;
         result = std::max(result, (float)1.0 - sect_distance / (float)lensum);
@@ -184,9 +184,9 @@ percent fuzz::token_set_ratio(const std::wstring &a, const std::wstring &b, perc
     auto [intersection, difference_ab, difference_ba] = utils::set_decomposition(tokens_a, tokens_b);
 
-    std::size_t ab_len = utils::joined_size(difference_ab, 1);
-    std::size_t ba_len = utils::joined_size(difference_ba, 1);
-    std::size_t double_prefix = 2 * utils::joined_size(intersection, 1);
+    std::size_t ab_len = utils::joined_size(difference_ab);
+    std::size_t ba_len = utils::joined_size(difference_ba);
+    std::size_t double_prefix = 2 * utils::joined_size(intersection);
 
     // fuzzywuzzy joined sect and ab/ba for comparisons
     // this is not done here as an optimisation, so the lengths get incremented by 1
@@ -201,7 +201,7 @@ percent fuzz::token_set_ratio(const std::wstring &a, const std::wstring &b, perc
 
    // TODO: could add a score cutoff as well, but that would need a copy of most things from normalized_score_cutoff
    // as an alternative, add another utility function to levenshtein for this case
-    std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba, L" ");
+    std::size_t sect_distance = levenshtein::weighted_distance(difference_ab, difference_ba);
     float result = 0;
     if (sect_distance != std::numeric_limits<std::size_t>::max()) {
         std::size_t lensum = ab_len + ba_len + double_prefix;
diff --git a/cpp/src/levenshtein.cpp b/cpp/src/levenshtein.cpp
deleted file mode 100644
index b844d4d..0000000
--- a/cpp/src/levenshtein.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-#include "levenshtein.hpp"
-#include <numeric>
-#include <limits>
-
-
-template <typename MinDistanceCalc = std::false_type, typename Delimiter = std::nullopt_t>
-auto levenshtein_word_cmp(const char &letter_cmp, const string_view_vec &words,
-                          std::vector<std::size_t> &cache, std::size_t current_cache, Delimiter delimiter=std::nullopt)
-{
-    std::size_t result = current_cache + 1;
-    auto cache_iter = cache.begin();
-    auto word_iter = words.begin();
-    auto min_distance = std::numeric_limits<std::size_t>::max();
-
-    auto charCmp = [&] (const char &char2) {
-        if (letter_cmp == char2) { result = current_cache; }
-        else { ++result; }
-
-        current_cache = *cache_iter;
-        if (result > current_cache + 1) {
-            result = current_cache + 1;
-        }
-
-        if constexpr(!std::is_same<MinDistanceCalc, std::false_type>::value) {
-            if (current_cache < min_distance) {
-                min_distance = current_cache;
-            }
-        }
-
-        *cache_iter = result;
-        ++cache_iter;
-    };
-
-    // no delimiter should be added in front of the first word
-    for (const auto &letter : *word_iter) {
-        charCmp(letter);
-    }
-    ++word_iter;
-
-    for (; word_iter != words.end(); ++word_iter) {
-        // between every word there should be a delimiter if one exists
-        if constexpr(!std::is_same<Delimiter, std::nullopt_t>::value) {
-            for (const auto &letter : delimiter) {
-                charCmp(letter);
-            }
-        }
-        // check following word
-        for (const auto &letter : *word_iter) {
-            charCmp(letter);
-        }
-    }
-
-    if constexpr(!std::is_same<MinDistanceCalc, std::false_type>::value) {
-        return min_distance;
-    }
-}
-
-
-std::size_t levenshtein::weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, std::wstring_view delimiter) {
-    remove_common_affix(sentence1, sentence2);
-    std::size_t sentence1_len = utils::joined_size(sentence1, delimiter);
-    std::size_t sentence2_len = utils::joined_size(sentence2, delimiter);
-
-    if (sentence2_len > sentence1_len) {
-        std::swap(sentence1, sentence2);
-        std::swap(sentence1_len, sentence2_len);
-    }
-
-    if (!sentence2_len) {
-        return sentence1_len;
-    }
-
-    std::vector<std::size_t> cache(sentence2_len);
-    std::iota(cache.begin(), cache.end(), 1);
-
-    std::size_t range1_pos = 0;
-    auto word_iter = sentence1.begin();
-
-    // no delimiter in front of first word
-    for (const auto &letter : *word_iter) {
-        levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
-        ++range1_pos;
-    }
-
-    ++word_iter;
-    for (; word_iter != sentence1.end(); ++word_iter) {
-        // delimiter between words
-        for (const auto &letter : delimiter) {
-            levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
-            ++range1_pos;
-        }
-
-        for (const auto &letter : *word_iter) {
-            levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
-            ++range1_pos;
-        }
-    }
-
-    return cache.back();
-}
-
-
-std::size_t levenshtein::weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2, std::size_t max_distance, std::wstring_view delimiter) {
-    remove_common_affix(sentence1, sentence2);
-    std::size_t sentence1_len = utils::joined_size(sentence1, delimiter);
-    std::size_t sentence2_len = utils::joined_size(sentence2, delimiter);
-
-    if (sentence2_len > sentence1_len) {
-        std::swap(sentence1, sentence2);
-        std::swap(sentence1_len, sentence2_len);
-    }
-
-    if (!sentence2_len) {
-        return sentence1_len;
-    }
-
-    std::vector<std::size_t> cache(sentence2_len);
-    std::iota(cache.begin(), cache.end(), 1);
-
-    std::size_t range1_pos = 0;
-    auto word_iter = sentence1.begin();
-
-    // no delimiter in front of first word
-    for (const auto &letter : *word_iter) {
-        auto min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos, delimiter);
-        if (min_distance > max_distance) {
-            return std::numeric_limits<std::size_t>::max();
-        }
-        ++range1_pos;
-    }
-
-    ++word_iter;
-    for (; word_iter != sentence1.end(); ++word_iter) {
-        // delimiter between words
-        for (const auto &letter : delimiter) {
-            auto min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos, delimiter);
-            if (min_distance > max_distance) {
-                return std::numeric_limits<std::size_t>::max();
-            }
-            ++range1_pos;
-        }
-
-        for (const auto &letter : *word_iter) {
-            auto min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos, delimiter);
-            if (min_distance > max_distance) {
-                return std::numeric_limits<std::size_t>::max();
-            }
-            ++range1_pos;
-        }
-    }
-
-    return cache.back();
-}
-
-
-std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, std::wstring_view delimiter) {
-    remove_common_affix(sentence1, sentence2);
-
-    if (sentence2.length() > sentence1.length()) std::swap(sentence1, sentence2);
-
-    if (sentence2.empty()) {
-        return sentence1.length();
-    }
-
-    std::vector<std::size_t> cache(sentence2.length());
-    std::iota(cache.begin(), cache.end(), 1);
-
-    std::size_t sentence1_pos = 0;
-    for (const auto &char1 : sentence1) {
-        auto cache_iter = cache.begin();
-        std::size_t current_cache = sentence1_pos;
-        std::size_t result = sentence1_pos + 1;
-        for (const auto &char2 : sentence2) {
-            if (char1 == char2) {
-                result = current_cache;
-            } else {
-                ++result;
-            }
-            current_cache = *cache_iter;
-            if (result > current_cache + 1) {
-                result = current_cache + 1;
-            }
-            *cache_iter = result;
-            ++cache_iter;
-        }
-        ++sentence1_pos;
-    }
-
-    return cache.back();
-}
-
-
-std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, std::size_t max_distance, std::wstring_view delimiter) {
-    remove_common_affix(sentence1, sentence2);
-
-    if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);
-
-    if (sentence2.empty()) {
-        return sentence1.length();
-    }
-
-    std::vector<std::size_t> cache(sentence2.length());
-    std::iota(cache.begin(), cache.end(), 1);
-
-    std::size_t sentence1_pos = 0;
-    for (const auto &char1 : sentence1) {
-        auto cache_iter = cache.begin();
-        std::size_t current_cache = sentence1_pos;
-        std::size_t result = sentence1_pos + 1;
-        auto min_distance = std::numeric_limits<std::size_t>::max();
-        for (const auto &char2 : sentence2) {
-            if (char1 == char2) {
-                result = current_cache;
-            } else {
-                ++result;
-            }
-            current_cache = *cache_iter;
-            if (result > current_cache + 1) {
-                result = current_cache + 1;
-            }
-
-            if (current_cache < min_distance) {
-                min_distance = current_cache;
-            }
-            *cache_iter = result;
-            ++cache_iter;
-        }
-        if (min_distance > max_distance) {
-            return std::numeric_limits<std::size_t>::max();
-        }
-        ++sentence1_pos;
-    }
-    return cache.back();
-}
diff --git a/cpp/src/levenshtein.hpp b/cpp/src/levenshtein.hpp
index a133e85..f32f783 100644
--- a/cpp/src/levenshtein.hpp
+++ b/cpp/src/levenshtein.hpp
@@ -3,9 +3,10 @@
 #include <vector>
 #include <cmath>
 #include <numeric>
+#include <optional>
+#include <string_view>
 #include "utils.hpp"
 
-
 namespace levenshtein {
     enum EditType {
         EditKeep,
@@ -47,6 +48,10 @@ namespace levenshtein {
     std::vector<MatchingBlock> matching_blocks(std::basic_string_view<CharT> sentence1,
                                                std::basic_string_view<CharT> sentence2);
 
+    template <typename MinDistanceCalc = std::false_type, typename CharT>
+    auto levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
+                              std::vector<std::size_t> &cache, std::size_t current_cache);
+
     /**
      * Calculates the minimum number of insertions, deletions, and substitutions
      * required to change one sequence into the other according to Levenshtein.
@@ -58,68 +63,31 @@ namespace levenshtein {
      * Insert  | 1
      * Remove  | 1
      * Replace | 2
+     *
+     * @param sentence1 first sentence to match (can be either a string type or a vector of strings)
+     * @param sentence2 second sentence to match (can be either a string type or a vector of strings)
+     * @param max_distance maximum distance, used to exit early. Using it makes the calculation
+     *        about 20% slower, so it should only be used when an early exit is likely
+     * @return weighted levenshtein distance
      */
-    std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2,
-                                  std::wstring_view delimiter=L"");
-    std::size_t weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2,
-                                  std::wstring_view delimiter=L"");
+    template <typename CharT, typename MinDistance = std::nullopt_t>
+    std::size_t weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MinDistance max_distance=std::nullopt);
 
+    template <typename MinDistance = std::nullopt_t>
+    std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MinDistance max_distance=std::nullopt);
 
-    /**
-     * These functions allow providing a max_distance parameter that can be used to exit early when the
-     * calculated levenshtein distance is at least as big as max_distance and will return the maximal
-     * possible value for std::size_t.
-     * This range check makes the levenshtein calculation about 20% slower, so it should be only used
-     * when it can usually exit early.
-     */
-    std::size_t weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2,
-                                  std::size_t max_distance, std::wstring_view delimiter=L"");
-    std::size_t weighted_distance(std::vector<std::wstring_view> sentence1, std::vector<std::wstring_view> sentence2,
-                                  std::size_t max_distance, std::wstring_view delimiter=L"");
+    template <typename MinDistance = std::nullopt_t>
+    std::size_t weighted_distance(std::string_view sentence1, std::string_view sentence2, MinDistance max_distance=std::nullopt);
+
+    template <typename CharT, typename MinDistance = std::nullopt_t>
+    std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MinDistance max_distance=std::nullopt);
 
     /**
      * Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
      * 1.0 (inclusive), where 1.0 means the sequences are the same.
      */
     template <typename Sentence1, typename Sentence2>
-    float normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2,
-                                       float min_ratio=0.0, std::wstring_view delimiter=L"")
-    {
-        if (sentence1.empty() && sentence2.empty()) {
-            return 1.0;
-        }
-
-        if (sentence1.empty() || sentence1.empty()) {
-            return 0.0;
-        }
-
-        std::size_t sentence1_len = utils::joined_size(sentence1, delimiter);
-        std::size_t sentence2_len = utils::joined_size(sentence2, delimiter);
-        std::size_t lensum = sentence1_len + sentence2_len;
-
-        // constant time calculation to find a string ratio based on the string length
-        // so it can exit early without running any levenshtein calculations
-        std::size_t min_distance = (sentence1_len > sentence2_len)
-            ? sentence1_len - sentence2_len
-            : sentence2_len - sentence1_len;
-
-        float len_ratio = 1.0 - (float)min_distance / (float)lensum;
-        if (len_ratio < min_ratio) {
-            return 0.0;
-        }
-
-        // TODO: this needs more thoughts when to start using score cutoff, since it performs slower when it can not exit early
-        // -> just because it has a smaller ratio does not mean levenshtein can always exit early
-        // has to be tested with some more real examples
-        std::size_t distance = (min_ratio > 0.7)
-            ? weighted_distance(sentence1, sentence2, std::ceil((float)lensum - min_ratio * lensum), delimiter)
-            : weighted_distance(sentence1, sentence2, delimiter);
-
-        if (distance == std::numeric_limits<std::size_t>::max()) {
-            return 0.0;
-        }
-        return 1.0 - (float)distance / (float)lensum;
-    }
+    float normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio=0.0);
 }
@@ -267,4 +235,224 @@ levenshtein::matching_blocks(std::basic_string_view<CharT> sentence1, std::basic
     mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
     return mblocks;
-}
\ No newline at end of file
+}
+
+
+template <typename MinDistanceCalc, typename CharT>
+inline auto levenshtein::levenshtein_word_cmp(const CharT &letter_cmp, const string_view_vec<CharT> &words,
+                                              std::vector<std::size_t> &cache, std::size_t current_cache)
+{
+    std::size_t result = current_cache + 1;
+    auto cache_iter = cache.begin();
+    auto word_iter = words.begin();
+    auto min_distance = std::numeric_limits<std::size_t>::max();
+
+    auto charCmp = [&] (const CharT &char2) {
+        if (letter_cmp == char2) { result = current_cache; }
+        else { ++result; }
+
+        current_cache = *cache_iter;
+        if (result > current_cache + 1) {
+            result = current_cache + 1;
+        }
+
+        if constexpr(!std::is_same_v<MinDistanceCalc, std::false_type>) {
+            if (current_cache < min_distance) {
+                min_distance = current_cache;
+            }
+        }
+
+        *cache_iter = result;
+        ++cache_iter;
+    };
+
+    // no whitespace should be added in front of the first word
+    for (const auto &letter : *word_iter) {
+        charCmp(letter);
+    }
+    ++word_iter;
+
+    for (; word_iter != words.end(); ++word_iter) {
+        // between every word there should be one whitespace
+        charCmp(0x20);
+
+        // check following word
+        for (const auto &letter : *word_iter) {
+            charCmp(letter);
+        }
+    }
+
+    if constexpr(!std::is_same_v<MinDistanceCalc, std::false_type>) {
+        return min_distance;
+    }
+}
+
+
+template <typename CharT, typename MinDistance>
+inline std::size_t levenshtein::weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2,
+                                                  MinDistance max_distance) {
+    remove_common_affix(sentence1, sentence2);
+    std::size_t sentence1_len = utils::joined_size(sentence1);
+    std::size_t sentence2_len = utils::joined_size(sentence2);
+
+    if (sentence2_len > sentence1_len) {
+        std::swap(sentence1, sentence2);
+        std::swap(sentence1_len, sentence2_len);
+    }
+
+    if (!sentence2_len) {
+        return sentence1_len;
+    }
+
+    std::vector<std::size_t> cache(sentence2_len);
+    std::iota(cache.begin(), cache.end(), 1);
+
+    std::size_t range1_pos = 0;
+    auto word_iter = sentence1.begin();
+
+    // no delimiter in front of first word
+    for (const auto &letter : *word_iter) {
+        if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
+            std::size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
+            if (min_distance > max_distance) {
+                return std::numeric_limits<std::size_t>::max();
+            }
+        } else {
+            levenshtein_word_cmp(letter, sentence2, cache, range1_pos);
+        }
+
+        ++range1_pos;
+    }
+
+    ++word_iter;
+    for (; word_iter != sentence1.end(); ++word_iter) {
+        // whitespace between words
+        if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
+            std::size_t min_distance = levenshtein_word_cmp<std::true_type>((CharT)0x20, sentence2, cache, range1_pos);
+            if (min_distance > max_distance) {
+                return std::numeric_limits<std::size_t>::max();
+            }
+        } else {
+            levenshtein_word_cmp((CharT)0x20, sentence2, cache, range1_pos);
+        }
+
+        ++range1_pos;
+
+        for (const auto &letter : *word_iter) {
+            if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
+                std::size_t min_distance = levenshtein_word_cmp<std::true_type>(letter, sentence2, cache, range1_pos);
+                if (min_distance > max_distance) {
+                    return std::numeric_limits<std::size_t>::max();
+                }
+            } else {
+                levenshtein_word_cmp(letter, sentence2, cache, range1_pos);
+            }
+
+            ++range1_pos;
+        }
+    }
+
+    return cache.back();
+}
+
+
+template <typename MinDistance>
+inline std::size_t levenshtein::weighted_distance(std::wstring_view sentence1, std::wstring_view sentence2, MinDistance max_distance) {
+    return weighted_distance_impl(sentence1, sentence2, max_distance);
+}
+
+
+template <typename MinDistance>
+inline std::size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, MinDistance max_distance) {
+    return weighted_distance_impl(sentence1, sentence2, max_distance);
+}
+
+
+template <typename CharT, typename MinDistance>
+inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<CharT> sentence1, std::basic_string_view<CharT> sentence2, MinDistance max_distance) {
+
+    remove_common_affix(sentence1, sentence2);
+
+    if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);
+
+    if (sentence2.empty()) {
+        return sentence1.length();
+    }
+
+    std::vector<std::size_t> cache(sentence2.length());
+    std::iota(cache.begin(), cache.end(), 1);
+
+    std::size_t sentence1_pos = 0;
+    for (const auto &char1 : sentence1) {
+        auto cache_iter = cache.begin();
+        std::size_t current_cache = sentence1_pos;
+        std::size_t result = sentence1_pos + 1;
+        auto min_distance = std::numeric_limits<std::size_t>::max();
+        for (const auto &char2 : sentence2) {
+            if (char1 == char2) {
+                result = current_cache;
+            } else {
+                ++result;
+            }
+            current_cache = *cache_iter;
+            if (result > current_cache + 1) {
+                result = current_cache + 1;
+            }
+            if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
+                if (current_cache < min_distance) {
+                    min_distance = current_cache;
+                }
+            }
+            *cache_iter = result;
+            ++cache_iter;
+        }
+        if constexpr(!std::is_same_v<MinDistance, std::nullopt_t>) {
+            if (min_distance > max_distance) {
+                return std::numeric_limits<std::size_t>::max();
+            }
+        }
+        ++sentence1_pos;
+    }
+    return cache.back();
+}
+
+
+
+template <typename Sentence1, typename Sentence2>
+inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio)
+{
+    if (sentence1.empty() && sentence2.empty()) {
+        return 1.0;
+    }
+
+    if (sentence1.empty() || sentence2.empty()) {
+        return 0.0;
+    }
+
+    std::size_t sentence1_len = utils::joined_size(sentence1);
+    std::size_t sentence2_len = utils::joined_size(sentence2);
+    std::size_t lensum = sentence1_len + sentence2_len;
+
+    // constant time calculation to find a string ratio based on the string length
+    // so it can exit early without running any levenshtein calculations
+    std::size_t min_distance = (sentence1_len > sentence2_len)
+        ? sentence1_len - sentence2_len
+        : sentence2_len - sentence1_len;
+
+    float len_ratio = 1.0 - (float)min_distance / (float)lensum;
+    if (len_ratio < min_ratio) {
+        return 0.0;
+    }
+
+    // TODO: needs more thought on when to start using the score cutoff, since the calculation is slower when it can not exit early
+    // -> just because the length ratio is smaller does not mean levenshtein can always exit early
+    // has to be tested with some more real examples
+    std::size_t distance = (min_ratio > 0.7)
+        ? weighted_distance(sentence1, sentence2, std::ceil((float)lensum - min_ratio * lensum))
+        : weighted_distance(sentence1, sentence2);
+
+    if (distance == std::numeric_limits<std::size_t>::max()) {
+        return 0.0;
+    }
+    return 1.0 - (float)distance / (float)lensum;
+}
diff --git a/cpp/src/utils.hpp b/cpp/src/utils.hpp
index d4165cd..a7b8e88 100644
--- a/cpp/src/utils.hpp
+++ b/cpp/src/utils.hpp
@@ -39,11 +39,11 @@ namespace utils {
     decomposed_set set_decomposition(string_view_vec a, string_view_vec b);
 
-    template <typename T, typename Delimiter = std::nullopt_t>
-    inline std::size_t joined_size(const T &x, const Delimiter &delimiter=std::nullopt);
+    template <typename T>
+    std::size_t joined_size(const T &x);
 
-    template <typename T, typename Delimiter = std::nullopt_t>
-    inline std::size_t joined_size(const std::vector<T> &x, const Delimiter &delimiter=std::nullopt);
+    template <typename T>
+    std::size_t joined_size(const std::vector<T> &x);
 
 
     template
@@ -170,31 +170,22 @@ inline void remove_common_affix(std::vector<T> &a, std::vector<T> &b)
 }
 
 
-template <typename T, typename Delimiter>
-inline std::size_t utils::joined_size(const T &x, const Delimiter &delimiter){
+template <typename T>
+inline std::size_t utils::joined_size(const T &x){
     return x.size();
 }
 
 
-template <typename T, typename Delimiter>
-inline std::size_t utils::joined_size(const std::vector<T> &x, const Delimiter &delimiter){
+
+template <typename T>
+inline std::size_t utils::joined_size(const std::vector<T> &x){
     if (x.empty()) {
         return 0;
     }
 
-    std::size_t result;
-    if constexpr(!std::is_same<Delimiter, std::nullopt_t>::value) {
-        if constexpr(std::is_integral<Delimiter>::value) {
-            result = (x.size() - 1) * delimiter;
-        } else {
-            result = (x.size() - 1) * delimiter.size();
-        }
-    } else {
-        result = 0;
-    }
+    // there is one whitespace between each pair of words
+    std::size_t result = x.size() - 1;
+    for (const auto &y: x) result += y.size();
 
-    for (const auto &y: x) {
-        result += joined_size(y, delimiter);
-    }
     return result;
 }
diff --git a/setup.py b/setup.py
index f3d7bca..492999a 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,6 @@ ext_modules = [
         '_rapidfuzz_cpp',
         [
             'python/src/rapidfuzz.cpp',
-            'cpp/src/levenshtein.cpp',
            'cpp/src/fuzz.cpp',
            'cpp/src/process.cpp'
        ],
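
---

Note (not part of the patch): a minimal usage sketch of the new templated API. It assumes the patched, now header-only levenshtein.hpp is on the include path and that the translation unit is compiled as C++17 (the code relies on if constexpr and std::string_view); the example strings are made up.

```cpp
#include <cstddef>
#include <iostream>
#include <string_view>
#include "levenshtein.hpp"  // assumed include path after this patch

int main() {
    // after this patch both narrow and wide string views are accepted
    std::size_t d1 = levenshtein::weighted_distance(std::string_view{"lewenstein"},
                                                    std::string_view{"levenshtein"});
    std::size_t d2 = levenshtein::weighted_distance(std::wstring_view{L"lewenstein"},
                                                    std::wstring_view{L"levenshtein"});

    // normalized score in [0.0, 1.0]; 1.0 means the sequences are equal
    float score = levenshtein::normalized_weighted_distance(std::string_view{"rapid fuzz"},
                                                            std::string_view{"rapidfuzz"});

    std::cout << d1 << ' ' << d2 << ' ' << score << '\n';
}
```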
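Likewise a sketch of the max_distance early exit described in the doc comment: when the running row minimum exceeds the cutoff, the functions return std::numeric_limits<std::size_t>::max() as a sentinel instead of throwing. The cutoff value 2 is arbitrary, chosen only for illustration. Because the dispatch happens via if constexpr on MinDistance, the call without a cutoff pays nothing for the tracking code.

```cpp
#include <cstddef>
#include <iostream>
#include <limits>
#include <string_view>
#include "levenshtein.hpp"  // assumed include path after this patch

int main() {
    // passing a max_distance selects the ~20% slower tracking variant
    // that can abandon the calculation early
    std::size_t d = levenshtein::weighted_distance(std::string_view{"completely different"},
                                                   std::string_view{"unrelated text"},
                                                   std::size_t{2});

    if (d == std::numeric_limits<std::size_t>::max()) {
        std::cout << "distance exceeds the cutoff\n";  // sentinel: exited early
    } else {
        std::cout << "distance: " << d << '\n';
    }
}
```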