diff --git a/.gitignore b/.gitignore index 2f2b1eb..0319093 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ .vscode/ +__pycache__/ +.idea/ rapidfuzz.egg-info/ dist/ *.data diff --git a/cpp/src/levenshtein.hpp b/cpp/src/levenshtein.hpp index 7b83034..f6f4f37 100644 --- a/cpp/src/levenshtein.hpp +++ b/cpp/src/levenshtein.hpp @@ -87,6 +87,9 @@ namespace levenshtein { template <typename CharT, typename MaxDistance = std::optional<std::size_t>> std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance=std::nullopt); + + size_t generic_distance(std::wstring_view source, std::wstring_view target, size_t insert_cost = 1, size_t delete_cost = 1, size_t replace_cost = 1); + /** * Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and * 1.0 (inclusive), where 1.0 means the sequences are the same. @@ -283,16 +286,20 @@ inline std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstring_view sentence2) std::iota(cache.begin(), cache.end(), 0); for (const auto &char1 : sentence1) { - size_t temp = cache[0]++; + auto cache_iter = cache.begin(); + size_t temp = *cache_iter; + *cache_iter += 1; + for (size_t j = 1; j < cache.size(); ++j) { - size_t p = cache[j - 1]; - size_t r = cache[j]; + size_t p = *cache_iter; + ++cache_iter; + size_t r = *cache_iter; temp = std::min( std::min(r, p) + 1, temp + (char1 == sentence2[j - 1] ? 
0 : 1) ); - std::swap(cache[j], temp); + std::swap(*cache_iter, temp); } } return cache.back(); @@ -481,6 +488,44 @@ inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view +inline size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2, size_t insert_cost, size_t delete_cost, size_t replace_cost) { + if (sentence1.size() > sentence2.size()) { + std::swap(sentence1, sentence2); + std::swap(insert_cost, delete_cost); + } + + const size_t min_size = sentence1.size(); + std::vector<size_t> cache(sentence1.size() + 1); + + cache[0] = 0; + for (size_t i = 1; i < cache.size(); ++i) { + cache[i] = cache[i - 1] + delete_cost; + } + + for (const auto &char2 : sentence2) { + auto cache_iter = cache.begin(); + size_t temp = *cache_iter; + *cache_iter += insert_cost; + + for (const auto &char1 : sentence1) { + if (char1 != char2) { + temp = std::min({ + *cache_iter + delete_cost, + *(cache_iter+1) + insert_cost, + temp + replace_cost + }); + } + ++cache_iter; + std::swap(*cache_iter, temp); + } + } + + return cache.back(); +} + template <typename Sentence1, typename Sentence2> inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio) diff --git a/python/src/rapidfuzz.cpp b/python/src/rapidfuzz.cpp index 12a8e65..4277ce4 100644 --- a/python/src/rapidfuzz.cpp +++ b/python/src/rapidfuzz.cpp @@ -233,29 +233,32 @@ PYBIND11_MODULE(_rapidfuzz_cpp, m) { py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0); mlevenshtein.def("weighted_distance", - [](std::wstring_view s1, std::wstring_view s2){ - return levenshtein::weighted_distance(s1, s2); + [](std::wstring_view s1, std::wstring_view s2, size_t insert_cost, size_t delete_cost, size_t replace_cost){ + if (insert_cost == 1 && delete_cost == 1) { + if (replace_cost == 1) { + return levenshtein::distance(s1, s2); + } else if (replace_cost == 2) { + return levenshtein::weighted_distance(s1, s2); + } + } + return levenshtein::generic_distance(s1, s2, insert_cost, delete_cost, replace_cost); }, R"pbdoc( Calculates the minimum number of insertions, deletions, and substitutions - required to change one sequence into the other according to 
Levenshtein. - Opposed to the normal distance function which has a cost of 1 for all edit operations, - it uses the following costs for edit operations: - - edit operation | cost - :------------- | :--- - Insert | 1 - Remove | 1 - Replace | 2 + required to change one sequence into the other according to Levenshtein with custom + costs for insertion, deletion and substitution Args: s1 (str): first string to compare s2 (str): second string to compare + insert_cost (int): cost for insertions + delete_cost (int): cost for deletions + replace_cost (int): cost for substitutions Returns: int: weighted levenshtein distance between s1 and s2 )pbdoc", - py::arg("s1"), py::arg("s2")); + py::arg("s1"), py::arg("s2"), py::arg("insert_cost")=1, py::arg("delete_cost")=1, py::arg("replace_cost")=1); mlevenshtein.def("normalized_weighted_distance", [](std::wstring_view s1, std::wstring_view s2, float score_cutoff){ @@ -263,6 +266,13 @@ PYBIND11_MODULE(_rapidfuzz_cpp, m) { }, R"pbdoc( Calculates a normalized levenshtein distance based on levenshtein.weighted_distance + It uses the following costs for edit operations: + + edit operation | cost + :------------- | :--- + Insert | 1 + Remove | 1 + Replace | 2 Args: s1 (str): first string to compare