Merge pull request #16 from rhasspy/generic_levenshtein

Add support for custom edit-operation weights (insert/delete/replace) in the Levenshtein distance
This commit is contained in:
maxbachmann 2020-03-31 11:56:34 +02:00 committed by GitHub
commit 5a759c4fa4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 73 additions and 16 deletions

2
.gitignore vendored
View File

@ -1,4 +1,6 @@
.vscode/
__pycache__/
.idea/
rapidfuzz.egg-info/
dist/
*.data

View File

@ -87,6 +87,9 @@ namespace levenshtein {
template<typename CharT, typename MaxDistance=std::nullopt_t>
std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance=std::nullopt);
size_t generic_distance(std::wstring_view source, std::wstring_view target, size_t insert_cost = 1, size_t delete_cost = 1, size_t replace_cost = 1);
/**
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
* 1.0 (inclusive), where 1.0 means the sequences are the same.
@ -283,16 +286,20 @@ inline std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstri
std::iota(cache.begin(), cache.end(), 0);
for (const auto &char1 : sentence1) {
size_t temp = cache[0]++;
auto cache_iter = cache.begin();
size_t temp = *cache_iter;
*cache_iter += 1;
for (size_t j = 1; j < cache.size(); ++j)
{
size_t p = cache[j - 1];
size_t r = cache[j];
size_t p = *cache_iter;
++cache_iter;
size_t r = *cache_iter;
temp = std::min(
std::min(r, p) + 1,
temp + (char1 == sentence2[j - 1] ? 0 : 1)
);
std::swap(cache[j], temp);
std::swap(*cache_iter, temp);
}
}
return cache.back();
@ -481,6 +488,44 @@ inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<Ch
}
/**
 * Calculates the Levenshtein distance between sentence1 and sentence2 using
 * caller-supplied costs for the three edit operations.
 *
 * Uses the classic single-row dynamic programming formulation, so memory is
 * O(min(len1, len2)) and time is O(len1 * len2).
 *
 * @param sentence1    first string to compare
 * @param sentence2    second string to compare
 * @param insert_cost  cost of inserting one character (default 1)
 * @param delete_cost  cost of deleting one character (default 1)
 * @param replace_cost cost of substituting one character (default 1)
 * @return weighted edit distance between sentence1 and sentence2
 */
inline size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2,
                                            size_t insert_cost, size_t delete_cost, size_t replace_cost)
{
    // Characters shared at the start and end never need an edit.
    remove_common_affix(sentence1, sentence2);

    // Keep the shorter string in sentence1 so the DP row stays small.
    // Swapping the operands also swaps the roles of insert and delete.
    if (sentence1.size() > sentence2.size()) {
        std::swap(sentence1, sentence2);
        std::swap(insert_cost, delete_cost);
    }

    // cache[j] = cost of turning the empty prefix of sentence2 into the first
    // j characters of sentence1, i.e. j deletions.
    std::vector<size_t> cache(sentence1.size() + 1);
    cache[0] = 0;
    for (size_t i = 1; i < cache.size(); ++i) {
        cache[i] = cache[i - 1] + delete_cost;
    }

    for (const auto &char2 : sentence2) {
        auto cache_iter = cache.begin();
        // temp carries the diagonal value cache[j-1] from the previous row.
        size_t temp = *cache_iter;
        *cache_iter += insert_cost;

        for (const auto &char1 : sentence1) {
            if (char1 != char2) {
                temp = std::min({
                    *cache_iter + delete_cost,
                    *(cache_iter + 1) + insert_cost,
                    temp + replace_cost
                });
            }
            // On a match, temp already holds the diagonal value (cost 0).
            ++cache_iter;
            std::swap(*cache_iter, temp);
        }
    }
    return cache.back();
}
template<typename Sentence1, typename Sentence2>
inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio)

View File

@ -233,29 +233,32 @@ PYBIND11_MODULE(_rapidfuzz_cpp, m) {
py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0);
mlevenshtein.def("weighted_distance",
[](std::wstring_view s1, std::wstring_view s2){
return levenshtein::weighted_distance(s1, s2);
[](std::wstring_view s1, std::wstring_view s2, size_t insert_cost, size_t delete_cost, size_t replace_cost){
if (insert_cost == 1 && delete_cost == 1) {
if (replace_cost == 1) {
return levenshtein::distance(s1, s2);
} else if (replace_cost == 2) {
return levenshtein::weighted_distance(s1, s2);
}
}
return levenshtein::generic_distance(s1, s2, insert_cost, delete_cost, replace_cost);
},
R"pbdoc(
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein.
Opposed to the normal distance function which has a cost of 1 for all edit operations,
it uses the following costs for edit operations:
edit operation | cost
:------------- | :---
Insert | 1
Remove | 1
Replace | 2
required to change one sequence into the other according to Levenshtein with custom
costs for insertion, deletion and substitution
Args:
s1 (str): first string to compare
s2 (str): second string to compare
insert_cost (int): cost for insertions
delete_cost (int): cost for deletions
replace_cost (int): cost for substitutions
Returns:
int: weighted levenshtein distance between s1 and s2
)pbdoc",
py::arg("s1"), py::arg("s2"));
py::arg("s1"), py::arg("s2"), py::arg("insert_cost")=1, py::arg("delete_cost")=1, py::arg("replace_cost")=1);
mlevenshtein.def("normalized_weighted_distance",
[](std::wstring_view s1, std::wstring_view s2, float score_cutoff){
@ -263,6 +266,13 @@ PYBIND11_MODULE(_rapidfuzz_cpp, m) {
},
R"pbdoc(
Calculates a normalized levenshtein distance based on levenshtein.weighted_distance
It uses the following costs for edit operations:
edit operation | cost
:------------- | :---
Insert | 1
Remove | 1
Replace | 2
Args:
s1 (str): first string to compare