Merge pull request #16 from rhasspy/generic_levenshtein
Add custom weights for Levenshtein
commit 5a759c4fa4
@@ -1,4 +1,6 @@
 .vscode/
 __pycache__/
+.idea/
 rapidfuzz.egg-info/
 dist/
+*.data

@@ -87,6 +87,9 @@ namespace levenshtein {
 template<typename CharT, typename MaxDistance=std::nullopt_t>
 std::size_t weighted_distance(string_view_vec<CharT> sentence1, string_view_vec<CharT> sentence2, MaxDistance max_distance=std::nullopt);
 
+
+size_t generic_distance(std::wstring_view source, std::wstring_view target, size_t insert_cost = 1, size_t delete_cost = 1, size_t replace_cost = 1);
+
 /**
  * Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
  * 1.0 (inclusive), where 1.0 means the sequences are the same.

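The new generic_distance overload declared above takes per-operation costs that default to 1. As a rough illustration of how it might be called (the include path and build setup are assumptions, not part of this change):

    #include <iostream>
    #include <string_view>
    #include "levenshtein.hpp"   // assumed header containing the declaration above

    int main() {
        std::wstring_view a = L"kitten";
        std::wstring_view b = L"sitting";

        // Uniform costs reproduce the classic Levenshtein distance: 3 edits.
        std::cout << levenshtein::generic_distance(a, b) << '\n';          // 3

        // Substitutions twice as expensive as insert/delete,
        // matching the existing weighted_distance behaviour: 5.
        std::cout << levenshtein::generic_distance(a, b, 1, 1, 2) << '\n'; // 5
    }
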
@@ -283,16 +286,20 @@ inline std::size_t levenshtein::distance(std::wstring_view sentence1, std::wstri
   std::iota(cache.begin(), cache.end(), 0);
 
   for (const auto &char1 : sentence1) {
-    size_t temp = cache[0]++;
+    auto cache_iter = cache.begin();
+    size_t temp = *cache_iter;
+    *cache_iter += 1;
 
     for (size_t j = 1; j < cache.size(); ++j)
     {
-      size_t p = cache[j - 1];
-      size_t r = cache[j];
+      size_t p = *cache_iter;
+      ++cache_iter;
+      size_t r = *cache_iter;
       temp = std::min(
         std::min(r, p) + 1,
         temp + (char1 == sentence2[j - 1] ? 0 : 1)
       );
-      std::swap(cache[j], temp);
+      std::swap(*cache_iter, temp);
     }
   }
   return cache.back();

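The hunk above only swaps indexed access for an iterator over the single cache row; the algorithm itself is unchanged. For readers unfamiliar with the rolling-row formulation, here is a self-contained sketch of the same idea with unit costs (written independently of the library, so names and style differ):

    #include <algorithm>
    #include <numeric>
    #include <string_view>
    #include <vector>

    // cache[j] holds the edit distance between the processed prefix of s1
    // and the first j characters of s2; only one row is kept in memory.
    std::size_t unit_distance(std::wstring_view s1, std::wstring_view s2) {
        std::vector<std::size_t> cache(s2.size() + 1);
        std::iota(cache.begin(), cache.end(), 0);   // empty s1 prefix: j insertions

        for (wchar_t c1 : s1) {
            std::size_t diag = cache[0]++;          // top-left value, then extend column 0
            for (std::size_t j = 1; j < cache.size(); ++j) {
                std::size_t up = cache[j];
                cache[j] = std::min({up + 1,                               // delete c1
                                     cache[j - 1] + 1,                     // insert s2[j-1]
                                     diag + (c1 == s2[j - 1] ? 0 : 1)});   // substitute or match
                diag = up;
            }
        }
        return cache.back();
    }
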
@@ -481,6 +488,44 @@ inline std::size_t levenshtein::weighted_distance_impl(std::basic_string_view<Ch
 }
 
+
+inline size_t levenshtein::generic_distance(std::wstring_view sentence1, std::wstring_view sentence2,
+                                            size_t insert_cost, size_t delete_cost, size_t replace_cost)
+{
+  remove_common_affix(sentence1, sentence2);
+  if (sentence1.size() > sentence2.size()) {
+    std::swap(sentence1, sentence2);
+    std::swap(insert_cost, delete_cost);
+  }
+
+  const size_t min_size = sentence1.size();
+  std::vector<size_t> cache(sentence1.size() + 1);
+
+  cache[0] = 0;
+  for (size_t i = 1; i < cache.size(); ++i) {
+    cache[i] = cache[i - 1] + delete_cost;
+  }
+
+  for (const auto &char2 : sentence2) {
+    auto cache_iter = cache.begin();
+    size_t temp = *cache_iter;
+    *cache_iter += insert_cost;
+
+    for (const auto &char1 : sentence1) {
+      if (char1 != char2) {
+        temp = std::min({
+          *cache_iter + delete_cost,
+          *(cache_iter + 1) + insert_cost,
+          temp + replace_cost
+        });
+      }
+      ++cache_iter;
+      std::swap(*cache_iter, temp);
+    }
+  }
+
+  return cache.back();
+}
+
 
 template<typename Sentence1, typename Sentence2>
 inline float levenshtein::normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2, float min_ratio)

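The implementation keeps one cache row indexed by positions in the shorter string: deletions from the source charge delete_cost, insertions charge insert_cost, and swapping the inputs also swaps those two costs so the result is unaffected. A few hand-checkable cases that follow from this definition (header name assumed):

    #include <cassert>
    #include "levenshtein.hpp"   // assumed header for levenshtein::generic_distance

    int main() {
        // One insertion turns "abc" into "abcd", so only insert_cost is paid.
        assert(levenshtein::generic_distance(L"abc", L"abcd", 5, 1, 1) == 5);

        // One deletion turns "abcd" into "abc", so only delete_cost is paid.
        assert(levenshtein::generic_distance(L"abcd", L"abc", 1, 7, 1) == 7);

        // If a substitution costs more than a delete plus an insert,
        // the cheaper delete+insert pair wins: 1 + 1 = 2 rather than 3.
        assert(levenshtein::generic_distance(L"a", L"b", 1, 1, 3) == 2);
    }
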
@@ -233,29 +233,32 @@ PYBIND11_MODULE(_rapidfuzz_cpp, m) {
         py::arg("s1"), py::arg("s2"), py::arg("score_cutoff") = 0);
 
     mlevenshtein.def("weighted_distance",
-        [](std::wstring_view s1, std::wstring_view s2){
+        [](std::wstring_view s1, std::wstring_view s2, size_t insert_cost, size_t delete_cost, size_t replace_cost){
+            if (insert_cost == 1 && delete_cost == 1) {
+                if (replace_cost == 1) {
+                    return levenshtein::distance(s1, s2);
+                } else if (replace_cost == 2) {
+                    return levenshtein::weighted_distance(s1, s2);
+                }
+            }
+            return levenshtein::generic_distance(s1, s2, insert_cost, delete_cost, replace_cost);
         },
         R"pbdoc(
             Calculates the minimum number of insertions, deletions, and substitutions
-            required to change one sequence into the other according to Levenshtein.
-            Opposed to the normal distance function which has a cost of 1 for all edit operations,
-            it uses the following costs for edit operations:
-
-            edit operation | cost
-            :------------- | :---
-            Insert | 1
-            Remove | 1
-            Replace | 2
+            required to change one sequence into the other according to Levenshtein, using custom
+            costs for insertion, deletion, and substitution.
 
             Args:
                 s1 (str): first string to compare
                 s2 (str): second string to compare
+                insert_cost (int): cost for insertions
+                delete_cost (int): cost for deletions
+                replace_cost (int): cost for substitutions
 
             Returns:
                 int: weighted levenshtein distance between s1 and s2
         )pbdoc",
-        py::arg("s1"), py::arg("s2"));
+        py::arg("s1"), py::arg("s2"), py::arg("insert_cost")=1, py::arg("delete_cost")=1, py::arg("replace_cost")=1);
 
     mlevenshtein.def("normalized_weighted_distance",
         [](std::wstring_view s1, std::wstring_view s2, float score_cutoff){

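The lambda above short-circuits to the specialised routines when the costs are 1/1/1 (plain distance) or 1/1/2 (the indel-based weighted_distance), and only otherwise falls back to generic_distance; the three functions are expected to agree at those cost settings. A quick consistency check along those lines (header name assumed):

    #include <cassert>
    #include <string_view>
    #include "levenshtein.hpp"   // assumed header for the levenshtein:: functions

    int main() {
        std::wstring_view s1 = L"lewenstein";
        std::wstring_view s2 = L"levenshtein";

        // Unit costs should reproduce the plain distance().
        assert(levenshtein::generic_distance(s1, s2, 1, 1, 1)
               == levenshtein::distance(s1, s2));

        // Costs 1/1/2 should reproduce weighted_distance(), where a
        // substitution counts as one deletion plus one insertion.
        assert(levenshtein::generic_distance(s1, s2, 1, 1, 2)
               == levenshtein::weighted_distance(s1, s2));
    }
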
@@ -263,6 +266,13 @@ PYBIND11_MODULE(_rapidfuzz_cpp, m) {
         },
         R"pbdoc(
             Calculates a normalized levenshtein distance based on levenshtein.weighted_distance
+            It uses the following costs for edit operations:
+
+            edit operation | cost
+            :------------- | :---
+            Insert | 1
+            Remove | 1
+            Replace | 2
 
             Args:
                 s1 (str): first string to compare

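For completeness, normalized_weighted_distance scales the weighted distance into a score between 0.0 and 1.0, where 1.0 means the inputs are identical (as its docstring states). A trivially checkable consequence of that documented behaviour (header name assumed; 0.0f matches the binding's default score_cutoff):

    #include <cassert>
    #include <string_view>
    #include "levenshtein.hpp"   // assumed header

    int main() {
        std::wstring_view s = L"rapidfuzz";
        // Identical inputs must score exactly 1.0.
        assert(levenshtein::normalized_weighted_distance(s, s, 0.0f) == 1.0f);
    }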