fix bug in mbleven implementation

This commit is contained in:
Max Bachmann 2021-03-20 12:04:12 +01:00
parent 0d84a8b933
commit 853681f7cf
5 changed files with 79 additions and 9 deletions

View File

@ -1 +1 @@
1.3.2
1.3.3

@ -1 +1 @@
Subproject commit ba0fa392db8a75e4c07ef86c3d308bf0ad2575c1
Subproject commit 5a5a0fe1c34c2833f723b02762348fd4e6b0ef06

View File

@ -3,6 +3,6 @@ rapid string matching library
"""
__author__ = "Max Bachmann"
__license__ = "MIT"
__version__ = "1.3.2"
__version__ = "1.3.3"
from rapidfuzz import process, fuzz, utils, levenshtein, string_metric

View File

@ -81,6 +81,8 @@ class RatioTest(unittest.TestCase):
self.assertAlmostEqual(fuzz.partial_ratio("physics 2 vid", "study physics physics 2"), 81.81818, places=4)
self.assertEqual(fuzz.partial_ratio("physics 2 vid", "study physics physics 2 video"), 100)
def testIssue90(self):
self.assertAlmostEqual(fuzz.partial_ratio("ax b", "a b a c b"), 75.0, places=4)
@pytest.mark.parametrize("scorer", scorers)
def test_empty_string(scorer):

View File

@ -8,6 +8,7 @@ import pytest
from rapidfuzz import fuzz, process, utils, string_metric
import random
from math import isclose
def levenshtein(s1, s2, weights=(1, 1, 1)):
"""
@ -44,7 +45,30 @@ def levenshtein(s1, s2, weights=(1, 1, 1)):
return dist[-1][-1]
def normalize_distance(dist, s1, s2, weights=(1, 1, 1)):
    """Convert a weighted Levenshtein distance into a similarity in [0, 100].

    The distance is scaled against the maximum possible weighted distance
    between strings of these lengths, which is the cheaper of:
      1. deleting all of s1 and inserting all of s2, or
      2. substituting along the shared length and paying insert/delete
         for the length surplus of the longer string.

    Returns 100 for two empty strings (max distance of 0).
    """
    insert, delete, substitute = weights
    len1, len2 = len(s1), len(s2)
    # worst case A: remove every character of s1, then add every character of s2
    rewrite_cost = len1 * delete + len2 * insert
    # worst case B: substitute over the overlap, then handle the surplus length
    if len1 > len2:
        overlap_cost = len2 * substitute + (len1 - len2) * delete
    else:
        overlap_cost = len1 * substitute + (len2 - len1) * insert
    max_dist = min(rewrite_cost, overlap_cost)
    if not max_dist:
        return 100
    return 100 - 100 * dist / max_dist
def extractOne_scorer(s1, s2, scorer, processor=None, **kwargs):
    """Score s1 against s2 via process.extractOne and return only the score."""
    best_match = process.extractOne(
        s1, [s2], processor=processor, scorer=scorer, **kwargs
    )
    return best_match[1]
def extract_scorer(s1, s2, scorer, processor=None, **kwargs):
    """Score s1 against s2 via process.extract and return the top result's score."""
    results = process.extract(
        s1, [s2], processor=processor, scorer=scorer, **kwargs
    )
    return results[0][1]
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
@ -72,6 +96,17 @@ PROCESSORS = [
utils.default_process
]
@given(s1=st.text(), s2=st.text())
@settings(max_examples=500, deadline=None)
def test_partial_ratio(s1, s2):
    """
    Smoke-test partial_ratio: currently this only checks that no exception is
    raised for arbitrary text inputs. In the future this should validate the
    result itself, but that requires a known-correct reference implementation.
    """
    fuzz.partial_ratio(s1, s2)
@given(s1=st.text(), s2=st.text())
@settings(max_examples=500, deadline=None)
def test_token_ratio(s1, s2):
@ -95,8 +130,19 @@ def test_levenshtein_word(s1, s2):
"""
Test short Levenshtein implementation against simple implementation
"""
assert string_metric.levenshtein(s1, s2) == levenshtein(s1, s2)
assert string_metric.levenshtein(s1, s2, (1,1,2)) == levenshtein(s1, s2, (1,1,2))
reference_dist = levenshtein(s1, s2)
reference_sim = normalize_distance(reference_dist, s1, s2)
assert string_metric.levenshtein(s1, s2) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
reference_dist = levenshtein(s1, s2, (1,1,2))
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
@given(s1=st.text(min_size=65), s2=st.text(min_size=65))
@ -105,8 +151,19 @@ def test_levenshtein_block(s1, s2):
"""
Test blockwise Levenshtein implementation against simple implementation
"""
assert string_metric.levenshtein(s1, s2) == levenshtein(s1, s2)
assert string_metric.levenshtein(s1, s2, (1,1,2)) == levenshtein(s1, s2, (1,1,2))
reference_dist = levenshtein(s1, s2)
reference_sim = normalize_distance(reference_dist, s1, s2)
assert string_metric.levenshtein(s1, s2) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
reference_dist = levenshtein(s1, s2, (1,1,2))
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
@given(s1=st.text(), s2=st.text())
@ -115,8 +172,19 @@ def test_levenshtein_random(s1, s2):
"""
Test mixed strings to test through all implementations of Levenshtein
"""
assert string_metric.levenshtein(s1, s2) == levenshtein(s1, s2)
assert string_metric.levenshtein(s1, s2, (1,1,2)) == levenshtein(s1, s2, (1,1,2))
reference_dist = levenshtein(s1, s2)
reference_sim = normalize_distance(reference_dist, s1, s2)
assert string_metric.levenshtein(s1, s2) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
reference_dist = levenshtein(s1, s2, (1,1,2))
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
@given(sentence=st.text())