From 853681f7cf4b87d83c62f292726dc25d8dfea605 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sat, 20 Mar 2021 12:04:12 +0100 Subject: [PATCH] fix bug in mbleven implementation --- VERSION | 2 +- src/rapidfuzz-cpp | 2 +- src/rapidfuzz/__init__.py | 2 +- tests/test_fuzz.py | 2 + tests/test_hypothesis.py | 80 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 9 deletions(-) diff --git a/VERSION b/VERSION index d5e98f7..785cda8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.3.2 \ No newline at end of file +1.3.3 \ No newline at end of file diff --git a/src/rapidfuzz-cpp b/src/rapidfuzz-cpp index ba0fa39..5a5a0fe 160000 --- a/src/rapidfuzz-cpp +++ b/src/rapidfuzz-cpp @@ -1 +1 @@ -Subproject commit ba0fa392db8a75e4c07ef86c3d308bf0ad2575c1 +Subproject commit 5a5a0fe1c34c2833f723b02762348fd4e6b0ef06 diff --git a/src/rapidfuzz/__init__.py b/src/rapidfuzz/__init__.py index 58d0a05..8b5d9b9 100644 --- a/src/rapidfuzz/__init__.py +++ b/src/rapidfuzz/__init__.py @@ -3,6 +3,6 @@ rapid string matching library """ __author__ = "Max Bachmann" __license__ = "MIT" -__version__ = "1.3.2" +__version__ = "1.3.3" from rapidfuzz import process, fuzz, utils, levenshtein, string_metric diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py index 2db72ea..7ae9a0a 100644 --- a/tests/test_fuzz.py +++ b/tests/test_fuzz.py @@ -81,6 +81,8 @@ class RatioTest(unittest.TestCase): self.assertAlmostEqual(fuzz.partial_ratio("physics 2 vid", "study physics physics 2"), 81.81818, places=4) self.assertEqual(fuzz.partial_ratio("physics 2 vid", "study physics physics 2 video"), 100) + def testIssue90(self): + self.assertAlmostEqual(fuzz.partial_ratio("ax b", "a b a c b"), 75.0, places=4) @pytest.mark.parametrize("scorer", scorers) def test_empty_string(scorer): diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py index ec874fb..a518b18 100644 --- a/tests/test_hypothesis.py +++ b/tests/test_hypothesis.py @@ -8,6 +8,7 @@ import pytest from rapidfuzz import fuzz, 
process, utils, string_metric import random +from math import isclose def levenshtein(s1, s2, weights=(1, 1, 1)): """ @@ -44,7 +45,30 @@ def levenshtein(s1, s2, weights=(1, 1, 1)): return dist[-1][-1] +def normalize_distance(dist, s1, s2, weights=(1, 1, 1)): + insert, delete, substitute = weights + if len(s1) > len(s2): + max_dist = min([ + # delete all characters from s1 and insert all characters from s2 + len(s1) * delete + len(s2) * insert, + # replace all characters and delete the remaining characters from s1 + len(s2) * substitute + (len(s1) - len(s2)) * delete + ]) + else: + max_dist = min([ + # delete all characters from s1 and insert all characters from s2 + len(s1) * delete + len(s2) * insert, + # replace all characters and insert the remaining characters into s1 + len(s1) * substitute + (len(s2) - len(s1)) * insert + ]) + return 100 - 100 * dist / max_dist if max_dist else 100 + +def extractOne_scorer(s1, s2, scorer, processor=None, **kwargs): + return process.extractOne(s1, [s2], processor=processor, scorer=scorer, **kwargs)[1] + +def extract_scorer(s1, s2, scorer, processor=None, **kwargs): + return process.extract(s1, [s2], processor=processor, scorer=scorer, **kwargs)[0][1] HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation @@ -72,6 +96,17 @@ PROCESSORS = [ utils.default_process ] +@given(s1=st.text(), s2=st.text()) +@settings(max_examples=500, deadline=None) +def test_partial_ratio(s1, s2): + """ + test partial_ratio. Currently this only tests that no exceptions are raised. + In the future this should validate the implementation. However this requires + a correct implementation to be found. 
+ """ + fuzz.partial_ratio(s1, s2) + + @given(s1=st.text(), s2=st.text()) @settings(max_examples=500, deadline=None) def test_token_ratio(s1, s2): @@ -95,8 +130,19 @@ def test_levenshtein_word(s1, s2): """ Test short Levenshtein implementation against simple implementation """ - assert string_metric.levenshtein(s1, s2) == levenshtein(s1, s2) - assert string_metric.levenshtein(s1, s2, (1,1,2)) == levenshtein(s1, s2, (1,1,2)) + reference_dist = levenshtein(s1, s2) + reference_sim = normalize_distance(reference_dist, s1, s2) + assert string_metric.levenshtein(s1, s2) == reference_dist + assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim) + assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim) + assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim) + + reference_dist = levenshtein(s1, s2, (1,1,2)) + reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2)) + assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist + assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim) + assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim) + assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim) @given(s1=st.text(min_size=65), s2=st.text(min_size=65)) @@ -105,8 +151,19 @@ def test_levenshtein_block(s1, s2): """ Test blockwise Levenshtein implementation against simple implementation """ - assert string_metric.levenshtein(s1, s2) == levenshtein(s1, s2) - assert string_metric.levenshtein(s1, s2, (1,1,2)) == levenshtein(s1, s2, (1,1,2)) + reference_dist = levenshtein(s1, s2) + reference_sim = normalize_distance(reference_dist, s1, s2) + assert string_metric.levenshtein(s1, s2) == reference_dist + assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim) + assert isclose(extractOne_scorer(s1, s2, 
string_metric.normalized_levenshtein), reference_sim) + assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim) + + reference_dist = levenshtein(s1, s2, (1,1,2)) + reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2)) + assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist + assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim) + assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim) + assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim) @given(s1=st.text(), s2=st.text()) @@ -115,8 +172,19 @@ def test_levenshtein_random(s1, s2): """ Test mixed strings to test through all implementations of Levenshtein """ - assert string_metric.levenshtein(s1, s2) == levenshtein(s1, s2) - assert string_metric.levenshtein(s1, s2, (1,1,2)) == levenshtein(s1, s2, (1,1,2)) + reference_dist = levenshtein(s1, s2) + reference_sim = normalize_distance(reference_dist, s1, s2) + assert string_metric.levenshtein(s1, s2) == reference_dist + assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim) + assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim) + assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim) + + reference_dist = levenshtein(s1, s2, (1,1,2)) + reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2)) + assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist + assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim) + assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim) + assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim) @given(sentence=st.text())