From cc1e9a11ea80a425e61da5ec5b979db0835891f9 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Mon, 30 Oct 2023 19:44:38 +0100 Subject: [PATCH] test preprocessing function with distance scorers --- .codespell-ignore-lines | 2 -- .pre-commit-config.yaml | 2 +- tests/distance/test_DamerauLevenshtein.py | 6 ++++++ tests/distance/test_Hamming.py | 12 ++++++++++++ tests/distance/test_Indel.py | 6 ++++++ tests/distance/test_Jaro.py | 6 ++++++ tests/distance/test_JaroWinkler.py | 12 ++++++++++++ tests/distance/test_LCSseq.py | 6 ++++++ tests/distance/test_Levenshtein.py | 6 ++++++ tests/distance/test_OSA.py | 6 ++++++ tests/distance/test_Postfix.py | 6 ++++++ tests/distance/test_Prefix.py | 6 ++++++ tests/test_fuzz.py | 10 +++++++++- tests/test_process.py | 11 +++++++++++ 14 files changed, 93 insertions(+), 4 deletions(-) diff --git a/.codespell-ignore-lines b/.codespell-ignore-lines index 618378b..7c08e7c 100644 --- a/.codespell-ignore-lines +++ b/.codespell-ignore-lines @@ -1,3 +1 @@ - "C'est la vie", - "c est la vie", >>> s2 = "cetain" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d6b77e..52c8fbe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -111,7 +111,7 @@ repos: rev: "v2.2.4" hooks: - id: codespell - exclude: ".supp$" + exclude: ".*/test_.*.py" args: ["-x", ".codespell-ignore-lines"] # Check for common shell mistakes diff --git a/tests/distance/test_DamerauLevenshtein.py b/tests/distance/test_DamerauLevenshtein.py index 7b9eca9..55f2b20 100644 --- a/tests/distance/test_DamerauLevenshtein.py +++ b/tests/distance/test_DamerauLevenshtein.py @@ -2,6 +2,7 @@ from __future__ import annotations import pytest +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import DamerauLevenshtein @@ -28,3 +29,8 @@ from tests.distance.common import DamerauLevenshtein ) def test_distance(left, right, distance): assert DamerauLevenshtein.distance(left, right) == distance + + +def testCaseInsensitive(): + assert DamerauLevenshtein.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert DamerauLevenshtein.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_Hamming.py b/tests/distance/test_Hamming.py index 26fb006..7e85871 100644 --- a/tests/distance/test_Hamming.py +++ b/tests/distance/test_Hamming.py @@ -2,6 +2,7 @@ from __future__ import annotations import pytest +from rapidfuzz import utils_cpp, utils_py from rapidfuzz.distance import metrics_cpp, metrics_py from tests.distance.common import Hamming @@ -45,6 +46,12 @@ def test_disable_padding(): with pytest.raises(ValueError, match="Sequences are not the same length."): Hamming.distance("aaaa", "aaaaa", pad=False) + with pytest.raises(ValueError, match="Sequences are not the same length."): + metrics_cpp.hamming_editops("aaaa", "aaaaa", pad=False) + + with pytest.raises(ValueError, match="Sequences are not the same length."): + metrics_py.hamming_editops("aaaa", "aaaaa", pad=False) + def test_score_cutoff(): """ @@ -80,3 +87,8 @@ def test_Editops(): ops = hamming_editops("aaabaaa", "abbaaabba") assert ops.src_len == 7 assert ops.dest_len == 9 + + +def testCaseInsensitive(): + assert Hamming.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert Hamming.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_Indel.py b/tests/distance/test_Indel.py index 4d9ec81..f07ce22 100644 --- a/tests/distance/test_Indel.py +++ b/tests/distance/test_Indel.py @@ -1,5 +1,6 @@ from __future__ import annotations +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import Indel @@ -53,3 +54,8 @@ def test_Editops(): ops = Indel.editops("aaabaaa", "abbaaabba") assert ops.src_len == 7 assert ops.dest_len == 9 + + +def testCaseInsensitive(): + assert Indel.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert Indel.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_Jaro.py b/tests/distance/test_Jaro.py index f333a04..805a3e2 100644 --- a/tests/distance/test_Jaro.py +++ b/tests/distance/test_Jaro.py @@ -2,6 +2,7 @@ from __future__ import annotations import pytest +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import Jaro @@ -33,3 +34,8 @@ def test_edge_case_lengths(): "00000000000000000000000000000000000000000000000000000000000000" ) assert pytest.approx(Jaro.similarity(s2, s1)) == 0.8359375 + + +def testCaseInsensitive(): + assert pytest.approx(Jaro.similarity("new york mets", "new YORK mets", processor=utils_cpp.default_process)) == 1.0 + assert pytest.approx(Jaro.similarity("new york mets", "new YORK mets", processor=utils_py.default_process)) == 1.0 diff --git a/tests/distance/test_JaroWinkler.py b/tests/distance/test_JaroWinkler.py index 376d45f..4caadfe 100644 --- a/tests/distance/test_JaroWinkler.py +++ b/tests/distance/test_JaroWinkler.py @@ -2,6 +2,7 @@ from __future__ import annotations import pytest +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import JaroWinkler @@ -33,3 +34,14 @@ def test_edge_case_lengths(): "00000000000000000000000000000000000000000000000000000000000000" ) assert pytest.approx(JaroWinkler.similarity(s2, s1)) == 0.852344 + + +def testCaseInsensitive(): + assert ( + pytest.approx(JaroWinkler.similarity("new york mets", "new YORK mets", processor=utils_cpp.default_process)) + == 1.0 + ) + assert ( + pytest.approx(JaroWinkler.similarity("new york mets", "new YORK mets", processor=utils_py.default_process)) + == 1.0 + ) diff --git a/tests/distance/test_LCSseq.py b/tests/distance/test_LCSseq.py index 420048f..5c93db7 100644 --- a/tests/distance/test_LCSseq.py +++ b/tests/distance/test_LCSseq.py @@ -1,5 +1,6 @@ from __future__ import annotations +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import LCSseq @@ -41,3 +42,8 @@ def test_Editops(): ops = LCSseq.editops("aaabaaa", "abbaaabba") assert ops.src_len == 7 assert ops.dest_len == 9 + + +def testCaseInsensitive(): + assert LCSseq.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert LCSseq.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_Levenshtein.py b/tests/distance/test_Levenshtein.py index 8f1600c..2e2ab78 100644 --- a/tests/distance/test_Levenshtein.py +++ b/tests/distance/test_Levenshtein.py @@ -1,5 +1,6 @@ from __future__ import annotations +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import Levenshtein @@ -123,3 +124,8 @@ def test_mbleven(): assert Levenshtein.distance("0", "101", score_cutoff=1) == 2 assert Levenshtein.distance("0", "101", score_cutoff=2) == 2 assert Levenshtein.distance("0", "101", score_cutoff=3) == 2 + + +def testCaseInsensitive(): + assert Levenshtein.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert Levenshtein.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_OSA.py b/tests/distance/test_OSA.py index e5c32bc..6f1c30c 100644 --- a/tests/distance/test_OSA.py +++ b/tests/distance/test_OSA.py @@ -1,5 +1,6 @@ from __future__ import annotations +from rapidfuzz import utils_cpp, utils_py from rapidfuzz.distance import metrics_cpp from tests.distance.common import OSA @@ -73,3 +74,8 @@ def test_simple_unicode_tests(): s2 = "ABCD" assert OSA.distance(s1, s2) == 4 assert OSA.distance(s1, s1) == 0 + + +def testCaseInsensitive(): + assert OSA.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert OSA.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_Postfix.py b/tests/distance/test_Postfix.py index 471f454..770465b 100644 --- a/tests/distance/test_Postfix.py +++ b/tests/distance/test_Postfix.py @@ -1,5 +1,6 @@ from __future__ import annotations +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import Postfix @@ -19,3 +20,8 @@ def test_score_cutoff(): assert Postfix.distance("abcd", "eebcd", score_cutoff=2) == 2 assert Postfix.distance("abcd", "eebcd", score_cutoff=1) == 2 assert Postfix.distance("abcd", "eebcd", score_cutoff=0) == 1 + + +def testCaseInsensitive(): + assert Postfix.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert Postfix.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/distance/test_Prefix.py b/tests/distance/test_Prefix.py index d6bc8c0..0074bf6 100644 --- a/tests/distance/test_Prefix.py +++ b/tests/distance/test_Prefix.py @@ -1,5 +1,6 @@ from __future__ import annotations +from rapidfuzz import utils_cpp, utils_py from tests.distance.common import Prefix @@ -19,3 +20,8 @@ def test_score_cutoff(): assert Prefix.distance("abcd", "abcee", score_cutoff=2) == 2 assert Prefix.distance("abcd", "abcee", score_cutoff=1) == 2 assert Prefix.distance("abcd", "abcee", score_cutoff=0) == 1 + + +def testCaseInsensitive(): + assert Prefix.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0 + assert Prefix.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0 diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py index ed400da..88fb64f 100644 --- a/tests/test_fuzz.py +++ b/tests/test_fuzz.py @@ -308,7 +308,15 @@ def test_simple_unicode_tests(scorer): assert scorer(s1, s1) == 100 -@pytest.mark.parametrize("processor", [utils_cpp.default_process, lambda s: utils_cpp.default_process(s), utils_py.default_process, lambda s: utils_py.default_process(s)]) +@pytest.mark.parametrize( + "processor", + [ + utils_cpp.default_process, + lambda s: utils_cpp.default_process(s), + utils_py.default_process, + lambda s: utils_py.default_process(s), + ], +) @pytest.mark.parametrize("scorer", scorers) def test_scorer_case_insensitive(processor, scorer): """ diff --git a/tests/test_process.py b/tests/test_process.py index 5d3c244..5f4dc16 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -332,6 +332,17 @@ def test_result_order(): assert best[0][2] == 0 +def test_extract_limits(): + """ + test process.extract with special limits + """ + bests = process.extract("test", ["tes", "tes"], limit=1, score_cutoff=100) + assert bests == [] + + bests = process.extract("test", ["te", "test"], limit=None, scorer=Levenshtein.distance) + assert bests == [("test", 0, 1), ("te", 2, 0)] + + def test_empty_strings(): choices = [ "",