test preprocessing function with distance scorers

Max Bachmann 2023-10-30 19:44:38 +01:00
parent 41dfce987a
commit cc1e9a11ea
14 changed files with 93 additions and 4 deletions
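For context, a minimal sketch (not part of the commit) of the behaviour under test, using the public rapidfuzz.utils.default_process and rapidfuzz.distance.Levenshtein; the utils_cpp/utils_py modules imported in the tests below are, presumably, the C++ and pure-Python builds of the same helpers:

# Sketch only: distance scorers accept a `processor` callable that is
# applied to both inputs before scoring.
from rapidfuzz import utils
from rapidfuzz.distance import Levenshtein

# default_process lowercases, replaces non-alphanumeric characters with
# whitespace and trims the ends, so the case difference disappears.
assert utils.default_process("new YORK Mets!") == "new york mets"
assert Levenshtein.distance("new york mets", "new YORK mets",
                            processor=utils.default_process) == 0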

View File

@@ -1,3 +1 @@
"C'est la vie",
"c est la vie",
>>> s2 = "cetain"

View File

@@ -111,7 +111,7 @@ repos:
  rev: "v2.2.4"
  hooks:
  - id: codespell
    exclude: ".supp$"
    exclude: ".*/test_.*.py"
    args: ["-x", ".codespell-ignore-lines"]
# Check for common shell mistakes

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import pytest
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import DamerauLevenshtein
@@ -28,3 +29,8 @@ from tests.distance.common import DamerauLevenshtein
)
def test_distance(left, right, distance):
    assert DamerauLevenshtein.distance(left, right) == distance


def testCaseInsensitive():
    assert DamerauLevenshtein.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert DamerauLevenshtein.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import pytest
from rapidfuzz import utils_cpp, utils_py
from rapidfuzz.distance import metrics_cpp, metrics_py
from tests.distance.common import Hamming
@@ -45,6 +46,12 @@ def test_disable_padding():
    with pytest.raises(ValueError, match="Sequences are not the same length."):
        Hamming.distance("aaaa", "aaaaa", pad=False)

    with pytest.raises(ValueError, match="Sequences are not the same length."):
        metrics_cpp.hamming_editops("aaaa", "aaaaa", pad=False)

    with pytest.raises(ValueError, match="Sequences are not the same length."):
        metrics_py.hamming_editops("aaaa", "aaaaa", pad=False)


def test_score_cutoff():
    """
@@ -80,3 +87,8 @@ def test_Editops():
    ops = hamming_editops("aaabaaa", "abbaaabba")
    assert ops.src_len == 7
    assert ops.dest_len == 9


def testCaseInsensitive():
    assert Hamming.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert Hamming.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0
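Aside: a small sketch of the padding behaviour that test_disable_padding exercises, written against the public Hamming scorer rather than the test helpers:

from rapidfuzz.distance import Hamming

# By default the shorter sequence is padded, so the length difference
# counts as a single edit.
assert Hamming.distance("aaaa", "aaaaa") == 1

# With pad=False, unequal lengths are an error instead.
try:
    Hamming.distance("aaaa", "aaaaa", pad=False)
except ValueError:
    pass  # "Sequences are not the same length."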

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import Indel
@@ -53,3 +54,8 @@ def test_Editops():
    ops = Indel.editops("aaabaaa", "abbaaabba")
    assert ops.src_len == 7
    assert ops.dest_len == 9


def testCaseInsensitive():
    assert Indel.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert Indel.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import pytest
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import Jaro
@@ -33,3 +34,8 @@ def test_edge_case_lengths():
"00000000000000000000000000000000000000000000000000000000000000"
    )
    assert pytest.approx(Jaro.similarity(s2, s1)) == 0.8359375


def testCaseInsensitive():
    assert pytest.approx(Jaro.similarity("new york mets", "new YORK mets", processor=utils_cpp.default_process)) == 1.0
    assert pytest.approx(Jaro.similarity("new york mets", "new YORK mets", processor=utils_py.default_process)) == 1.0

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import pytest
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import JaroWinkler
@@ -33,3 +34,14 @@ def test_edge_case_lengths():
"00000000000000000000000000000000000000000000000000000000000000"
    )
    assert pytest.approx(JaroWinkler.similarity(s2, s1)) == 0.852344


def testCaseInsensitive():
    assert (
        pytest.approx(JaroWinkler.similarity("new york mets", "new YORK mets", processor=utils_cpp.default_process))
        == 1.0
    )
    assert (
        pytest.approx(JaroWinkler.similarity("new york mets", "new YORK mets", processor=utils_py.default_process))
        == 1.0
    )
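For the similarity scorers the expectation flips: once the processor removes the case difference the strings are identical, so the score is 1.0 rather than a distance of 0. A minimal sketch with the public rapidfuzz.utils.default_process in place of the suite's utils_cpp/utils_py aliases:

from rapidfuzz import utils
from rapidfuzz.distance import Jaro, JaroWinkler

s1, s2 = "new york mets", "new YORK mets"
# Identical strings after preprocessing give a perfect similarity of 1.0.
assert Jaro.similarity(s1, s2, processor=utils.default_process) == 1.0
assert JaroWinkler.similarity(s1, s2, processor=utils.default_process) == 1.0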

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import LCSseq
@@ -41,3 +42,8 @@ def test_Editops():
    ops = LCSseq.editops("aaabaaa", "abbaaabba")
    assert ops.src_len == 7
    assert ops.dest_len == 9


def testCaseInsensitive():
    assert LCSseq.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert LCSseq.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import Levenshtein
@@ -123,3 +124,8 @@ def test_mbleven():
    assert Levenshtein.distance("0", "101", score_cutoff=1) == 2
    assert Levenshtein.distance("0", "101", score_cutoff=2) == 2
    assert Levenshtein.distance("0", "101", score_cutoff=3) == 2


def testCaseInsensitive():
    assert Levenshtein.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert Levenshtein.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0
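Note on the mbleven assertions above: with score_cutoff=1 the true distance of 2 exceeds the cutoff, so score_cutoff + 1 is returned; with the larger cutoffs the true distance is returned directly. A short sketch of that convention (example strings are mine, not from the diff):

from rapidfuzz.distance import Levenshtein

# True distance is 5; with a cutoff of 2 the scorer stops early and
# reports score_cutoff + 1.
assert Levenshtein.distance("abcde", "vwxyz") == 5
assert Levenshtein.distance("abcde", "vwxyz", score_cutoff=2) == 3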

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from rapidfuzz import utils_cpp, utils_py
from rapidfuzz.distance import metrics_cpp
from tests.distance.common import OSA
@@ -73,3 +74,8 @@ def test_simple_unicode_tests():
    s2 = "ABCD"
    assert OSA.distance(s1, s2) == 4
    assert OSA.distance(s1, s1) == 0


def testCaseInsensitive():
    assert OSA.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert OSA.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import Postfix
@@ -19,3 +20,8 @@ def test_score_cutoff():
    assert Postfix.distance("abcd", "eebcd", score_cutoff=2) == 2
    assert Postfix.distance("abcd", "eebcd", score_cutoff=1) == 2
    assert Postfix.distance("abcd", "eebcd", score_cutoff=0) == 1


def testCaseInsensitive():
    assert Postfix.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert Postfix.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from rapidfuzz import utils_cpp, utils_py
from tests.distance.common import Prefix
@@ -19,3 +20,8 @@ def test_score_cutoff():
    assert Prefix.distance("abcd", "abcee", score_cutoff=2) == 2
    assert Prefix.distance("abcd", "abcee", score_cutoff=1) == 2
    assert Prefix.distance("abcd", "abcee", score_cutoff=0) == 1


def testCaseInsensitive():
    assert Prefix.distance("new york mets", "new YORK mets", processor=utils_cpp.default_process) == 0
    assert Prefix.distance("new york mets", "new YORK mets", processor=utils_py.default_process) == 0

View File

@@ -308,7 +308,15 @@ def test_simple_unicode_tests(scorer):
    assert scorer(s1, s1) == 100


@pytest.mark.parametrize("processor", [utils_cpp.default_process, lambda s: utils_cpp.default_process(s), utils_py.default_process, lambda s: utils_py.default_process(s)])
@pytest.mark.parametrize(
    "processor",
    [
        utils_cpp.default_process,
        lambda s: utils_cpp.default_process(s),
        utils_py.default_process,
        lambda s: utils_py.default_process(s),
    ],
)
@pytest.mark.parametrize("scorer", scorers)
def test_scorer_case_insensitive(processor, scorer):
    """

View File

@@ -332,6 +332,17 @@ def test_result_order():
    assert best[0][2] == 0


def test_extract_limits():
    """
    test process.extract with special limits
    """
    bests = process.extract("test", ["tes", "tes"], limit=1, score_cutoff=100)
    assert bests == []

    bests = process.extract("test", ["te", "test"], limit=None, scorer=Levenshtein.distance)
    assert bests == [("test", 0, 1), ("te", 2, 0)]
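The same extract calls work outside the test suite; a brief usage sketch:

from rapidfuzz import process
from rapidfuzz.distance import Levenshtein

# score_cutoff=100 with the default scorer filters out the partial matches.
assert process.extract("test", ["tes", "tes"], limit=1, score_cutoff=100) == []

# With a distance scorer, results are sorted ascending, best match first.
bests = process.extract("test", ["te", "test"], limit=None, scorer=Levenshtein.distance)
assert bests == [("test", 0, 1), ("te", 2, 0)]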
def test_empty_strings():
    choices = [
        "",