From 5946aeb1bc444cc596fbffff765300a46c5cfdbd Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sun, 8 Jan 2023 20:24:27 +0100 Subject: [PATCH] fix handling of non symmetric scorers in pure python mode --- CHANGELOG.rst | 6 +++ src/rapidfuzz/_utils.py | 28 +++++++---- src/rapidfuzz/distance/Levenshtein.py | 67 +++++++++++++++++++++++++-- src/rapidfuzz/fuzz.py | 6 ++- src/rapidfuzz/fuzz_cpp.pyx | 5 -- src/rapidfuzz/process_cpp_impl.pyx | 2 + src/rapidfuzz/process_py.py | 16 ++++++- tests/test_process.py | 17 ++++++- 8 files changed, 124 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ef3ee39..3dba39a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,12 @@ Changelog --------- +[2.14.0] - +^^^^^^^^^^^^^^^^^^^^^ +Fixed +~~~~~~~ +- fix handling of non symmetric scorers in pure python version of ``process.cdist`` + [2.13.7] - 2022-12-20 ^^^^^^^^^^^^^^^^^^^^^ Fixed diff --git a/src/rapidfuzz/_utils.py b/src/rapidfuzz/_utils.py index edc0261..cb80630 100644 --- a/src/rapidfuzz/_utils.py +++ b/src/rapidfuzz/_utils.py @@ -18,7 +18,7 @@ def _get_scorer_flags_distance(**_kwargs: Any) -> dict[str, Any]: return { "optimal_score": 0, "worst_score": 2**63 - 1, - "flags": ScorerFlag.RESULT_I64, + "flags": ScorerFlag.RESULT_I64 | ScorerFlag.SYMMETRIC, } @@ -26,7 +26,23 @@ def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]: return { "optimal_score": 2**63 - 1, "worst_score": 0, - "flags": ScorerFlag.RESULT_I64, + "flags": ScorerFlag.RESULT_I64 | ScorerFlag.SYMMETRIC, + } + + +def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]: + return { + "optimal_score": 0, + "worst_score": 1, + "flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC, + } + + +def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]: + return { + "optimal_score": 1, + "worst_score": 0, + "flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC, } @@ -40,14 +56,6 @@ def is_none(s: Any) -> bool: return False -def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]: - return {"optimal_score": 0, "worst_score": 1, "flags": ScorerFlag.RESULT_F64} - - -def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]: - return {"optimal_score": 1, "worst_score": 0, "flags": ScorerFlag.RESULT_F64} - - def _create_scorer( func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]] ): diff --git a/src/rapidfuzz/distance/Levenshtein.py b/src/rapidfuzz/distance/Levenshtein.py index 3c83f5e..3d05398 100644 --- a/src/rapidfuzz/distance/Levenshtein.py +++ b/src/rapidfuzz/distance/Levenshtein.py @@ -9,12 +9,71 @@ substitutions required to transform s1 into s2. from __future__ import annotations -from rapidfuzz._utils import default_distance_attribute as _dist_attr -from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr -from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr -from rapidfuzz._utils import default_similarity_attribute as _sim_attr +from rapidfuzz._utils import ScorerFlag as _ScorerFlag from rapidfuzz._utils import fallback_import as _fallback_import + +def _get_scorer_flags_distance( + weights: tuple[int, int, int] | None = (1, 1, 1) +) -> dict[str, Any]: + flags = _ScorerFlag.RESULT_I64 + if weights is None or weights[0] == weights[1]: + flags |= _ScorerFlag.SYMMETRIC + + return { + "optimal_score": 0, + "worst_score": 2**63 - 1, + "flags": flags, + } + + +def _get_scorer_flags_similarity( + weights: tuple[int, int, int] | None = (1, 1, 1) +) -> dict[str, Any]: + flags = _ScorerFlag.RESULT_I64 + if weights is None or weights[0] == weights[1]: + flags |= _ScorerFlag.SYMMETRIC + + return { + "optimal_score": 2**63 - 1, + "worst_score": 0, + "flags": flags, + } + + +def _get_scorer_flags_normalized_distance( + weights: tuple[int, int, int] | None = (1, 1, 1) +) -> dict[str, Any]: + flags = _ScorerFlag.RESULT_F64 + if weights is None or weights[0] == weights[1]: + flags |= _ScorerFlag.SYMMETRIC + + return {"optimal_score": 0, "worst_score": 1, "flags": flags} + + +def _get_scorer_flags_normalized_similarity( + weights: tuple[int, int, int] | None = (1, 1, 1) +) -> dict[str, Any]: + flags = _ScorerFlag.RESULT_F64 + if weights is None or weights[0] == weights[1]: + flags |= _ScorerFlag.SYMMETRIC + + return {"optimal_score": 1, "worst_score": 0, "flags": flags} + + +_dist_attr: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_distance +} +_sim_attr: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_similarity +} +_norm_dist_attr: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_normalized_distance +} +_norm_sim_attr: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_normalized_similarity +} + _mod = "rapidfuzz.distance.Levenshtein" distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) diff --git a/src/rapidfuzz/fuzz.py b/src/rapidfuzz/fuzz.py index dd01b75..34623fe 100644 --- a/src/rapidfuzz/fuzz.py +++ b/src/rapidfuzz/fuzz.py @@ -10,7 +10,11 @@ from rapidfuzz._utils import fallback_import as _fallback_import def _get_scorer_flags_fuzz(**_kwargs: Any) -> dict[str, Any]: - return {"optimal_score": 100, "worst_score": 0, "flags": _ScorerFlag.RESULT_F64} + return { + "optimal_score": 100, + "worst_score": 0, + "flags": _ScorerFlag.RESULT_F64 | _ScorerFlag.SYMMETRIC, + } _fuzz_attribute: dict[str, Callable[..., dict[str, Any]]] = { diff --git a/src/rapidfuzz/fuzz_cpp.pyx b/src/rapidfuzz/fuzz_cpp.pyx index 99c51c8..14f864e 100644 --- a/src/rapidfuzz/fuzz_cpp.pyx +++ b/src/rapidfuzz/fuzz_cpp.pyx @@ -221,11 +221,6 @@ cdef bool GetScorerFlagsFuzzRatio(const RF_Kwargs* self, RF_ScorerFlags* scorer_ scorer_flags.worst_score.f64 = 0 return True -def _GetScorerFlagsSimilarity(**kwargs): - return {"optimal_score": 100, "worst_score": 0, "flags": (1 << 5)} - -cdef dict FuzzContextPy = CreateScorerContextPy(_GetScorerFlagsSimilarity) - cdef RF_Scorer RatioContext = CreateScorerContext(NoKwargsInit, GetScorerFlagsFuzzRatio, RatioInit) ratio._RF_Scorer = PyCapsule_New(&RatioContext, NULL, NULL) diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index b89d5ab..c02d8eb 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -1470,6 +1470,8 @@ cdef Matrix cdist_single_list( @cython.boundscheck(False) @cython.wraparound(False) cdef cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers, dict kwargs): + # todo this should handle two similar sequences more efficiently + proc_queries = preprocess_py(queries, processor) proc_choices = preprocess_py(choices, processor) cdef double score diff --git a/src/rapidfuzz/process_py.py b/src/rapidfuzz/process_py.py index ab051e6..4109712 100644 --- a/src/rapidfuzz/process_py.py +++ b/src/rapidfuzz/process_py.py @@ -520,8 +520,10 @@ def extract( return heapq.nsmallest(limit, result_iter, key=lambda i: i[1]) -if TYPE_CHECKING: +try: import numpy as np +except: + pass def _dtype_to_type_num( @@ -544,6 +546,16 @@ def _dtype_to_type_num( return np.float32 +def _is_symmetric(scorer: Callable[..., int | float], **kwargs: dict[str, Any]) -> bool: + params = getattr(scorer, "_RF_ScorerPy", None) + if params is not None: + flags = params["get_scorer_flags"](**kwargs) + if flags["flags"] & ScorerFlag.SYMMETRIC: + return True + + return False + + def cdist( queries: Collection[Sequence[Hashable] | None], choices: Collection[Sequence[Hashable] | None], @@ -616,7 +628,7 @@ def cdist( dtype = _dtype_to_type_num(dtype, scorer, **kwargs) results = np.zeros((len(queries), len(choices)), dtype=dtype) - if queries is choices: + if queries is choices and _is_symmetric(scorer, **kwargs): if processor is None: proc_queries = list(queries) else: diff --git a/tests/test_process.py b/tests/test_process.py index 6f72679..044782b 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -1,6 +1,7 @@ import pytest from rapidfuzz import fuzz, process_cpp, process_py +from rapidfuzz.distance import Levenshtein def wrapped(func): @@ -39,12 +40,14 @@ class process: @staticmethod def cdist(*args, **kwargs): + import numpy as np + res1 = process_cpp.cdist(*args, **kwargs) res2 = process_py.cdist(*args, **kwargs) assert res1.dtype == res2.dtype assert res1.shape == res2.shape if res1.size and res2.size: - assert res1 == res2 + assert np.array_equal(res1, res2) return res1 @@ -382,3 +385,15 @@ def test_wrapped_function(scorer): assert process.cdist(["test"], [float("nan")], scorer=scorer)[0, 0] == 100 assert process.cdist(["test"], [None], scorer=scorer)[0, 0] == 100 assert process.cdist(["test"], ["tes"], scorer=scorer)[0, 0] == 100 + + +def test_cdist_not_symmetric(): + pytest.importorskip("numpy") + import numpy as np + + strings = ["test", "test2"] + expected_res = np.array([[0, 1], [2, 0]]) + assert np.array_equal( + process.cdist(strings, strings, scorer=Levenshtein.distance, weights=(1, 2, 1)), + expected_res, + )