From f69be93ca84e28354d16c874cb0b2d6407f238cd Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sun, 18 Sep 2022 16:17:29 +0200 Subject: [PATCH] release v2.10.0 --- CHANGELOG.md | 16 ++++++++-- CMakeLists.txt | 2 +- docs/conf.py | 2 +- extern/rapidfuzz-cpp | 2 +- setup.py | 2 +- src/rapidfuzz/__init__.py | 2 +- src/rapidfuzz/distance/Hamming.py | 2 ++ src/rapidfuzz/distance/Hamming.pyi | 13 ++++++++ src/rapidfuzz/distance/Hamming_cpp.py | 2 ++ src/rapidfuzz/distance/Hamming_py.py | 43 ++++++++++++++++++++++++++ src/rapidfuzz/distance/Hamming_py.pyi | 13 ++++++++ src/rapidfuzz/distance/metrics.hpp | 7 +++++ src/rapidfuzz/distance/metrics_cpp.pyi | 13 +++++++- src/rapidfuzz/distance/metrics_cpp.pyx | 20 ++++++++++++ src/rapidfuzz/utils.hpp | 1 + 15 files changed, 131 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a35d26..470b90e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ ## Changelog +### [2.10.0] - 2022-09-18 +#### Added +- add editops to hamming distance + +#### Performance +- strip common affix in osa distance + +#### Fixed +- ignore missing pandas in Python3.11 tests + ### [2.9.0] - 2022-09-16 #### Added - add optimal string alignment (OSA) @@ -151,7 +161,7 @@ #### Changed - add tests to sdist - remove cython dependency for sdist - + ### [2.0.11] - 2022-04-23 #### Changed - relax version requirements of dependencies to simplify packaging @@ -538,7 +548,7 @@ The old algorithm is used again until this bug is fixed. ### [0.11.2] - 2020-09-12 #### Added -- added rapidfuzz.\_\_author\_\_, rapidfuzz.\_\_license\_\_ and rapidfuzz.\_\_version\_\_ +- added rapidfuzz.\_\_author\_\_, rapidfuzz.\_\_license\_\_ and rapidfuzz.\_\_version\_\_ ### [0.11.1] - 2020-09-01 #### Fixed @@ -546,7 +556,7 @@ The old algorithm is used again until this bug is fixed. ### [0.11.0] - 2020-08-22 #### Changed -- support for python 2.7 added #40 +- support for python 2.7 added #40 - add wheels for python2.7 (both pypy and cpython) on MacOS and Linux ### [0.10.0] - 2020-08-17 diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bf7d67..aae90d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ else() add_library(Taskflow::Taskflow ALIAS Taskflow) endif() -find_package(rapidfuzz 1.5.0 QUIET) +find_package(rapidfuzz 1.7.0 QUIET) if (rapidfuzz_FOUND) message("Using system supplied version of rapidfuzz-cpp") else() diff --git a/docs/conf.py b/docs/conf.py index d0dd701..db7be25 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ copyright = '2021, Max Bachmann' author = 'Max Bachmann' # The full version, including alpha/beta/rc tags -release = '2.9.0' +release = '2.10.0' # -- General configuration --------------------------------------------------- diff --git a/extern/rapidfuzz-cpp b/extern/rapidfuzz-cpp index 06c5821..75e1075 160000 --- a/extern/rapidfuzz-cpp +++ b/extern/rapidfuzz-cpp @@ -1 +1 @@ -Subproject commit 06c582124a33f4642132137c621af8abad3dea0e +Subproject commit 75e10756124a2805752d8f4713918466cd67902f diff --git a/setup.py b/setup.py index 65a67de..85d3f46 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ with open('README.md', 'rt', encoding="utf8") as f: setup_args = { "name": "rapidfuzz", - "version": "2.9.0", + "version": "2.10.0", "install_requires": ["jarowinkler >= 1.2.0, < 2.0.0"], "extras_require": {'full': ['numpy']}, "url": "https://github.com/maxbachmann/RapidFuzz", diff --git a/src/rapidfuzz/__init__.py b/src/rapidfuzz/__init__.py index 9008071..1c9f429 100644 --- a/src/rapidfuzz/__init__.py +++ b/src/rapidfuzz/__init__.py @@ -3,6 +3,6 @@ rapid string matching library """ __author__: str = "Max Bachmann" __license__: str = "MIT" -__version__: str = "2.9.0" +__version__: str = "2.10.0" from rapidfuzz import process, distance, fuzz, string_metric, utils diff --git a/src/rapidfuzz/distance/Hamming.py b/src/rapidfuzz/distance/Hamming.py index f395fd5..f0544c5 100644 --- a/src/rapidfuzz/distance/Hamming.py +++ b/src/rapidfuzz/distance/Hamming.py @@ -14,6 +14,8 @@ distance = _fallback_import(_mod, "distance") similarity = _fallback_import(_mod, "similarity") normalized_distance = _fallback_import(_mod, "normalized_distance") normalized_similarity = _fallback_import(_mod, "normalized_similarity") +editops = _fallback_import(_mod, "editops") +opcodes = _fallback_import(_mod, "opcodes") distance._RF_ScorerPy = _dist_attr similarity._RF_ScorerPy = _sim_attr diff --git a/src/rapidfuzz/distance/Hamming.pyi b/src/rapidfuzz/distance/Hamming.pyi index b90c571..0caa762 100644 --- a/src/rapidfuzz/distance/Hamming.pyi +++ b/src/rapidfuzz/distance/Hamming.pyi @@ -1,4 +1,5 @@ from typing import Callable, Hashable, Sequence, Optional, TypeVar +from rapidfuzz.distance import Editops, Opcodes _StringType = Sequence[Hashable] _S1 = TypeVar("_S1") @@ -32,3 +33,15 @@ def normalized_similarity( processor: Optional[Callable[..., _StringType]] = None, score_cutoff: Optional[float] = 0 ) -> float: ... +def editops( + s1: _S1, + s2: _S2, + *, + processor: Optional[Callable[..., _StringType]] = None +) -> Editops: ... +def opcodes( + s1: _S1, + s2: _S2, + *, + processor: Optional[Callable[..., _StringType]] = None +) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/Hamming_cpp.py b/src/rapidfuzz/distance/Hamming_cpp.py index 13efeda..18ba6c2 100644 --- a/src/rapidfuzz/distance/Hamming_cpp.py +++ b/src/rapidfuzz/distance/Hamming_cpp.py @@ -6,4 +6,6 @@ from rapidfuzz.distance.metrics_cpp import ( hamming_similarity as similarity, hamming_normalized_distance as normalized_distance, hamming_normalized_similarity as normalized_similarity, + hamming_editops as editops, + hamming_opcodes as opcodes ) diff --git a/src/rapidfuzz/distance/Hamming_py.py b/src/rapidfuzz/distance/Hamming_py.py index d601200..094d5a4 100644 --- a/src/rapidfuzz/distance/Hamming_py.py +++ b/src/rapidfuzz/distance/Hamming_py.py @@ -155,3 +155,46 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): norm_sim = 1 - norm_dist return norm_sim if (score_cutoff is None or norm_dist >= score_cutoff) else 0.0 + +def editops(s1, s2, *, processor=None): + """ + Return Editops describing how to turn s1 into s2. + + Parameters + ---------- + s1 : Sequence[Hashable] + First string to compare. + s2 : Sequence[Hashable] + Second string to compare. + processor: callable, optional + Optional callable that is used to preprocess the strings before + comparing them. Default is None, which deactivates this behaviour. + + Returns + ------- + editops : Editops + edit operations required to turn s1 into s2 + """ + raise NotImplementedError + + +def opcodes(s1, s2, *, processor=None): + """ + Return Opcodes describing how to turn s1 into s2. + + Parameters + ---------- + s1 : Sequence[Hashable] + First string to compare. + s2 : Sequence[Hashable] + Second string to compare. + processor: callable, optional + Optional callable that is used to preprocess the strings before + comparing them. Default is None, which deactivates this behaviour. + + Returns + ------- + opcodes : Opcodes + edit operations required to turn s1 into s2 + """ + raise NotImplementedError diff --git a/src/rapidfuzz/distance/Hamming_py.pyi b/src/rapidfuzz/distance/Hamming_py.pyi index b494af6..df6ac3f 100644 --- a/src/rapidfuzz/distance/Hamming_py.pyi +++ b/src/rapidfuzz/distance/Hamming_py.pyi @@ -1,5 +1,6 @@ from typing import Callable, Hashable, Sequence, Optional, TypeVar, Any, Dict from typing_extensions import Protocol +from rapidfuzz.distance import Editops, Opcodes class _ScorerAttributes(Protocol): _RF_ScorerPy: Dict @@ -43,3 +44,15 @@ def normalized_similarity( processor: Optional[Callable[..., _StringType]] = None, score_cutoff: Optional[float] = 0 ) -> float: ... +def editops( + s1: _S1, + s2: _S2, + *, + processor: Optional[Callable[..., _StringType]] = None +) -> Editops: ... +def opcodes( + s1: _S1, + s2: _S2, + *, + processor: Optional[Callable[..., _StringType]] = None +) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/metrics.hpp b/src/rapidfuzz/distance/metrics.hpp index b2e3809..1df1fa8 100644 --- a/src/rapidfuzz/distance/metrics.hpp +++ b/src/rapidfuzz/distance/metrics.hpp @@ -278,6 +278,13 @@ static inline bool LCSseqNormalizedSimilarityInit(RF_ScorerFunc* self, const RF_ return normalized_similarity_init(self, str_count, str); } +static inline rapidfuzz::Editops hamming_editops_func(const RF_String& str1, const RF_String& str2) +{ + return visitor(str1, str2, [&](auto s1, auto s2) { + return rapidfuzz::hamming_editops(s1, s2); + }); +} + static inline rapidfuzz::Editops levenshtein_editops_func(const RF_String& str1, const RF_String& str2, int64_t score_hint) { diff --git a/src/rapidfuzz/distance/metrics_cpp.pyi b/src/rapidfuzz/distance/metrics_cpp.pyi index ca0be4a..16696af 100644 --- a/src/rapidfuzz/distance/metrics_cpp.pyi +++ b/src/rapidfuzz/distance/metrics_cpp.pyi @@ -147,6 +147,18 @@ def hamming_normalized_similarity( processor: Optional[Callable[..., _StringType]] = None, score_cutoff: Optional[float] = 0 ) -> float: ... +def hamming_editops( + s1: _S1, + s2: _S2, + *, + processor: Optional[Callable[..., _StringType]] = None +) -> Editops: ... +def hamming_opcodes( + s1: _S1, + s2: _S2, + *, + processor: Optional[Callable[..., _StringType]] = None +) -> Opcodes: ... def damerau_levenshtein_distance( s1: _S1, s2: _S2, @@ -203,4 +215,3 @@ def osa_normalized_similarity( processor: Optional[Callable[..., _StringType]] = None, score_cutoff: Optional[float] = 0 ) -> float: ... - diff --git a/src/rapidfuzz/distance/metrics_cpp.pyx b/src/rapidfuzz/distance/metrics_cpp.pyx index eb9ba51..71cff2a 100644 --- a/src/rapidfuzz/distance/metrics_cpp.pyx +++ b/src/rapidfuzz/distance/metrics_cpp.pyx @@ -86,6 +86,8 @@ cdef extern from "metrics.hpp": bool HammingSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) nogil except False bool HammingNormalizedSimilarityInit(RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) nogil except False + RfEditops hamming_editops_func(const RF_String&, const RF_String&) nogil except + + # Damerau Levenshtein double osa_normalized_distance_func( const RF_String&, const RF_String&, double) nogil except + int64_t osa_distance_func( const RF_String&, const RF_String&, int64_t) nogil except + @@ -563,6 +565,24 @@ def hamming_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): return hamming_normalized_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) +def hamming_editops(s1, s2, *, processor=None): + cdef RF_StringWrapper s1_proc, s2_proc + cdef Editops ops = Editops.__new__(Editops) + + preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) + ops.editops = hamming_editops_func(s1_proc.string, s2_proc.string) + return ops + + +def hamming_opcodes(s1, s2, *, processor=None): + cdef RF_StringWrapper s1_proc, s2_proc + cdef Editops ops = Editops.__new__(Editops) + + preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) + ops.editops = hamming_editops_func(s1_proc.string, s2_proc.string) + return ops.as_opcodes() + + cdef bool GetScorerFlagsHammingDistance(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) nogil except False: dereference(scorer_flags).flags = RF_SCORER_FLAG_RESULT_I64 | RF_SCORER_FLAG_SYMMETRIC dereference(scorer_flags).optimal_score.i64 = 0 diff --git a/src/rapidfuzz/utils.hpp b/src/rapidfuzz/utils.hpp index a6c2d9a..f041ab9 100644 --- a/src/rapidfuzz/utils.hpp +++ b/src/rapidfuzz/utils.hpp @@ -10,6 +10,7 @@ #include #include #include +#include uint32_t UnicodeDefaultProcess(uint32_t ch);