From e4bfaec4d6c2461a680512100cf0266416fa8f90 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Wed, 16 Aug 2023 16:32:33 +0200 Subject: [PATCH] implement pure Python editops for Indel/LCSseq --- api_differences.md | 6 +- src/rapidfuzz/CMakeLists.txt | 16 ++--- src/rapidfuzz/_common_py.py | 28 +++++++++ src/rapidfuzz/distance/CMakeLists.txt | 1 - src/rapidfuzz/distance/Indel_py.py | 11 ++-- src/rapidfuzz/distance/LCSseq_py.py | 80 ++++++++++++++++++++++-- src/rapidfuzz/distance/_initialize_py.py | 12 ++-- src/rapidfuzz/fuzz_py.py | 5 +- src/rapidfuzz/process_cpp.hpp | 2 +- tests/common.py | 43 ++++++++++++- tests/distance/common.py | 4 ++ tests/distance/test_Indel.py | 34 ++++++++++ tests/distance/test_LCSseq.py | 34 ++++++++++ tests/test_fuzz.py | 1 + tests/test_hypothesis.py | 8 +-- 15 files changed, 247 insertions(+), 38 deletions(-) diff --git a/api_differences.md b/api_differences.md index b81c2d8..a0907a1 100644 --- a/api_differences.md +++ b/api_differences.md @@ -13,14 +13,14 @@ This leads to different results depending on the version in use. `RapidFuzz` alw fallback implementation and the C++ based implementation to provide consistent matching results. ## partial_ratio implementation -`fuzzywuzzy` uses searches fo the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either: +`fuzzywuzzy` searches for the optimal matching substring and then calculates the similarity using `ratio`. This substring is searched using either: 1) `difflib.SequenceMatcher.get_matching_blocks` (based on Ratcliff and Obershelp algorithm) 2) `Levenshtein.matching_blocks` (backtracks Levenshtein matrix) This implementation has a couple of issues: 1) in the pure Python implementation the automatic junk heuristic of difflib is not deactivated. This heuristic improves the performance for long strings, but can lead to completely incorrect results. -2) the accellerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses +2) the accelerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses one of multiple optimal alignment. There is no guarantee for this alignment to include the longest common substring. 3) the optimal substring is assumed to start at one of these `matching_blocks`. However this is not guaranteed. @@ -63,4 +63,4 @@ In `RapidFuzz` these functions are sometimes available under different names: - `extractOne` is available under the same name - `dedupe` is not available -In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument. \ No newline at end of file +In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument. 
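A minimal illustration of issue 1) above: with `difflib`'s default `autojunk` heuristic left enabled (as in fuzzywuzzy's pure Python path), characters that are frequent in a long second string are silently ignored, which can zero out the match entirely. The snippet only demonstrates the heuristic and is not part of this patch:

```python
from difflib import SequenceMatcher

a = "aaa"
b = "a" * 200  # len(b) >= 200, so 'a' is classified as "popular" junk by autojunk

print(SequenceMatcher(None, a, b).ratio())                  # 0.0   -- no match found at all
print(SequenceMatcher(None, a, b, autojunk=False).ratio())  # ~0.03 -- the real matching blocks
```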
diff --git a/src/rapidfuzz/CMakeLists.txt b/src/rapidfuzz/CMakeLists.txt index 6033b3c..864fb99 100644 --- a/src/rapidfuzz/CMakeLists.txt +++ b/src/rapidfuzz/CMakeLists.txt @@ -88,7 +88,6 @@ if(RAPIDFUZZ_ARCH_X86) install(TARGETS fuzz_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz) endif() - create_cython_target(process_cpp_impl) rf_add_library(process_cpp_impl ${process_cpp_impl}) target_compile_features(process_cpp_impl PUBLIC cxx_std_17) @@ -145,19 +144,20 @@ if(NOT Windows) HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB) if(NOT HAVE_CXX_ATOMICS_INT_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" - ) + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!") elseif(NOT HAVE_CXX_ATOMICS_SIZE_T_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" - ) + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!") elseif(NOT HAVE_CXX_ATOMICS_VOID_PTR_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" - ) + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!") elseif(NOT HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!" ) else() message(STATUS "Linking with libatomic for atomics support") diff --git a/src/rapidfuzz/_common_py.py b/src/rapidfuzz/_common_py.py index 64d39cd..164b571 100644 --- a/src/rapidfuzz/_common_py.py +++ b/src/rapidfuzz/_common_py.py @@ -43,3 +43,31 @@ def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> Sequence[H return s1, s2 return conv_sequence(s1), conv_sequence(s2) + + +def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: + prefix_len = 0 + for ch1, ch2 in zip(s1, s2): + if ch1 != ch2: + break + + prefix_len += 1 + + return prefix_len + + +def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: + suffix_len = 0 + for ch1, ch2 in zip(reversed(s1), reversed(s2)): + if ch1 != ch2: + break + + suffix_len += 1 + + return suffix_len + + +def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> (int, int): + prefix_len = common_prefix(s1, s2) + suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:]) + return (prefix_len, suffix_len) diff --git a/src/rapidfuzz/distance/CMakeLists.txt b/src/rapidfuzz/distance/CMakeLists.txt index de2bc02..7deacb4 100644 --- a/src/rapidfuzz/distance/CMakeLists.txt +++ b/src/rapidfuzz/distance/CMakeLists.txt @@ -87,4 +87,3 @@ if(RAPIDFUZZ_ARCH_X86) target_link_libraries(metrics_cpp_sse2 PRIVATE rapidfuzz::rapidfuzz) install(TARGETS metrics_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz/distance) endif() - diff --git a/src/rapidfuzz/distance/Indel_py.py b/src/rapidfuzz/distance/Indel_py.py index 6a1780f..4eb1b67 100644 --- a/src/rapidfuzz/distance/Indel_py.py +++ b/src/rapidfuzz/distance/Indel_py.py @@ -9,6 +9,8 @@ from rapidfuzz._common_py import conv_sequences from rapidfuzz._utils import is_none from rapidfuzz.distance._initialize_py import Editops, Opcodes from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity +from rapidfuzz.distance.LCSseq_py import editops as lcs_seq_editops +from rapidfuzz.distance.LCSseq_py import opcodes as lcs_seq_opcodes from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity @@ -300,12 +302,7 @@ def editops( insert s1[4] s2[2] insert s1[6] s2[5] """ - if processor is not None: - s1 = 
processor(s1) - s2 = processor(s2) - - s1, s2 = conv_sequences(s1, s2) - raise NotImplementedError + return lcs_seq_editops(s1, s2, processor=processor) def opcodes( @@ -358,4 +355,4 @@ def opcodes( equal a[4:6] (cd) b[3:5] (cd) insert a[6:6] () b[5:6] (f) """ - return editops(s1, s2, processor=processor).as_opcodes() + return lcs_seq_opcodes(s1, s2, processor=processor) diff --git a/src/rapidfuzz/distance/LCSseq_py.py b/src/rapidfuzz/distance/LCSseq_py.py index e592a14..92e61a2 100644 --- a/src/rapidfuzz/distance/LCSseq_py.py +++ b/src/rapidfuzz/distance/LCSseq_py.py @@ -5,9 +5,9 @@ from __future__ import annotations from typing import Callable, Hashable, Sequence -from rapidfuzz._common_py import conv_sequences +from rapidfuzz._common_py import common_affix, conv_sequences from rapidfuzz._utils import is_none -from rapidfuzz.distance._initialize_py import Editops, Opcodes +from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes def similarity( @@ -254,6 +254,30 @@ def normalized_similarity( return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0 +def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]): + if not s1: + return (0, []) + + S = (1 << len(s1)) - 1 + block: dict[Hashable, int] = {} + block_get = block.get + x = 1 + for ch1 in s1: + block[ch1] = block_get(ch1, 0) | x + x <<= 1 + + matrix = [] + for ch2 in s2: + Matches = block_get(ch2, 0) + u = S & Matches + S = (S + u) | (S - u) + matrix.append(S) + + # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0 + sim = bin(S)[-len(s1) :].count("0") + return (sim, matrix) + + def editops( s1: Sequence[Hashable], s2: Sequence[Hashable], @@ -298,8 +322,56 @@ def editops( insert s1[4] s2[2] insert s1[6] s2[5] """ - _ = s1, s2, processor - raise NotImplementedError + if processor is not None: + s1 = processor(s1) + s2 = processor(s2) + + s1, s2 = conv_sequences(s1, s2) + prefix_len, suffix_len = common_affix(s1, s2) + s1 = s1[prefix_len : len(s1) - suffix_len] + s2 = s2[prefix_len : len(s2) - suffix_len] + sim, matrix = _matrix(s1, s2) + + editops = Editops([], 0, 0) + editops._src_len = len(s1) + prefix_len + suffix_len + editops._dest_len = len(s2) + prefix_len + suffix_len + + dist = len(s1) + len(s2) - 2 * sim + if dist == 0: + return editops + + editop_list = [None] * dist + col = len(s1) + row = len(s2) + while row != 0 and col != 0: + # deletion + if matrix[row - 1] & (1 << (col - 1)): + dist -= 1 + col -= 1 + editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len) + else: + row -= 1 + + # insertion + if row and not (matrix[row - 1] & (1 << (col - 1))): + dist -= 1 + editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len) + # match + else: + col -= 1 + + while col != 0: + dist -= 1 + col -= 1 + editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len) + + while row != 0: + dist -= 1 + row -= 1 + editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len) + + editops._editops = editop_list + return editops def opcodes( diff --git a/src/rapidfuzz/distance/_initialize_py.py b/src/rapidfuzz/distance/_initialize_py.py index 99158f9..2a1631a 100644 --- a/src/rapidfuzz/distance/_initialize_py.py +++ b/src/rapidfuzz/distance/_initialize_py.py @@ -342,7 +342,7 @@ class Editops: This is the equivalent of ``[x for x in editops]`` """ - return self._editops + return [tuple(op) for op in self._editops] def copy(self) -> Editops: """ @@ -472,15 +472,15 @@ class Editops: for op in self._editops: # matches between last and 
current editop - while src_pos < op.dest_pos: + while src_pos < op.src_pos: res_str += source_string[src_pos] src_pos += 1 if op.tag == "replace": - res_str += destination_string[src_pos] + res_str += destination_string[op.dest_pos] src_pos += 1 elif op.tag == "insert": - res_str += destination_string[src_pos] + res_str += destination_string[op.dest_pos] elif op.tag == "delete": src_pos += 1 @@ -618,7 +618,7 @@ class Opcode: def __repr__(self) -> str: return ( - f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, " + f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, " f"dest_start={self.dest_start}, dest_end={self.dest_end})" ) @@ -711,7 +711,7 @@ class Opcodes: This is the equivalent of ``[x for x in opcodes]`` """ - return self._opcodes[::] + return [tuple(op) for op in self._opcodes] def copy(self) -> Opcodes: """ diff --git a/src/rapidfuzz/fuzz_py.py b/src/rapidfuzz/fuzz_py.py index 335137a..82796a4 100644 --- a/src/rapidfuzz/fuzz_py.py +++ b/src/rapidfuzz/fuzz_py.py @@ -3,7 +3,6 @@ from __future__ import annotations from math import ceil -import itertools from typing import Any, Callable, Hashable, Sequence from rapidfuzz._common_py import conv_sequences @@ -35,7 +34,7 @@ def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float: def _split_sequence(seq: Sequence[Hashable]) -> list[Sequence[Hashable]]: - if isinstance(seq, str) or isinstance(seq, bytes): + if isinstance(seq, (str, bytes)): return seq.split() splitted_seq = [[]] @@ -60,7 +59,7 @@ def _join_splitted_sequence(seq_list: list[Sequence[Hashable]]): joined = [] for seq in seq_list: joined += seq - joined += [ord(' ')] + joined += [ord(" ")] return joined[:-1] diff --git a/src/rapidfuzz/process_cpp.hpp b/src/rapidfuzz/process_cpp.hpp index 367facb..214f4da 100644 --- a/src/rapidfuzz/process_cpp.hpp +++ b/src/rapidfuzz/process_cpp.hpp @@ -1,8 +1,8 @@ #pragma once #include "cpp_common.hpp" #include "rapidfuzz.h" -#include "taskflow/taskflow.hpp" #include "taskflow/algorithm/for_each.hpp" +#include "taskflow/taskflow.hpp" #include #include #include diff --git a/tests/common.py b/tests/common.py index 0bcead8..7ef8b94 100644 --- a/tests/common.py +++ b/tests/common.py @@ -40,7 +40,10 @@ def call_and_maybe_catch(call, *args, **kwargs): def compare_exceptions(e1, e2): - return type(e1) is type(e2) and str(e1) == str(e2) + try: + return str(e1) == str(e2) + except Exception: + return False def scorer_tester(scorer, s1, s2, **kwargs): @@ -156,6 +159,8 @@ class Scorer: similarity: Any normalized_distance: Any normalized_similarity: Any + editops: Any + opcodes: Any class GenericScorer: @@ -185,6 +190,28 @@ class GenericScorer: self.get_scorer_flags = get_scorer_flags + def _editops(self, s1, s2, **kwargs): + results = [call_and_maybe_catch(scorer.editops, s1, s2, **kwargs) for scorer in self.scorers] + + for result in results: + assert compare_exceptions(result, results[0]) + + if any(isinstance(result, Exception) for result in results): + raise results[0] + + return results[0] + + def _opcodes(self, s1, s2, **kwargs): + results = [call_and_maybe_catch(scorer.opcodes, s1, s2, **kwargs) for scorer in self.scorers] + + for result in results: + assert compare_exceptions(result, results[0]) + + if any(isinstance(result, Exception) for result in results): + raise results[0] + + return results[0] + def _distance(self, s1, s2, **kwargs): symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"] tester = symmetric_scorer_tester if symmetric else scorer_tester @@ 
-303,3 +330,17 @@ class GenericScorer: if "score_cutoff" not in kwargs: return norm_sim return self._normalized_similarity(s1, s2, **kwargs) + + def editops(self, s1, s2, **kwargs): + editops_ = self._editops(s1, s2, **kwargs) + opcodes_ = self._opcodes(s1, s2, **kwargs) + assert opcodes_.as_editops() == editops_ + assert opcodes_ == editops_.as_opcodes() + return editops_ + + def opcodes(self, s1, s2, **kwargs): + editops_ = self._editops(s1, s2, **kwargs) + opcodes_ = self._opcodes(s1, s2, **kwargs) + assert opcodes_.as_editops() == editops_ + assert opcodes_ == editops_.as_opcodes() + return opcodes_ diff --git a/tests/distance/common.py b/tests/distance/common.py index 8918902..ca2c394 100644 --- a/tests/distance/common.py +++ b/tests/distance/common.py @@ -21,6 +21,8 @@ def create_generic_scorer(func_name, get_scorer_flags): similarity=getattr(metrics_py, func_name + "_similarity"), normalized_distance=getattr(metrics_py, func_name + "_normalized_distance"), normalized_similarity=getattr(metrics_py, func_name + "_normalized_similarity"), + editops=getattr(metrics_py, func_name + "_editops", None), + opcodes=getattr(metrics_py, func_name + "_opcodes", None), ) ] @@ -30,6 +32,8 @@ def create_generic_scorer(func_name, get_scorer_flags): similarity=getattr(mod, func_name + "_similarity"), normalized_distance=getattr(mod, func_name + "_normalized_distance"), normalized_similarity=getattr(mod, func_name + "_normalized_similarity"), + editops=getattr(metrics_cpp, func_name + "_editops", None), + opcodes=getattr(metrics_cpp, func_name + "_opcodes", None), ) for mod in cpp_scorer_modules ] diff --git a/tests/distance/test_Indel.py b/tests/distance/test_Indel.py index ca5567d..4d9ec81 100644 --- a/tests/distance/test_Indel.py +++ b/tests/distance/test_Indel.py @@ -19,3 +19,37 @@ def test_issue_196(): assert Indel.distance("South Korea", "North Korea", score_cutoff=2) == 3 assert Indel.distance("South Korea", "North Korea", score_cutoff=1) == 2 assert Indel.distance("South Korea", "North Korea", score_cutoff=0) == 1 + + +def test_Editops(): + """ + basic test for Indel.editops + """ + assert Indel.editops("0", "").as_list() == [("delete", 0, 0)] + assert Indel.editops("", "0").as_list() == [("insert", 0, 0)] + + assert Indel.editops("00", "0").as_list() == [("delete", 1, 1)] + assert Indel.editops("0", "00").as_list() == [("insert", 1, 1)] + + assert Indel.editops("qabxcd", "abycdf").as_list() == [ + ("delete", 0, 0), + ("insert", 3, 2), + ("delete", 3, 3), + ("insert", 6, 5), + ] + assert Indel.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [ + ("insert", 0, 0), + ("insert", 0, 1), + ("insert", 0, 2), + ("insert", 6, 9), + ("insert", 6, 10), + ("insert", 6, 11), + ("insert", 6, 12), + ("insert", 7, 14), + ("delete", 7, 15), + ("delete", 11, 18), + ] + + ops = Indel.editops("aaabaaa", "abbaaabba") + assert ops.src_len == 7 + assert ops.dest_len == 9 diff --git a/tests/distance/test_LCSseq.py b/tests/distance/test_LCSseq.py index 1b74570..420048f 100644 --- a/tests/distance/test_LCSseq.py +++ b/tests/distance/test_LCSseq.py @@ -7,3 +7,37 @@ def test_basic(): assert LCSseq.distance("", "") == 0 assert LCSseq.distance("test", "test") == 0 assert LCSseq.distance("aaaa", "bbbb") == 4 + + +def test_Editops(): + """ + basic test for LCSseq.editops + """ + assert LCSseq.editops("0", "").as_list() == [("delete", 0, 0)] + assert LCSseq.editops("", "0").as_list() == [("insert", 0, 0)] + + assert LCSseq.editops("00", "0").as_list() == [("delete", 1, 1)] + assert LCSseq.editops("0", 
"00").as_list() == [("insert", 1, 1)] + + assert LCSseq.editops("qabxcd", "abycdf").as_list() == [ + ("delete", 0, 0), + ("insert", 3, 2), + ("delete", 3, 3), + ("insert", 6, 5), + ] + assert LCSseq.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [ + ("insert", 0, 0), + ("insert", 0, 1), + ("insert", 0, 2), + ("insert", 6, 9), + ("insert", 6, 10), + ("insert", 6, 11), + ("insert", 6, 12), + ("insert", 7, 14), + ("delete", 7, 15), + ("delete", 11, 18), + ] + + ops = LCSseq.editops("aaabaaa", "abbaaabba") + assert ops.src_len == 7 + assert ops.dest_len == 9 diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py index 12f8d2f..8fd7a13 100644 --- a/tests/test_fuzz.py +++ b/tests/test_fuzz.py @@ -106,6 +106,7 @@ scorers = [ fuzz.QRatio, ] + def test_no_processor(): assert fuzz.ratio("new york mets", "new york mets") == 100 assert fuzz.ratio("new york mets", "new YORK mets") != 100 diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py index 9c35797..3ffd233 100644 --- a/tests/test_hypothesis.py +++ b/tests/test_hypothesis.py @@ -239,7 +239,7 @@ def test_indel_editops(s1, s2): """ test Indel.editops with any sizes """ - ops = metrics_cpp.indel_editops(s1, s2) + ops = Indel.editops(s1, s2) assert ops.apply(s1, s2) == s2 @@ -249,7 +249,7 @@ def test_indel_editops_block(s1, s2): """ test Indel.editops for long strings """ - ops = metrics_cpp.indel_editops(s1, s2) + ops = Indel.editops(s1, s2) assert ops.apply(s1, s2) == s2 @@ -279,7 +279,7 @@ def test_indel_opcodes(s1, s2): """ test Indel.opcodes with any sizes """ - ops = metrics_cpp.indel_opcodes(s1, s2) + ops = Indel.opcodes(s1, s2) assert ops.apply(s1, s2) == s2 @@ -289,7 +289,7 @@ def test_indel_opcodes_block(s1, s2): """ test Indel.opcodes for long strings """ - ops = metrics_cpp.indel_opcodes(s1, s2) + ops = Indel.opcodes(s1, s2) assert ops.apply(s1, s2) == s2