implement pure Python editops for Indel/LCSseq

2023-08-16 16:32:33 +02:00 · 2023-08-16 16:32:33 +02:00 · e4bfaec4d6
parent 6b24ce8a71
commit e4bfaec4d6
15 changed files with 247 additions and 38 deletions
--- a/api_differences.md
+++ b/api_differences.md
@ -13,14 +13,14 @@ This leads to different results depending on the version in use. `RapidFuzz` alw
 fallback implementation and the C++ based implementation to provide consistent matching results.

 ## partial_ratio implementation
-`fuzzywuzzy` uses searches fo the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
+`fuzzywuzzy` searches for the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
 1) `difflib.SequenceMatcher.get_matching_blocks` (based on Ratcliff and Obershelp algorithm)
 2) `Levenshtein.matching_blocks` (backtracks Levenshtein matrix)

 This implementation has a couple of issues:
 1) in the pure Python implementation the automatic junk heuristic of difflib is not deactivated. This heuristic improves the performance for long strings,
 but can lead to completely incorrect results.
-2) the accellerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
+2) the accelerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
 one of multiple optimal alignment. There is no guarantee for this alignment to include the longest common substring.
 3) the optimal substring is assumed to start at one of these `matching_blocks`. However this is not guaranteed.

@ -63,4 +63,4 @@ In `RapidFuzz` these functions are sometimes available under different names:
 - `extractOne` is available under the same name
 - `dedupe` is not available

-In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
+In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
--- a/src/rapidfuzz/CMakeLists.txt
+++ b/src/rapidfuzz/CMakeLists.txt
@ -88,7 +88,6 @@ if(RAPIDFUZZ_ARCH_X86)
  install(TARGETS fuzz_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz)
 endif()

-
 create_cython_target(process_cpp_impl)
 rf_add_library(process_cpp_impl ${process_cpp_impl})
 target_compile_features(process_cpp_impl PUBLIC cxx_std_17)
@ -145,19 +144,20 @@ if(NOT Windows)
                              HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
    if(NOT HAVE_CXX_ATOMICS_INT_WITH_LIB)
      message(
-        FATAL_ERROR "No native support for std::atomic<int>, or libatomic not found!"
-      )
+        FATAL_ERROR
+          "No native support for std::atomic<int>, or libatomic not found!")
    elseif(NOT HAVE_CXX_ATOMICS_SIZE_T_WITH_LIB)
      message(
-        FATAL_ERROR "No native support for std::atomic<size_t>, or libatomic not found!"
-      )
+        FATAL_ERROR
+          "No native support for std::atomic<size_t>, or libatomic not found!")
    elseif(NOT HAVE_CXX_ATOMICS_VOID_PTR_WITH_LIB)
      message(
-        FATAL_ERROR "No native support for std::atomic<void*>, or libatomic not found!"
-      )
+        FATAL_ERROR
+          "No native support for std::atomic<void*>, or libatomic not found!")
    elseif(NOT HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
      message(
-        FATAL_ERROR "No native support for std::atomic<unsigned>, or libatomic not found!"
+        FATAL_ERROR
+          "No native support for std::atomic<unsigned>, or libatomic not found!"
      )
    else()
      message(STATUS "Linking with libatomic for atomics support")
--- a/src/rapidfuzz/_common_py.py
+++ b/src/rapidfuzz/_common_py.py
@ -43,3 +43,31 @@ def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> Sequence[H
        return s1, s2

    return conv_sequence(s1), conv_sequence(s2)
+
+
+def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
+    prefix_len = 0
+    for ch1, ch2 in zip(s1, s2):
+        if ch1 != ch2:
+            break
+
+        prefix_len += 1
+
+    return prefix_len
+
+
+def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
+    suffix_len = 0
+    for ch1, ch2 in zip(reversed(s1), reversed(s2)):
+        if ch1 != ch2:
+            break
+
+        suffix_len += 1
+
+    return suffix_len
+
+
+def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> (int, int):
+    prefix_len = common_prefix(s1, s2)
+    suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:])
+    return (prefix_len, suffix_len)
--- a/src/rapidfuzz/distance/CMakeLists.txt
+++ b/src/rapidfuzz/distance/CMakeLists.txt
@ -87,4 +87,3 @@ if(RAPIDFUZZ_ARCH_X86)
  target_link_libraries(metrics_cpp_sse2 PRIVATE rapidfuzz::rapidfuzz)
  install(TARGETS metrics_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz/distance)
 endif()
-
--- a/src/rapidfuzz/distance/Indel_py.py
+++ b/src/rapidfuzz/distance/Indel_py.py
@ -9,6 +9,8 @@ from rapidfuzz._common_py import conv_sequences
 from rapidfuzz._utils import is_none
 from rapidfuzz.distance._initialize_py import Editops, Opcodes
 from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity
+from rapidfuzz.distance.LCSseq_py import editops as lcs_seq_editops
+from rapidfuzz.distance.LCSseq_py import opcodes as lcs_seq_opcodes
 from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity


@ -300,12 +302,7 @@ def editops(
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
-    if processor is not None:
-        s1 = processor(s1)
-        s2 = processor(s2)
-
-    s1, s2 = conv_sequences(s1, s2)
-    raise NotImplementedError
+    return lcs_seq_editops(s1, s2, processor=processor)


 def opcodes(
@ -358,4 +355,4 @@ def opcodes(
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
-    return editops(s1, s2, processor=processor).as_opcodes()
+    return lcs_seq_opcodes(s1, s2, processor=processor)
--- a/src/rapidfuzz/distance/LCSseq_py.py
+++ b/src/rapidfuzz/distance/LCSseq_py.py
@ -5,9 +5,9 @@ from __future__ import annotations

 from typing import Callable, Hashable, Sequence

-from rapidfuzz._common_py import conv_sequences
+from rapidfuzz._common_py import common_affix, conv_sequences
 from rapidfuzz._utils import is_none
-from rapidfuzz.distance._initialize_py import Editops, Opcodes
+from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes


 def similarity(
@ -254,6 +254,30 @@ def normalized_similarity(
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0


+def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]):
+    if not s1:
+        return (0, [])
+
+    S = (1 << len(s1)) - 1
+    block: dict[Hashable, int] = {}
+    block_get = block.get
+    x = 1
+    for ch1 in s1:
+        block[ch1] = block_get(ch1, 0) | x
+        x <<= 1
+
+    matrix = []
+    for ch2 in s2:
+        Matches = block_get(ch2, 0)
+        u = S & Matches
+        S = (S + u) | (S - u)
+        matrix.append(S)
+
+    # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
+    sim = bin(S)[-len(s1) :].count("0")
+    return (sim, matrix)
+
+
 def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
@ -298,8 +322,56 @@ def editops(
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
-    _ = s1, s2, processor
-    raise NotImplementedError
+    if processor is not None:
+        s1 = processor(s1)
+        s2 = processor(s2)
+
+    s1, s2 = conv_sequences(s1, s2)
+    prefix_len, suffix_len = common_affix(s1, s2)
+    s1 = s1[prefix_len : len(s1) - suffix_len]
+    s2 = s2[prefix_len : len(s2) - suffix_len]
+    sim, matrix = _matrix(s1, s2)
+
+    editops = Editops([], 0, 0)
+    editops._src_len = len(s1) + prefix_len + suffix_len
+    editops._dest_len = len(s2) + prefix_len + suffix_len
+
+    dist = len(s1) + len(s2) - 2 * sim
+    if dist == 0:
+        return editops
+
+    editop_list = [None] * dist
+    col = len(s1)
+    row = len(s2)
+    while row != 0 and col != 0:
+        # deletion
+        if matrix[row - 1] & (1 << (col - 1)):
+            dist -= 1
+            col -= 1
+            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
+        else:
+            row -= 1
+
+            # insertion
+            if row and not (matrix[row - 1] & (1 << (col - 1))):
+                dist -= 1
+                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
+            # match
+            else:
+                col -= 1
+
+    while col != 0:
+        dist -= 1
+        col -= 1
+        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
+
+    while row != 0:
+        dist -= 1
+        row -= 1
+        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
+
+    editops._editops = editop_list
+    return editops


 def opcodes(
--- a/src/rapidfuzz/distance/_initialize_py.py
+++ b/src/rapidfuzz/distance/_initialize_py.py
@ -342,7 +342,7 @@ class Editops:

        This is the equivalent of ``[x for x in editops]``
        """
-        return self._editops
+        return [tuple(op) for op in self._editops]

    def copy(self) -> Editops:
        """
@ -472,15 +472,15 @@ class Editops:

        for op in self._editops:
            # matches between last and current editop
-            while src_pos < op.dest_pos:
+            while src_pos < op.src_pos:
                res_str += source_string[src_pos]
                src_pos += 1

            if op.tag == "replace":
-                res_str += destination_string[src_pos]
+                res_str += destination_string[op.dest_pos]
                src_pos += 1
            elif op.tag == "insert":
-                res_str += destination_string[src_pos]
+                res_str += destination_string[op.dest_pos]
            elif op.tag == "delete":
                src_pos += 1

@ -618,7 +618,7 @@ class Opcode:

    def __repr__(self) -> str:
        return (
-            f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, "
+            f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
            f"dest_start={self.dest_start}, dest_end={self.dest_end})"
        )

@ -711,7 +711,7 @@ class Opcodes:

        This is the equivalent of ``[x for x in opcodes]``
        """
-        return self._opcodes[::]
+        return [tuple(op) for op in self._opcodes]

    def copy(self) -> Opcodes:
        """
--- a/src/rapidfuzz/fuzz_py.py
+++ b/src/rapidfuzz/fuzz_py.py
@ -3,7 +3,6 @@
 from __future__ import annotations

 from math import ceil
-import itertools
 from typing import Any, Callable, Hashable, Sequence

 from rapidfuzz._common_py import conv_sequences
@ -35,7 +34,7 @@ def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float:


 def _split_sequence(seq: Sequence[Hashable]) -> list[Sequence[Hashable]]:
-    if isinstance(seq, str) or isinstance(seq, bytes):
+    if isinstance(seq, (str, bytes)):
        return seq.split()

    splitted_seq = [[]]
@ -60,7 +59,7 @@ def _join_splitted_sequence(seq_list: list[Sequence[Hashable]]):
    joined = []
    for seq in seq_list:
        joined += seq
-        joined += [ord(' ')]
+        joined += [ord(" ")]
    return joined[:-1]


--- a/src/rapidfuzz/process_cpp.hpp
+++ b/src/rapidfuzz/process_cpp.hpp
@ -1,8 +1,8 @@
 #pragma once
 #include "cpp_common.hpp"
 #include "rapidfuzz.h"
-#include "taskflow/taskflow.hpp"
 #include "taskflow/algorithm/for_each.hpp"
+#include "taskflow/taskflow.hpp"
 #include <atomic>
 #include <chrono>
 #include <exception>
--- a/tests/common.py
+++ b/tests/common.py
@ -40,7 +40,10 @@ def call_and_maybe_catch(call, *args, **kwargs):


 def compare_exceptions(e1, e2):
-    return type(e1) is type(e2) and str(e1) == str(e2)
+    try:
+        return str(e1) == str(e2)
+    except Exception:
+        return False


 def scorer_tester(scorer, s1, s2, **kwargs):
@ -156,6 +159,8 @@ class Scorer:
    similarity: Any
    normalized_distance: Any
    normalized_similarity: Any
+    editops: Any
+    opcodes: Any


 class GenericScorer:
@ -185,6 +190,28 @@ class GenericScorer:

        self.get_scorer_flags = get_scorer_flags

+    def _editops(self, s1, s2, **kwargs):
+        results = [call_and_maybe_catch(scorer.editops, s1, s2, **kwargs) for scorer in self.scorers]
+
+        for result in results:
+            assert compare_exceptions(result, results[0])
+
+        if any(isinstance(result, Exception) for result in results):
+            raise results[0]
+
+        return results[0]
+
+    def _opcodes(self, s1, s2, **kwargs):
+        results = [call_and_maybe_catch(scorer.opcodes, s1, s2, **kwargs) for scorer in self.scorers]
+
+        for result in results:
+            assert compare_exceptions(result, results[0])
+
+        if any(isinstance(result, Exception) for result in results):
+            raise results[0]
+
+        return results[0]
+
    def _distance(self, s1, s2, **kwargs):
        symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"]
        tester = symmetric_scorer_tester if symmetric else scorer_tester
@ -303,3 +330,17 @@ class GenericScorer:
            if "score_cutoff" not in kwargs:
                return norm_sim
        return self._normalized_similarity(s1, s2, **kwargs)
+
+    def editops(self, s1, s2, **kwargs):
+        editops_ = self._editops(s1, s2, **kwargs)
+        opcodes_ = self._opcodes(s1, s2, **kwargs)
+        assert opcodes_.as_editops() == editops_
+        assert opcodes_ == editops_.as_opcodes()
+        return editops_
+
+    def opcodes(self, s1, s2, **kwargs):
+        editops_ = self._editops(s1, s2, **kwargs)
+        opcodes_ = self._opcodes(s1, s2, **kwargs)
+        assert opcodes_.as_editops() == editops_
+        assert opcodes_ == editops_.as_opcodes()
+        return opcodes_
--- a/tests/distance/common.py
+++ b/tests/distance/common.py
@ -21,6 +21,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
            similarity=getattr(metrics_py, func_name + "_similarity"),
            normalized_distance=getattr(metrics_py, func_name + "_normalized_distance"),
            normalized_similarity=getattr(metrics_py, func_name + "_normalized_similarity"),
+            editops=getattr(metrics_py, func_name + "_editops", None),
+            opcodes=getattr(metrics_py, func_name + "_opcodes", None),
        )
    ]

@ -30,6 +32,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
            similarity=getattr(mod, func_name + "_similarity"),
            normalized_distance=getattr(mod, func_name + "_normalized_distance"),
            normalized_similarity=getattr(mod, func_name + "_normalized_similarity"),
+            editops=getattr(metrics_cpp, func_name + "_editops", None),
+            opcodes=getattr(metrics_cpp, func_name + "_opcodes", None),
        )
        for mod in cpp_scorer_modules
    ]
--- a/tests/distance/test_Indel.py
+++ b/tests/distance/test_Indel.py
@ -19,3 +19,37 @@ def test_issue_196():
    assert Indel.distance("South Korea", "North Korea", score_cutoff=2) == 3
    assert Indel.distance("South Korea", "North Korea", score_cutoff=1) == 2
    assert Indel.distance("South Korea", "North Korea", score_cutoff=0) == 1
+
+
+def test_Editops():
+    """
+    basic test for Indel.editops
+    """
+    assert Indel.editops("0", "").as_list() == [("delete", 0, 0)]
+    assert Indel.editops("", "0").as_list() == [("insert", 0, 0)]
+
+    assert Indel.editops("00", "0").as_list() == [("delete", 1, 1)]
+    assert Indel.editops("0", "00").as_list() == [("insert", 1, 1)]
+
+    assert Indel.editops("qabxcd", "abycdf").as_list() == [
+        ("delete", 0, 0),
+        ("insert", 3, 2),
+        ("delete", 3, 3),
+        ("insert", 6, 5),
+    ]
+    assert Indel.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
+        ("insert", 0, 0),
+        ("insert", 0, 1),
+        ("insert", 0, 2),
+        ("insert", 6, 9),
+        ("insert", 6, 10),
+        ("insert", 6, 11),
+        ("insert", 6, 12),
+        ("insert", 7, 14),
+        ("delete", 7, 15),
+        ("delete", 11, 18),
+    ]
+
+    ops = Indel.editops("aaabaaa", "abbaaabba")
+    assert ops.src_len == 7
+    assert ops.dest_len == 9
--- a/tests/distance/test_LCSseq.py
+++ b/tests/distance/test_LCSseq.py
@ -7,3 +7,37 @@ def test_basic():
    assert LCSseq.distance("", "") == 0
    assert LCSseq.distance("test", "test") == 0
    assert LCSseq.distance("aaaa", "bbbb") == 4
+
+
+def test_Editops():
+    """
+    basic test for LCSseq.editops
+    """
+    assert LCSseq.editops("0", "").as_list() == [("delete", 0, 0)]
+    assert LCSseq.editops("", "0").as_list() == [("insert", 0, 0)]
+
+    assert LCSseq.editops("00", "0").as_list() == [("delete", 1, 1)]
+    assert LCSseq.editops("0", "00").as_list() == [("insert", 1, 1)]
+
+    assert LCSseq.editops("qabxcd", "abycdf").as_list() == [
+        ("delete", 0, 0),
+        ("insert", 3, 2),
+        ("delete", 3, 3),
+        ("insert", 6, 5),
+    ]
+    assert LCSseq.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
+        ("insert", 0, 0),
+        ("insert", 0, 1),
+        ("insert", 0, 2),
+        ("insert", 6, 9),
+        ("insert", 6, 10),
+        ("insert", 6, 11),
+        ("insert", 6, 12),
+        ("insert", 7, 14),
+        ("delete", 7, 15),
+        ("delete", 11, 18),
+    ]
+
+    ops = LCSseq.editops("aaabaaa", "abbaaabba")
+    assert ops.src_len == 7
+    assert ops.dest_len == 9
--- a/tests/test_fuzz.py
+++ b/tests/test_fuzz.py
@ -106,6 +106,7 @@ scorers = [
    fuzz.QRatio,
 ]

+
 def test_no_processor():
    assert fuzz.ratio("new york mets", "new york mets") == 100
    assert fuzz.ratio("new york mets", "new YORK mets") != 100
--- a/tests/test_hypothesis.py
+++ b/tests/test_hypothesis.py
@ -239,7 +239,7 @@ def test_indel_editops(s1, s2):
    """
    test Indel.editops with any sizes
    """
-    ops = metrics_cpp.indel_editops(s1, s2)
+    ops = Indel.editops(s1, s2)
    assert ops.apply(s1, s2) == s2


@ -249,7 +249,7 @@ def test_indel_editops_block(s1, s2):
    """
    test Indel.editops for long strings
    """
-    ops = metrics_cpp.indel_editops(s1, s2)
+    ops = Indel.editops(s1, s2)
    assert ops.apply(s1, s2) == s2


@ -279,7 +279,7 @@ def test_indel_opcodes(s1, s2):
    """
    test Indel.opcodes with any sizes
    """
-    ops = metrics_cpp.indel_opcodes(s1, s2)
+    ops = Indel.opcodes(s1, s2)
    assert ops.apply(s1, s2) == s2


@ -289,7 +289,7 @@ def test_indel_opcodes_block(s1, s2):
    """
    test Indel.opcodes for long strings
    """
-    ops = metrics_cpp.indel_opcodes(s1, s2)
+    ops = Indel.opcodes(s1, s2)
    assert ops.apply(s1, s2) == s2