implement pure Python editops for Indel/LCSseq

This commit is contained in:
Max Bachmann 2023-08-16 16:32:33 +02:00
parent 6b24ce8a71
commit e4bfaec4d6
15 changed files with 247 additions and 38 deletions

View File

@ -13,14 +13,14 @@ This leads to different results depending on the version in use. `RapidFuzz` alw
fallback implementation and the C++ based implementation to provide consistent matching results.
## partial_ratio implementation
`fuzzywuzzy` uses searches fo the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
`fuzzywuzzy` searches for the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
1) `difflib.SequenceMatcher.get_matching_blocks` (based on Ratcliff and Obershelp algorithm)
2) `Levenshtein.matching_blocks` (backtracks Levenshtein matrix)
This implementation has a couple of issues:
1) in the pure Python implementation the automatic junk heuristic of difflib is not deactivated. This heuristic improves the performance for long strings,
but can lead to completely incorrect results.
2) the accellerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
2) the accelerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
one of multiple optimal alignment. There is no guarantee for this alignment to include the longest common substring.
3) the optimal substring is assumed to start at one of these `matching_blocks`. However this is not guaranteed.
@ -63,4 +63,4 @@ In `RapidFuzz` these functions are sometimes available under different names:
- `extractOne` is available under the same name
- `dedupe` is not available
In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.

View File

@ -88,7 +88,6 @@ if(RAPIDFUZZ_ARCH_X86)
install(TARGETS fuzz_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz)
endif()
create_cython_target(process_cpp_impl)
rf_add_library(process_cpp_impl ${process_cpp_impl})
target_compile_features(process_cpp_impl PUBLIC cxx_std_17)
@ -145,19 +144,20 @@ if(NOT Windows)
HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
if(NOT HAVE_CXX_ATOMICS_INT_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<int>, or libatomic not found!"
)
FATAL_ERROR
"No native support for std::atomic<int>, or libatomic not found!")
elseif(NOT HAVE_CXX_ATOMICS_SIZE_T_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<size_t>, or libatomic not found!"
)
FATAL_ERROR
"No native support for std::atomic<size_t>, or libatomic not found!")
elseif(NOT HAVE_CXX_ATOMICS_VOID_PTR_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<void*>, or libatomic not found!"
)
FATAL_ERROR
"No native support for std::atomic<void*>, or libatomic not found!")
elseif(NOT HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<unsigned>, or libatomic not found!"
FATAL_ERROR
"No native support for std::atomic<unsigned>, or libatomic not found!"
)
else()
message(STATUS "Linking with libatomic for atomics support")

View File

@ -43,3 +43,31 @@ def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> Sequence[H
return s1, s2
return conv_sequence(s1), conv_sequence(s2)
def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
prefix_len = 0
for ch1, ch2 in zip(s1, s2):
if ch1 != ch2:
break
prefix_len += 1
return prefix_len
def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
suffix_len = 0
for ch1, ch2 in zip(reversed(s1), reversed(s2)):
if ch1 != ch2:
break
suffix_len += 1
return suffix_len
def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> (int, int):
prefix_len = common_prefix(s1, s2)
suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:])
return (prefix_len, suffix_len)

View File

@ -87,4 +87,3 @@ if(RAPIDFUZZ_ARCH_X86)
target_link_libraries(metrics_cpp_sse2 PRIVATE rapidfuzz::rapidfuzz)
install(TARGETS metrics_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz/distance)
endif()

View File

@ -9,6 +9,8 @@ from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none
from rapidfuzz.distance._initialize_py import Editops, Opcodes
from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity
from rapidfuzz.distance.LCSseq_py import editops as lcs_seq_editops
from rapidfuzz.distance.LCSseq_py import opcodes as lcs_seq_opcodes
from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity
@ -300,12 +302,7 @@ def editops(
insert s1[4] s2[2]
insert s1[6] s2[5]
"""
if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
s1, s2 = conv_sequences(s1, s2)
raise NotImplementedError
return lcs_seq_editops(s1, s2, processor=processor)
def opcodes(
@ -358,4 +355,4 @@ def opcodes(
equal a[4:6] (cd) b[3:5] (cd)
insert a[6:6] () b[5:6] (f)
"""
return editops(s1, s2, processor=processor).as_opcodes()
return lcs_seq_opcodes(s1, s2, processor=processor)

View File

@ -5,9 +5,9 @@ from __future__ import annotations
from typing import Callable, Hashable, Sequence
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none
from rapidfuzz.distance._initialize_py import Editops, Opcodes
from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes
def similarity(
@ -254,6 +254,30 @@ def normalized_similarity(
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0
def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]):
if not s1:
return (0, [])
S = (1 << len(s1)) - 1
block: dict[Hashable, int] = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1
matrix = []
for ch2 in s2:
Matches = block_get(ch2, 0)
u = S & Matches
S = (S + u) | (S - u)
matrix.append(S)
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
sim = bin(S)[-len(s1) :].count("0")
return (sim, matrix)
def editops(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
@ -298,8 +322,56 @@ def editops(
insert s1[4] s2[2]
insert s1[6] s2[5]
"""
_ = s1, s2, processor
raise NotImplementedError
if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
s1, s2 = conv_sequences(s1, s2)
prefix_len, suffix_len = common_affix(s1, s2)
s1 = s1[prefix_len : len(s1) - suffix_len]
s2 = s2[prefix_len : len(s2) - suffix_len]
sim, matrix = _matrix(s1, s2)
editops = Editops([], 0, 0)
editops._src_len = len(s1) + prefix_len + suffix_len
editops._dest_len = len(s2) + prefix_len + suffix_len
dist = len(s1) + len(s2) - 2 * sim
if dist == 0:
return editops
editop_list = [None] * dist
col = len(s1)
row = len(s2)
while row != 0 and col != 0:
# deletion
if matrix[row - 1] & (1 << (col - 1)):
dist -= 1
col -= 1
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
else:
row -= 1
# insertion
if row and not (matrix[row - 1] & (1 << (col - 1))):
dist -= 1
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
# match
else:
col -= 1
while col != 0:
dist -= 1
col -= 1
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
while row != 0:
dist -= 1
row -= 1
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
editops._editops = editop_list
return editops
def opcodes(

View File

@ -342,7 +342,7 @@ class Editops:
This is the equivalent of ``[x for x in editops]``
"""
return self._editops
return [tuple(op) for op in self._editops]
def copy(self) -> Editops:
"""
@ -472,15 +472,15 @@ class Editops:
for op in self._editops:
# matches between last and current editop
while src_pos < op.dest_pos:
while src_pos < op.src_pos:
res_str += source_string[src_pos]
src_pos += 1
if op.tag == "replace":
res_str += destination_string[src_pos]
res_str += destination_string[op.dest_pos]
src_pos += 1
elif op.tag == "insert":
res_str += destination_string[src_pos]
res_str += destination_string[op.dest_pos]
elif op.tag == "delete":
src_pos += 1
@ -618,7 +618,7 @@ class Opcode:
def __repr__(self) -> str:
return (
f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, "
f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
f"dest_start={self.dest_start}, dest_end={self.dest_end})"
)
@ -711,7 +711,7 @@ class Opcodes:
This is the equivalent of ``[x for x in opcodes]``
"""
return self._opcodes[::]
return [tuple(op) for op in self._opcodes]
def copy(self) -> Opcodes:
"""

View File

@ -3,7 +3,6 @@
from __future__ import annotations
from math import ceil
import itertools
from typing import Any, Callable, Hashable, Sequence
from rapidfuzz._common_py import conv_sequences
@ -35,7 +34,7 @@ def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float:
def _split_sequence(seq: Sequence[Hashable]) -> list[Sequence[Hashable]]:
if isinstance(seq, str) or isinstance(seq, bytes):
if isinstance(seq, (str, bytes)):
return seq.split()
splitted_seq = [[]]
@ -60,7 +59,7 @@ def _join_splitted_sequence(seq_list: list[Sequence[Hashable]]):
joined = []
for seq in seq_list:
joined += seq
joined += [ord(' ')]
joined += [ord(" ")]
return joined[:-1]

View File

@ -1,8 +1,8 @@
#pragma once
#include "cpp_common.hpp"
#include "rapidfuzz.h"
#include "taskflow/taskflow.hpp"
#include "taskflow/algorithm/for_each.hpp"
#include "taskflow/taskflow.hpp"
#include <atomic>
#include <chrono>
#include <exception>

View File

@ -40,7 +40,10 @@ def call_and_maybe_catch(call, *args, **kwargs):
def compare_exceptions(e1, e2):
return type(e1) is type(e2) and str(e1) == str(e2)
try:
return str(e1) == str(e2)
except Exception:
return False
def scorer_tester(scorer, s1, s2, **kwargs):
@ -156,6 +159,8 @@ class Scorer:
similarity: Any
normalized_distance: Any
normalized_similarity: Any
editops: Any
opcodes: Any
class GenericScorer:
@ -185,6 +190,28 @@ class GenericScorer:
self.get_scorer_flags = get_scorer_flags
def _editops(self, s1, s2, **kwargs):
results = [call_and_maybe_catch(scorer.editops, s1, s2, **kwargs) for scorer in self.scorers]
for result in results:
assert compare_exceptions(result, results[0])
if any(isinstance(result, Exception) for result in results):
raise results[0]
return results[0]
def _opcodes(self, s1, s2, **kwargs):
results = [call_and_maybe_catch(scorer.opcodes, s1, s2, **kwargs) for scorer in self.scorers]
for result in results:
assert compare_exceptions(result, results[0])
if any(isinstance(result, Exception) for result in results):
raise results[0]
return results[0]
def _distance(self, s1, s2, **kwargs):
symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"]
tester = symmetric_scorer_tester if symmetric else scorer_tester
@ -303,3 +330,17 @@ class GenericScorer:
if "score_cutoff" not in kwargs:
return norm_sim
return self._normalized_similarity(s1, s2, **kwargs)
def editops(self, s1, s2, **kwargs):
editops_ = self._editops(s1, s2, **kwargs)
opcodes_ = self._opcodes(s1, s2, **kwargs)
assert opcodes_.as_editops() == editops_
assert opcodes_ == editops_.as_opcodes()
return editops_
def opcodes(self, s1, s2, **kwargs):
editops_ = self._editops(s1, s2, **kwargs)
opcodes_ = self._opcodes(s1, s2, **kwargs)
assert opcodes_.as_editops() == editops_
assert opcodes_ == editops_.as_opcodes()
return opcodes_

View File

@ -21,6 +21,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
similarity=getattr(metrics_py, func_name + "_similarity"),
normalized_distance=getattr(metrics_py, func_name + "_normalized_distance"),
normalized_similarity=getattr(metrics_py, func_name + "_normalized_similarity"),
editops=getattr(metrics_py, func_name + "_editops", None),
opcodes=getattr(metrics_py, func_name + "_opcodes", None),
)
]
@ -30,6 +32,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
similarity=getattr(mod, func_name + "_similarity"),
normalized_distance=getattr(mod, func_name + "_normalized_distance"),
normalized_similarity=getattr(mod, func_name + "_normalized_similarity"),
editops=getattr(metrics_cpp, func_name + "_editops", None),
opcodes=getattr(metrics_cpp, func_name + "_opcodes", None),
)
for mod in cpp_scorer_modules
]

View File

@ -19,3 +19,37 @@ def test_issue_196():
assert Indel.distance("South Korea", "North Korea", score_cutoff=2) == 3
assert Indel.distance("South Korea", "North Korea", score_cutoff=1) == 2
assert Indel.distance("South Korea", "North Korea", score_cutoff=0) == 1
def test_Editops():
"""
basic test for Indel.editops
"""
assert Indel.editops("0", "").as_list() == [("delete", 0, 0)]
assert Indel.editops("", "0").as_list() == [("insert", 0, 0)]
assert Indel.editops("00", "0").as_list() == [("delete", 1, 1)]
assert Indel.editops("0", "00").as_list() == [("insert", 1, 1)]
assert Indel.editops("qabxcd", "abycdf").as_list() == [
("delete", 0, 0),
("insert", 3, 2),
("delete", 3, 3),
("insert", 6, 5),
]
assert Indel.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
("insert", 0, 0),
("insert", 0, 1),
("insert", 0, 2),
("insert", 6, 9),
("insert", 6, 10),
("insert", 6, 11),
("insert", 6, 12),
("insert", 7, 14),
("delete", 7, 15),
("delete", 11, 18),
]
ops = Indel.editops("aaabaaa", "abbaaabba")
assert ops.src_len == 7
assert ops.dest_len == 9

View File

@ -7,3 +7,37 @@ def test_basic():
assert LCSseq.distance("", "") == 0
assert LCSseq.distance("test", "test") == 0
assert LCSseq.distance("aaaa", "bbbb") == 4
def test_Editops():
"""
basic test for LCSseq.editops
"""
assert LCSseq.editops("0", "").as_list() == [("delete", 0, 0)]
assert LCSseq.editops("", "0").as_list() == [("insert", 0, 0)]
assert LCSseq.editops("00", "0").as_list() == [("delete", 1, 1)]
assert LCSseq.editops("0", "00").as_list() == [("insert", 1, 1)]
assert LCSseq.editops("qabxcd", "abycdf").as_list() == [
("delete", 0, 0),
("insert", 3, 2),
("delete", 3, 3),
("insert", 6, 5),
]
assert LCSseq.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
("insert", 0, 0),
("insert", 0, 1),
("insert", 0, 2),
("insert", 6, 9),
("insert", 6, 10),
("insert", 6, 11),
("insert", 6, 12),
("insert", 7, 14),
("delete", 7, 15),
("delete", 11, 18),
]
ops = LCSseq.editops("aaabaaa", "abbaaabba")
assert ops.src_len == 7
assert ops.dest_len == 9

View File

@ -106,6 +106,7 @@ scorers = [
fuzz.QRatio,
]
def test_no_processor():
assert fuzz.ratio("new york mets", "new york mets") == 100
assert fuzz.ratio("new york mets", "new YORK mets") != 100

View File

@ -239,7 +239,7 @@ def test_indel_editops(s1, s2):
"""
test Indel.editops with any sizes
"""
ops = metrics_cpp.indel_editops(s1, s2)
ops = Indel.editops(s1, s2)
assert ops.apply(s1, s2) == s2
@ -249,7 +249,7 @@ def test_indel_editops_block(s1, s2):
"""
test Indel.editops for long strings
"""
ops = metrics_cpp.indel_editops(s1, s2)
ops = Indel.editops(s1, s2)
assert ops.apply(s1, s2) == s2
@ -279,7 +279,7 @@ def test_indel_opcodes(s1, s2):
"""
test Indel.opcodes with any sizes
"""
ops = metrics_cpp.indel_opcodes(s1, s2)
ops = Indel.opcodes(s1, s2)
assert ops.apply(s1, s2) == s2
@ -289,7 +289,7 @@ def test_indel_opcodes_block(s1, s2):
"""
test Indel.opcodes for long strings
"""
ops = metrics_cpp.indel_opcodes(s1, s2)
ops = Indel.opcodes(s1, s2)
assert ops.apply(s1, s2) == s2