implement pure Python editops for Indel/LCSseq
This commit is contained in:
parent
6b24ce8a71
commit
e4bfaec4d6
|
@ -13,14 +13,14 @@ This leads to different results depending on the version in use. `RapidFuzz` alw
|
|||
fallback implementation and the C++ based implementation to provide consistent matching results.
|
||||
|
||||
## partial_ratio implementation
|
||||
`fuzzywuzzy` uses searches fo the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
|
||||
`fuzzywuzzy` searches for the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
|
||||
1) `difflib.SequenceMatcher.get_matching_blocks` (based on Ratcliff and Obershelp algorithm)
|
||||
2) `Levenshtein.matching_blocks` (backtracks Levenshtein matrix)
|
||||
|
||||
This implementation has a couple of issues:
|
||||
1) in the pure Python implementation the automatic junk heuristic of difflib is not deactivated. This heuristic improves the performance for long strings,
|
||||
but can lead to completely incorrect results.
|
||||
2) the accellerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
|
||||
2) the accelerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
|
||||
one of multiple optimal alignment. There is no guarantee for this alignment to include the longest common substring.
|
||||
3) the optimal substring is assumed to start at one of these `matching_blocks`. However this is not guaranteed.
|
||||
|
||||
|
@ -63,4 +63,4 @@ In `RapidFuzz` these functions are sometimes available under different names:
|
|||
- `extractOne` is available under the same name
|
||||
- `dedupe` is not available
|
||||
|
||||
In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
|
||||
In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
|
||||
|
|
|
@ -88,7 +88,6 @@ if(RAPIDFUZZ_ARCH_X86)
|
|||
install(TARGETS fuzz_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz)
|
||||
endif()
|
||||
|
||||
|
||||
create_cython_target(process_cpp_impl)
|
||||
rf_add_library(process_cpp_impl ${process_cpp_impl})
|
||||
target_compile_features(process_cpp_impl PUBLIC cxx_std_17)
|
||||
|
@ -145,19 +144,20 @@ if(NOT Windows)
|
|||
HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
|
||||
if(NOT HAVE_CXX_ATOMICS_INT_WITH_LIB)
|
||||
message(
|
||||
FATAL_ERROR "No native support for std::atomic<int>, or libatomic not found!"
|
||||
)
|
||||
FATAL_ERROR
|
||||
"No native support for std::atomic<int>, or libatomic not found!")
|
||||
elseif(NOT HAVE_CXX_ATOMICS_SIZE_T_WITH_LIB)
|
||||
message(
|
||||
FATAL_ERROR "No native support for std::atomic<size_t>, or libatomic not found!"
|
||||
)
|
||||
FATAL_ERROR
|
||||
"No native support for std::atomic<size_t>, or libatomic not found!")
|
||||
elseif(NOT HAVE_CXX_ATOMICS_VOID_PTR_WITH_LIB)
|
||||
message(
|
||||
FATAL_ERROR "No native support for std::atomic<void*>, or libatomic not found!"
|
||||
)
|
||||
FATAL_ERROR
|
||||
"No native support for std::atomic<void*>, or libatomic not found!")
|
||||
elseif(NOT HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
|
||||
message(
|
||||
FATAL_ERROR "No native support for std::atomic<unsigned>, or libatomic not found!"
|
||||
FATAL_ERROR
|
||||
"No native support for std::atomic<unsigned>, or libatomic not found!"
|
||||
)
|
||||
else()
|
||||
message(STATUS "Linking with libatomic for atomics support")
|
||||
|
|
|
@ -43,3 +43,31 @@ def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> Sequence[H
|
|||
return s1, s2
|
||||
|
||||
return conv_sequence(s1), conv_sequence(s2)
|
||||
|
||||
|
||||
def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
|
||||
prefix_len = 0
|
||||
for ch1, ch2 in zip(s1, s2):
|
||||
if ch1 != ch2:
|
||||
break
|
||||
|
||||
prefix_len += 1
|
||||
|
||||
return prefix_len
|
||||
|
||||
|
||||
def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
|
||||
suffix_len = 0
|
||||
for ch1, ch2 in zip(reversed(s1), reversed(s2)):
|
||||
if ch1 != ch2:
|
||||
break
|
||||
|
||||
suffix_len += 1
|
||||
|
||||
return suffix_len
|
||||
|
||||
|
||||
def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> (int, int):
|
||||
prefix_len = common_prefix(s1, s2)
|
||||
suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:])
|
||||
return (prefix_len, suffix_len)
|
||||
|
|
|
@ -87,4 +87,3 @@ if(RAPIDFUZZ_ARCH_X86)
|
|||
target_link_libraries(metrics_cpp_sse2 PRIVATE rapidfuzz::rapidfuzz)
|
||||
install(TARGETS metrics_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz/distance)
|
||||
endif()
|
||||
|
||||
|
|
|
@ -9,6 +9,8 @@ from rapidfuzz._common_py import conv_sequences
|
|||
from rapidfuzz._utils import is_none
|
||||
from rapidfuzz.distance._initialize_py import Editops, Opcodes
|
||||
from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity
|
||||
from rapidfuzz.distance.LCSseq_py import editops as lcs_seq_editops
|
||||
from rapidfuzz.distance.LCSseq_py import opcodes as lcs_seq_opcodes
|
||||
from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity
|
||||
|
||||
|
||||
|
@ -300,12 +302,7 @@ def editops(
|
|||
insert s1[4] s2[2]
|
||||
insert s1[6] s2[5]
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
raise NotImplementedError
|
||||
return lcs_seq_editops(s1, s2, processor=processor)
|
||||
|
||||
|
||||
def opcodes(
|
||||
|
@ -358,4 +355,4 @@ def opcodes(
|
|||
equal a[4:6] (cd) b[3:5] (cd)
|
||||
insert a[6:6] () b[5:6] (f)
|
||||
"""
|
||||
return editops(s1, s2, processor=processor).as_opcodes()
|
||||
return lcs_seq_opcodes(s1, s2, processor=processor)
|
||||
|
|
|
@ -5,9 +5,9 @@ from __future__ import annotations
|
|||
|
||||
from typing import Callable, Hashable, Sequence
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._common_py import common_affix, conv_sequences
|
||||
from rapidfuzz._utils import is_none
|
||||
from rapidfuzz.distance._initialize_py import Editops, Opcodes
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes
|
||||
|
||||
|
||||
def similarity(
|
||||
|
@ -254,6 +254,30 @@ def normalized_similarity(
|
|||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]):
|
||||
if not s1:
|
||||
return (0, [])
|
||||
|
||||
S = (1 << len(s1)) - 1
|
||||
block: dict[Hashable, int] = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
matrix = []
|
||||
for ch2 in s2:
|
||||
Matches = block_get(ch2, 0)
|
||||
u = S & Matches
|
||||
S = (S + u) | (S - u)
|
||||
matrix.append(S)
|
||||
|
||||
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
|
||||
sim = bin(S)[-len(s1) :].count("0")
|
||||
return (sim, matrix)
|
||||
|
||||
|
||||
def editops(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
|
@ -298,8 +322,56 @@ def editops(
|
|||
insert s1[4] s2[2]
|
||||
insert s1[6] s2[5]
|
||||
"""
|
||||
_ = s1, s2, processor
|
||||
raise NotImplementedError
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
prefix_len, suffix_len = common_affix(s1, s2)
|
||||
s1 = s1[prefix_len : len(s1) - suffix_len]
|
||||
s2 = s2[prefix_len : len(s2) - suffix_len]
|
||||
sim, matrix = _matrix(s1, s2)
|
||||
|
||||
editops = Editops([], 0, 0)
|
||||
editops._src_len = len(s1) + prefix_len + suffix_len
|
||||
editops._dest_len = len(s2) + prefix_len + suffix_len
|
||||
|
||||
dist = len(s1) + len(s2) - 2 * sim
|
||||
if dist == 0:
|
||||
return editops
|
||||
|
||||
editop_list = [None] * dist
|
||||
col = len(s1)
|
||||
row = len(s2)
|
||||
while row != 0 and col != 0:
|
||||
# deletion
|
||||
if matrix[row - 1] & (1 << (col - 1)):
|
||||
dist -= 1
|
||||
col -= 1
|
||||
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
|
||||
else:
|
||||
row -= 1
|
||||
|
||||
# insertion
|
||||
if row and not (matrix[row - 1] & (1 << (col - 1))):
|
||||
dist -= 1
|
||||
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
|
||||
# match
|
||||
else:
|
||||
col -= 1
|
||||
|
||||
while col != 0:
|
||||
dist -= 1
|
||||
col -= 1
|
||||
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
|
||||
|
||||
while row != 0:
|
||||
dist -= 1
|
||||
row -= 1
|
||||
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
|
||||
|
||||
editops._editops = editop_list
|
||||
return editops
|
||||
|
||||
|
||||
def opcodes(
|
||||
|
|
|
@ -342,7 +342,7 @@ class Editops:
|
|||
|
||||
This is the equivalent of ``[x for x in editops]``
|
||||
"""
|
||||
return self._editops
|
||||
return [tuple(op) for op in self._editops]
|
||||
|
||||
def copy(self) -> Editops:
|
||||
"""
|
||||
|
@ -472,15 +472,15 @@ class Editops:
|
|||
|
||||
for op in self._editops:
|
||||
# matches between last and current editop
|
||||
while src_pos < op.dest_pos:
|
||||
while src_pos < op.src_pos:
|
||||
res_str += source_string[src_pos]
|
||||
src_pos += 1
|
||||
|
||||
if op.tag == "replace":
|
||||
res_str += destination_string[src_pos]
|
||||
res_str += destination_string[op.dest_pos]
|
||||
src_pos += 1
|
||||
elif op.tag == "insert":
|
||||
res_str += destination_string[src_pos]
|
||||
res_str += destination_string[op.dest_pos]
|
||||
elif op.tag == "delete":
|
||||
src_pos += 1
|
||||
|
||||
|
@ -618,7 +618,7 @@ class Opcode:
|
|||
|
||||
def __repr__(self) -> str:
|
||||
return (
|
||||
f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, "
|
||||
f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
|
||||
f"dest_start={self.dest_start}, dest_end={self.dest_end})"
|
||||
)
|
||||
|
||||
|
@ -711,7 +711,7 @@ class Opcodes:
|
|||
|
||||
This is the equivalent of ``[x for x in opcodes]``
|
||||
"""
|
||||
return self._opcodes[::]
|
||||
return [tuple(op) for op in self._opcodes]
|
||||
|
||||
def copy(self) -> Opcodes:
|
||||
"""
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from math import ceil
|
||||
import itertools
|
||||
from typing import Any, Callable, Hashable, Sequence
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
|
@ -35,7 +34,7 @@ def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float:
|
|||
|
||||
|
||||
def _split_sequence(seq: Sequence[Hashable]) -> list[Sequence[Hashable]]:
|
||||
if isinstance(seq, str) or isinstance(seq, bytes):
|
||||
if isinstance(seq, (str, bytes)):
|
||||
return seq.split()
|
||||
|
||||
splitted_seq = [[]]
|
||||
|
@ -60,7 +59,7 @@ def _join_splitted_sequence(seq_list: list[Sequence[Hashable]]):
|
|||
joined = []
|
||||
for seq in seq_list:
|
||||
joined += seq
|
||||
joined += [ord(' ')]
|
||||
joined += [ord(" ")]
|
||||
return joined[:-1]
|
||||
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
#pragma once
|
||||
#include "cpp_common.hpp"
|
||||
#include "rapidfuzz.h"
|
||||
#include "taskflow/taskflow.hpp"
|
||||
#include "taskflow/algorithm/for_each.hpp"
|
||||
#include "taskflow/taskflow.hpp"
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
|
|
|
@ -40,7 +40,10 @@ def call_and_maybe_catch(call, *args, **kwargs):
|
|||
|
||||
|
||||
def compare_exceptions(e1, e2):
|
||||
return type(e1) is type(e2) and str(e1) == str(e2)
|
||||
try:
|
||||
return str(e1) == str(e2)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def scorer_tester(scorer, s1, s2, **kwargs):
|
||||
|
@ -156,6 +159,8 @@ class Scorer:
|
|||
similarity: Any
|
||||
normalized_distance: Any
|
||||
normalized_similarity: Any
|
||||
editops: Any
|
||||
opcodes: Any
|
||||
|
||||
|
||||
class GenericScorer:
|
||||
|
@ -185,6 +190,28 @@ class GenericScorer:
|
|||
|
||||
self.get_scorer_flags = get_scorer_flags
|
||||
|
||||
def _editops(self, s1, s2, **kwargs):
|
||||
results = [call_and_maybe_catch(scorer.editops, s1, s2, **kwargs) for scorer in self.scorers]
|
||||
|
||||
for result in results:
|
||||
assert compare_exceptions(result, results[0])
|
||||
|
||||
if any(isinstance(result, Exception) for result in results):
|
||||
raise results[0]
|
||||
|
||||
return results[0]
|
||||
|
||||
def _opcodes(self, s1, s2, **kwargs):
|
||||
results = [call_and_maybe_catch(scorer.opcodes, s1, s2, **kwargs) for scorer in self.scorers]
|
||||
|
||||
for result in results:
|
||||
assert compare_exceptions(result, results[0])
|
||||
|
||||
if any(isinstance(result, Exception) for result in results):
|
||||
raise results[0]
|
||||
|
||||
return results[0]
|
||||
|
||||
def _distance(self, s1, s2, **kwargs):
|
||||
symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"]
|
||||
tester = symmetric_scorer_tester if symmetric else scorer_tester
|
||||
|
@ -303,3 +330,17 @@ class GenericScorer:
|
|||
if "score_cutoff" not in kwargs:
|
||||
return norm_sim
|
||||
return self._normalized_similarity(s1, s2, **kwargs)
|
||||
|
||||
def editops(self, s1, s2, **kwargs):
|
||||
editops_ = self._editops(s1, s2, **kwargs)
|
||||
opcodes_ = self._opcodes(s1, s2, **kwargs)
|
||||
assert opcodes_.as_editops() == editops_
|
||||
assert opcodes_ == editops_.as_opcodes()
|
||||
return editops_
|
||||
|
||||
def opcodes(self, s1, s2, **kwargs):
|
||||
editops_ = self._editops(s1, s2, **kwargs)
|
||||
opcodes_ = self._opcodes(s1, s2, **kwargs)
|
||||
assert opcodes_.as_editops() == editops_
|
||||
assert opcodes_ == editops_.as_opcodes()
|
||||
return opcodes_
|
||||
|
|
|
@ -21,6 +21,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
|
|||
similarity=getattr(metrics_py, func_name + "_similarity"),
|
||||
normalized_distance=getattr(metrics_py, func_name + "_normalized_distance"),
|
||||
normalized_similarity=getattr(metrics_py, func_name + "_normalized_similarity"),
|
||||
editops=getattr(metrics_py, func_name + "_editops", None),
|
||||
opcodes=getattr(metrics_py, func_name + "_opcodes", None),
|
||||
)
|
||||
]
|
||||
|
||||
|
@ -30,6 +32,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
|
|||
similarity=getattr(mod, func_name + "_similarity"),
|
||||
normalized_distance=getattr(mod, func_name + "_normalized_distance"),
|
||||
normalized_similarity=getattr(mod, func_name + "_normalized_similarity"),
|
||||
editops=getattr(metrics_cpp, func_name + "_editops", None),
|
||||
opcodes=getattr(metrics_cpp, func_name + "_opcodes", None),
|
||||
)
|
||||
for mod in cpp_scorer_modules
|
||||
]
|
||||
|
|
|
@ -19,3 +19,37 @@ def test_issue_196():
|
|||
assert Indel.distance("South Korea", "North Korea", score_cutoff=2) == 3
|
||||
assert Indel.distance("South Korea", "North Korea", score_cutoff=1) == 2
|
||||
assert Indel.distance("South Korea", "North Korea", score_cutoff=0) == 1
|
||||
|
||||
|
||||
def test_Editops():
|
||||
"""
|
||||
basic test for Indel.editops
|
||||
"""
|
||||
assert Indel.editops("0", "").as_list() == [("delete", 0, 0)]
|
||||
assert Indel.editops("", "0").as_list() == [("insert", 0, 0)]
|
||||
|
||||
assert Indel.editops("00", "0").as_list() == [("delete", 1, 1)]
|
||||
assert Indel.editops("0", "00").as_list() == [("insert", 1, 1)]
|
||||
|
||||
assert Indel.editops("qabxcd", "abycdf").as_list() == [
|
||||
("delete", 0, 0),
|
||||
("insert", 3, 2),
|
||||
("delete", 3, 3),
|
||||
("insert", 6, 5),
|
||||
]
|
||||
assert Indel.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
|
||||
("insert", 0, 0),
|
||||
("insert", 0, 1),
|
||||
("insert", 0, 2),
|
||||
("insert", 6, 9),
|
||||
("insert", 6, 10),
|
||||
("insert", 6, 11),
|
||||
("insert", 6, 12),
|
||||
("insert", 7, 14),
|
||||
("delete", 7, 15),
|
||||
("delete", 11, 18),
|
||||
]
|
||||
|
||||
ops = Indel.editops("aaabaaa", "abbaaabba")
|
||||
assert ops.src_len == 7
|
||||
assert ops.dest_len == 9
|
||||
|
|
|
@ -7,3 +7,37 @@ def test_basic():
|
|||
assert LCSseq.distance("", "") == 0
|
||||
assert LCSseq.distance("test", "test") == 0
|
||||
assert LCSseq.distance("aaaa", "bbbb") == 4
|
||||
|
||||
|
||||
def test_Editops():
|
||||
"""
|
||||
basic test for LCSseq.editops
|
||||
"""
|
||||
assert LCSseq.editops("0", "").as_list() == [("delete", 0, 0)]
|
||||
assert LCSseq.editops("", "0").as_list() == [("insert", 0, 0)]
|
||||
|
||||
assert LCSseq.editops("00", "0").as_list() == [("delete", 1, 1)]
|
||||
assert LCSseq.editops("0", "00").as_list() == [("insert", 1, 1)]
|
||||
|
||||
assert LCSseq.editops("qabxcd", "abycdf").as_list() == [
|
||||
("delete", 0, 0),
|
||||
("insert", 3, 2),
|
||||
("delete", 3, 3),
|
||||
("insert", 6, 5),
|
||||
]
|
||||
assert LCSseq.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
|
||||
("insert", 0, 0),
|
||||
("insert", 0, 1),
|
||||
("insert", 0, 2),
|
||||
("insert", 6, 9),
|
||||
("insert", 6, 10),
|
||||
("insert", 6, 11),
|
||||
("insert", 6, 12),
|
||||
("insert", 7, 14),
|
||||
("delete", 7, 15),
|
||||
("delete", 11, 18),
|
||||
]
|
||||
|
||||
ops = LCSseq.editops("aaabaaa", "abbaaabba")
|
||||
assert ops.src_len == 7
|
||||
assert ops.dest_len == 9
|
||||
|
|
|
@ -106,6 +106,7 @@ scorers = [
|
|||
fuzz.QRatio,
|
||||
]
|
||||
|
||||
|
||||
def test_no_processor():
|
||||
assert fuzz.ratio("new york mets", "new york mets") == 100
|
||||
assert fuzz.ratio("new york mets", "new YORK mets") != 100
|
||||
|
|
|
@ -239,7 +239,7 @@ def test_indel_editops(s1, s2):
|
|||
"""
|
||||
test Indel.editops with any sizes
|
||||
"""
|
||||
ops = metrics_cpp.indel_editops(s1, s2)
|
||||
ops = Indel.editops(s1, s2)
|
||||
assert ops.apply(s1, s2) == s2
|
||||
|
||||
|
||||
|
@ -249,7 +249,7 @@ def test_indel_editops_block(s1, s2):
|
|||
"""
|
||||
test Indel.editops for long strings
|
||||
"""
|
||||
ops = metrics_cpp.indel_editops(s1, s2)
|
||||
ops = Indel.editops(s1, s2)
|
||||
assert ops.apply(s1, s2) == s2
|
||||
|
||||
|
||||
|
@ -279,7 +279,7 @@ def test_indel_opcodes(s1, s2):
|
|||
"""
|
||||
test Indel.opcodes with any sizes
|
||||
"""
|
||||
ops = metrics_cpp.indel_opcodes(s1, s2)
|
||||
ops = Indel.opcodes(s1, s2)
|
||||
assert ops.apply(s1, s2) == s2
|
||||
|
||||
|
||||
|
@ -289,7 +289,7 @@ def test_indel_opcodes_block(s1, s2):
|
|||
"""
|
||||
test Indel.opcodes for long strings
|
||||
"""
|
||||
ops = metrics_cpp.indel_opcodes(s1, s2)
|
||||
ops = Indel.opcodes(s1, s2)
|
||||
assert ops.apply(s1, s2) == s2
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue