fix some issues in hamming distance

This commit is contained in:
maxbachmann 2022-06-29 11:34:51 +02:00
parent 531512e0fa
commit 72e2ca7d95
4 changed files with 89 additions and 3 deletions

View File

@ -9,6 +9,9 @@
#### Fixed
- fuzz.partial_ratio_alignment ignored the score_cutoff
- fix implementation of Hamming.normalized_similarity
- fix default score_cutoff of Hamming.similarity
- fix implementation of LCSseq.distance when used in the process module
### [2.0.15] - 2022-06-24
#### Fixed

@ -1 +1 @@
Subproject commit db5ac864bcb2890556c0e5b3c0a13335e65ac3e3
Subproject commit 13d7c7751b23dd2ef1efb340572661972941675c

View File

@ -103,7 +103,7 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None):
ValueError
If s1 and s2 have a different length
"""
cdef int64_t c_score_cutoff = INT64_MAX if score_cutoff is None else score_cutoff
cdef int64_t c_score_cutoff = 0 if score_cutoff is None else score_cutoff
cdef RF_StringWrapper s1_proc, s2_proc
if c_score_cutoff < 0:
@ -187,7 +187,7 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None):
raise ValueError("score_cutoff has to be >= 0")
preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None)
return hamming_normalized_distance_func(s1_proc.string, s2_proc.string, c_score_cutoff)
return hamming_normalized_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff)
cdef bool NoKwargsInit(RF_Kwargs* self, dict kwargs) except False:

View File

@ -0,0 +1,83 @@
import unittest
from rapidfuzz.distance import Hamming_cpp, Hamming_py
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """Return True when ``a`` and ``b`` are approximately equal.

    Delegates to :func:`math.isclose`, which applies the same symmetric
    formula as the previous hand-written version for finite values::

        abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    but additionally handles infinities correctly: an infinity is only close
    to itself, never to a finite number.

    Parameters
    ----------
    a, b : float
        Values to compare.
    rel_tol : float, optional
        Maximum allowed difference relative to the larger magnitude of
        ``a`` and ``b``. Default is 1e-09.
    abs_tol : float, optional
        Minimum absolute tolerance, useful for comparisons near zero.
        Default is 0.0.

    Returns
    -------
    bool
        True if the two values are considered close, False otherwise.
    """
    # Imported locally so the helper stays self-contained.
    import math

    return math.isclose(a, b, rel_tol=rel_tol, abs_tol=abs_tol)
class Hamming:
    """Cross-checking facade over ``Hamming_cpp`` and ``Hamming_py``.

    Every method forwards its arguments unchanged to the same-named function
    of ``Hamming_cpp`` and then of ``Hamming_py``, asserts that both calls
    produced matching results, and returns the ``Hamming_cpp`` result.
    """

    @staticmethod
    def distance(*args, **kwargs):
        """Return ``Hamming_cpp.distance(...)`` after checking it equals ``Hamming_py``."""
        cpp_result = Hamming_cpp.distance(*args, **kwargs)
        py_result = Hamming_py.distance(*args, **kwargs)
        assert cpp_result == py_result
        return cpp_result

    @staticmethod
    def similarity(*args, **kwargs):
        """Return ``Hamming_cpp.similarity(...)`` after checking it equals ``Hamming_py``."""
        cpp_result = Hamming_cpp.similarity(*args, **kwargs)
        py_result = Hamming_py.similarity(*args, **kwargs)
        assert cpp_result == py_result
        return cpp_result

    @staticmethod
    def normalized_distance(*args, **kwargs):
        """Return ``Hamming_cpp.normalized_distance(...)``.

        The two results are compared with ``isclose`` rather than ``==``.
        """
        cpp_result = Hamming_cpp.normalized_distance(*args, **kwargs)
        py_result = Hamming_py.normalized_distance(*args, **kwargs)
        assert isclose(cpp_result, py_result)
        return cpp_result

    @staticmethod
    def normalized_similarity(*args, **kwargs):
        """Return ``Hamming_cpp.normalized_similarity(...)``.

        The two results are compared with ``isclose`` rather than ``==``.
        """
        cpp_result = Hamming_cpp.normalized_similarity(*args, **kwargs)
        py_result = Hamming_py.normalized_similarity(*args, **kwargs)
        assert isclose(cpp_result, py_result)
        return cpp_result
def test_empty_string():
    """Two empty strings are expected to be a perfect match."""
    left, right = "", ""

    # Absolute metrics: nothing differs and there is nothing to match.
    assert Hamming.distance(left, right) == 0
    assert Hamming.similarity(left, right) == 0

    # Normalized metrics.
    assert Hamming.normalized_distance(left, right) == 0.0
    assert Hamming.normalized_similarity(left, right) == 1.0
def test_similar_strings():
    """Identical strings: zero distance, similarity equal to the length."""
    word = "test"

    assert Hamming.distance(word, word) == 0
    assert Hamming.similarity(word, word) == 4
    assert Hamming.normalized_distance(word, word) == 0
    assert Hamming.normalized_similarity(word, word) == 1.0
def test_different_strings():
    """Completely different strings of equal length differ at every position."""
    first, second = "aaaa", "bbbb"

    assert Hamming.distance(first, second) == 4
    assert Hamming.similarity(first, second) == 0
    assert Hamming.normalized_distance(first, second) == 1.0
    assert Hamming.normalized_similarity(first, second) == 0.0
def test_score_cutoff():
    """Check the value ``Hamming.distance`` reports for various ``score_cutoff`` values."""
    s1, s2 = "South Korea", "North Korea"

    # Without a cutoff the distance is 2.
    assert Hamming.distance(s1, s2) == 2

    # Cutoffs 4 down to 1 are all expected to yield 2.
    for cutoff in (4, 3, 2, 1):
        assert Hamming.distance(s1, s2, score_cutoff=cutoff) == 2

    # A cutoff of 0 is expected to yield 1.
    # NOTE(review): presumably score_cutoff + 1 is returned whenever the
    # distance exceeds the cutoff -- confirm against the library docs.
    assert Hamming.distance(s1, s2, score_cutoff=0) == 1
# Allow running this file directly as a script.
# NOTE(review): unittest.main() only collects unittest.TestCase subclasses
# (or a load_tests hook). The tests visible in this file are plain
# module-level functions, so this call appears to run zero tests -- confirm,
# and prefer running the file through pytest.
if __name__ == "__main__":
    unittest.main()