From 72e2ca7d954dc17a8b5ebbf274ac5ba190fe9ec1 Mon Sep 17 00:00:00 2001 From: maxbachmann Date: Wed, 29 Jun 2022 11:34:51 +0200 Subject: [PATCH] fix some issues in hamming distance --- CHANGELOG.md | 3 ++ extern/rapidfuzz-cpp | 2 +- rapidfuzz/distance/Hamming_cpp.pyx | 4 +- tests/distance/test_Hamming.py | 83 ++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 tests/distance/test_Hamming.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 416feb3..695fbb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ #### Fixed - fuzz.partial_ratio_alignment ignored the score_cutoff +- fix implementation of Hamming.normalized_similarity +- fix default score_cutoff of Hamming.similarity +- fix implementation of LCSseq.distance when used in the process module ### [2.0.15] - 2022-06-24 #### Fixed diff --git a/extern/rapidfuzz-cpp b/extern/rapidfuzz-cpp index db5ac86..13d7c77 160000 --- a/extern/rapidfuzz-cpp +++ b/extern/rapidfuzz-cpp @@ -1 +1 @@ -Subproject commit db5ac864bcb2890556c0e5b3c0a13335e65ac3e3 +Subproject commit 13d7c7751b23dd2ef1efb340572661972941675c diff --git a/rapidfuzz/distance/Hamming_cpp.pyx b/rapidfuzz/distance/Hamming_cpp.pyx index b0911b5..b820b86 100644 --- a/rapidfuzz/distance/Hamming_cpp.pyx +++ b/rapidfuzz/distance/Hamming_cpp.pyx @@ -103,7 +103,7 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): ValueError If s1 and s2 have a different length """ - cdef int64_t c_score_cutoff = INT64_MAX if score_cutoff is None else score_cutoff + cdef int64_t c_score_cutoff = 0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc if c_score_cutoff < 0: @@ -187,7 +187,7 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): raise ValueError("score_cutoff has to be >= 0") preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) - return hamming_normalized_distance_func(s1_proc.string, s2_proc.string, c_score_cutoff) + return 
hamming_normalized_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) cdef bool NoKwargsInit(RF_Kwargs* self, dict kwargs) except False: diff --git a/tests/distance/test_Hamming.py b/tests/distance/test_Hamming.py new file mode 100644 index 0000000..3e6c34b --- /dev/null +++ b/tests/distance/test_Hamming.py @@ -0,0 +1,83 @@ +import unittest + +from rapidfuzz.distance import Hamming_cpp, Hamming_py + + +def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + +class Hamming: + @staticmethod + def distance(*args, **kwargs): + dist1 = Hamming_cpp.distance(*args, **kwargs) + dist2 = Hamming_py.distance(*args, **kwargs) + assert dist1 == dist2 + return dist1 + + @staticmethod + def similarity(*args, **kwargs): + dist1 = Hamming_cpp.similarity(*args, **kwargs) + dist2 = Hamming_py.similarity(*args, **kwargs) + assert dist1 == dist2 + return dist1 + + @staticmethod + def normalized_distance(*args, **kwargs): + dist1 = Hamming_cpp.normalized_distance(*args, **kwargs) + dist2 = Hamming_py.normalized_distance(*args, **kwargs) + assert isclose(dist1, dist2) + return dist1 + + @staticmethod + def normalized_similarity(*args, **kwargs): + dist1 = Hamming_cpp.normalized_similarity(*args, **kwargs) + dist2 = Hamming_py.normalized_similarity(*args, **kwargs) + assert isclose(dist1, dist2) + return dist1 + + +def test_empty_string(): + """ + when both strings are empty this is a perfect match + """ + assert Hamming.distance("", "") == 0 + assert Hamming.similarity("", "") == 0 + assert Hamming.normalized_distance("", "") == 0.0 + assert Hamming.normalized_similarity("", "") == 1.0 + + +def test_similar_strings(): + """ + Test similar strings + """ + assert Hamming.distance("test", "test") == 0 + assert Hamming.similarity("test", "test") == 4 + assert Hamming.normalized_distance("test", "test") == 0 + assert Hamming.normalized_similarity("test", "test") == 1.0 + + +def test_different_strings(): + """ + Test 
completely different strings + """ + assert Hamming.distance("aaaa", "bbbb") == 4 + assert Hamming.similarity("aaaa", "bbbb") == 0 + assert Hamming.normalized_distance("aaaa", "bbbb") == 1.0 + assert Hamming.normalized_similarity("aaaa", "bbbb") == 0.0 + + +def test_score_cutoff(): + """ + test whether score_cutoff works correctly + """ + assert Hamming.distance("South Korea", "North Korea") == 2 + assert Hamming.distance("South Korea", "North Korea", score_cutoff=4) == 2 + assert Hamming.distance("South Korea", "North Korea", score_cutoff=3) == 2 + assert Hamming.distance("South Korea", "North Korea", score_cutoff=2) == 2 + assert Hamming.distance("South Korea", "North Korea", score_cutoff=1) == 2 + assert Hamming.distance("South Korea", "North Korea", score_cutoff=0) == 1 + + +if __name__ == "__main__": + unittest.main()