diff --git a/CHANGELOG.md b/CHANGELOG.md index 695fbb9..687c5e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - fix implementation of Hamming.normalized_similarity - fix default score_cutoff of Hamming.similarity - fix implementation of LCSseq.distance when used in the process module +- treat the hashes of -1 and -2 as different when hashing sequence elements ### [2.0.15] - 2022-06-24 #### Fixed diff --git a/rapidfuzz/cpp_common.pxd b/rapidfuzz/cpp_common.pxd index e80153d..298cc4c 100644 --- a/rapidfuzz/cpp_common.pxd +++ b/rapidfuzz/cpp_common.pxd @@ -116,6 +116,11 @@ cdef extern from "cpp_common.hpp": vector[T] vector_slice[T](const vector[T]& vec, int start, int stop, int step) except + +cdef inline uint64_t rf_hash(val) except *: + if val == -1: + return -1 + return hash(val) + cdef inline RF_String hash_array(arr) except *: # TODO on Cpython this does not require any copies cdef RF_String s_proc @@ -156,7 +161,7 @@ cdef inline RF_String hash_array(arr) except *: else: # float/double are hashed s_proc.kind = RF_StringType.RF_UINT64 for i in range(s_proc.length): - (s_proc.data)[i] = hash(arr[i]) + (s_proc.data)[i] = rf_hash(arr[i]) except Exception as e: free(s_proc.data) s_proc.data = NULL @@ -183,7 +188,7 @@ cdef inline RF_String hash_sequence(seq) except *: if isinstance(elem, str) and len(elem) == 1: (s_proc.data)[i] = elem else: - (s_proc.data)[i] = hash(elem) + (s_proc.data)[i] = rf_hash(elem) except Exception as e: free(s_proc.data) s_proc.data = NULL diff --git a/tests/distance/test_Levenshtein.py b/tests/distance/test_Levenshtein.py index 0fdedaa..f6b584c 100644 --- a/tests/distance/test_Levenshtein.py +++ b/tests/distance/test_Levenshtein.py @@ -60,6 +60,7 @@ def test_cross_type_matching(): assert Levenshtein.distance("aaaa", ["a", "a", "a", "a"]) == 0 # todo add support in pure python assert Levenshtein_cpp.distance("aaaa", [ord("a"), ord("a"), "a", "a"]) == 0 + assert Levenshtein_cpp.distance([0, -1], [0, -2]) == 1 def test_word_error_rate():