improve hash function

This commit is contained in:
maxbachmann 2022-06-29 14:59:03 +02:00
parent 72e2ca7d95
commit 53434ca085
3 changed files with 9 additions and 2 deletions

View File

@ -12,6 +12,7 @@
- fix implementation of Hamming.normalized_similarity
- fix default score_cutoff of Hamming.similarity
- fix implementation of LCSseq.distance when used in the process module
- fix hash collision between -1 and -2, so sequences containing them are no longer treated as equal
### [2.0.15] - 2022-06-24
#### Fixed

View File

@ -116,6 +116,11 @@ cdef extern from "cpp_common.hpp":
vector[T] vector_slice[T](const vector[T]& vec, int start, int stop, int step) except +
cdef inline uint64_t rf_hash(val) except *:
    # Turn an arbitrary Python object into the unsigned 64-bit code that
    # represents it as a sequence element.
    #
    # A value comparing equal to -1 is given the bit pattern of -1 directly
    # rather than being passed through hash(): CPython never returns -1 from
    # hash() (hash(-1) is -2), so without this special case -1 and -2 would
    # end up with the same code. Every other value uses hash(val)
    # reinterpreted as uint64_t.
    #
    # Exceptions raised by the comparison or by hash() (e.g. for unhashable
    # objects) propagate to the caller via `except *`.
    cdef uint64_t code
    if val == -1:
        code = <uint64_t>-1
    else:
        code = <uint64_t>hash(val)
    return code
cdef inline RF_String hash_array(arr) except *:
# TODO on Cpython this does not require any copies
cdef RF_String s_proc
@ -156,7 +161,7 @@ cdef inline RF_String hash_array(arr) except *:
else: # float/double are hashed
s_proc.kind = RF_StringType.RF_UINT64
for i in range(s_proc.length):
(<uint64_t*>s_proc.data)[i] = <uint64_t>hash(arr[i])
(<uint64_t*>s_proc.data)[i] = rf_hash(arr[i])
except Exception as e:
free(s_proc.data)
s_proc.data = NULL
@ -183,7 +188,7 @@ cdef inline RF_String hash_sequence(seq) except *:
if isinstance(elem, str) and len(elem) == 1:
(<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>elem
else:
(<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem)
(<uint64_t*>s_proc.data)[i] = rf_hash(elem)
except Exception as e:
free(s_proc.data)
s_proc.data = NULL

View File

@ -60,6 +60,7 @@ def test_cross_type_matching():
assert Levenshtein.distance("aaaa", ["a", "a", "a", "a"]) == 0
# todo add support in pure python
assert Levenshtein_cpp.distance("aaaa", [ord("a"), ord("a"), "a", "a"]) == 0
assert Levenshtein_cpp.distance([0, -1], [0, -2]) == 1
def test_word_error_rate():