From 837d241b686a8fa71fb79a5cbdaf65c178554772 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 20 Dec 2021 17:11:31 +0100
Subject: [PATCH] Make floret murmurhash endian-neutral (#9735)

---
 spacy/vectors.pyx | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 1b985a638..345e8df68 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,5 +1,5 @@
 cimport numpy as np
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
 from cython.operator cimport dereference as deref
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64
@@ -353,12 +353,18 @@ cdef class Vectors:
         key (str): The string key.
         RETURNS: A list of the integer hashes.
         """
-        cdef uint32_t[4] out
+        # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
+        cdef uint64_t[2] out
         chars = s.encode("utf8")
         cdef char* utf8_string = chars
         hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
-        rows = [out[i] for i in range(min(self.hash_count, 4))]
-        return rows
+        rows = [
+            out[0] & 0xffffffffu,
+            out[0] >> 32,
+            out[1] & 0xffffffffu,
+            out[1] >> 32,
+        ]
+        return rows[:min(self.hash_count, 4)]
 
     def _get_ngrams(self, unicode key):
         """Get all padded ngram strings using the ngram settings.