diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 9ca6fd74c..ddfddc29c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -5,11 +5,12 @@ cimport cython from libc.string cimport memcpy from libc.stdint cimport uint64_t -from murmurhash.mrmr cimport hash64 +from murmurhash.mrmr cimport hash64, hash32 from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t +from libc.stdint cimport uint32_t try: import ujson as json @@ -26,6 +27,10 @@ cdef hash_t _hash_utf8(char* utf8_string, int length): return hash64(utf8_string, length, 1) +cdef uint32_t _hash32_utf8(char* utf8_string, int length): + return hash32(utf8_string, length, 1) + + cdef unicode _decode(const Utf8Str* string): cdef int i, length if string.s[0] < sizeof(string.s) and string.s[0] != 0: @@ -84,7 +89,7 @@ cdef class StringStore: self._resize_at = 10000 self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 - self.is_frozen = False + self.is_frozen = freeze if strings is not None: for string in strings: _ = self[string] @@ -118,12 +123,14 @@ cdef class StringStore: cdef bytes byte_string cdef const Utf8Str* utf8str cdef uint64_t int_id + cdef uint32_t oov_id if isinstance(string_or_id, (int, long)): int_id = string_or_id + oov_id = string_or_id if int_id < self.size: return _decode(&self.c[int_id]) else: - utf8str = self._oov.get(int_id) + utf8str = self._oov.get(oov_id) if utf8str is not NULL: return _decode(utf8str) else: @@ -137,10 +144,12 @@ cdef class StringStore: raise TypeError(type(string_or_id)) utf8str = self._intern_utf8(byte_string, len(byte_string)) if utf8str is NULL: - # TODO: We could get unlucky here, and hash into a value that - # collides with the 'real' strings. All we have to do is offset - # I think? - return _hash_utf8(byte_string, len(byte_string)) + # TODO: We need to use 32 bit here, for compatibility with the + # vocabulary values. This makes birthday paradox probabilities + # pretty bad. + # We could also get unlucky here, and hash into a value that + # collides with the 'real' strings. + return _hash32_utf8(byte_string, len(byte_string)) else: return utf8str - self.c @@ -199,11 +208,13 @@ cdef class StringStore: if value is not NULL: return value if self.is_frozen: + # OOV store uses 32 bit hashes. Pretty ugly :( + key32 = _hash32_utf8(utf8_string, length) # Important: Make the OOV store own the memory. That way it's trivial # to flush them all. value = self._oov.mem.alloc(1, sizeof(Utf8Str)) value[0] = _allocate(self._oov.mem, utf8_string, length) - self._oov.set(key, value) + self._oov.set(key32, value) return NULL if self.size == self._resize_at: