Use 32 bit hashes for OOV, re Issue #589, Issue #285

2016-11-01 13:27:13 +01:00 · 2016-11-01 13:27:13 +01:00 · 62fc6b1afa
parent 6977a2b8cd
commit 62fc6b1afa
1 changed files with 19 additions and 8 deletions
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -5,11 +5,12 @@ cimport cython
 from libc.string cimport memcpy
 from libc.stdint cimport uint64_t
-from murmurhash.mrmr cimport hash64
+from murmurhash.mrmr cimport hash64, hash32
 from preshed.maps cimport map_iter, key_t
 from .typedefs cimport hash_t
 from libc.stdint cimport uint32_t
 try:
    import ujson as json
@ -26,6 +27,10 @@ cdef hash_t _hash_utf8(char* utf8_string, int length):
    return hash64(utf8_string, length, 1)
 cdef uint32_t _hash32_utf8(char* utf8_string, int length):
    return hash32(utf8_string, length, 1)
 cdef unicode _decode(const Utf8Str* string):
    cdef int i, length
    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
@ -84,7 +89,7 @@ cdef class StringStore:
        self._resize_at = 10000
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
-        self.is_frozen = False
+        self.is_frozen = freeze
        if strings is not None:
            for string in strings:
                _ = self[string]
@ -118,12 +123,14 @@ cdef class StringStore:
        cdef bytes byte_string
        cdef const Utf8Str* utf8str
        cdef uint64_t int_id
        cdef uint32_t oov_id
        if isinstance(string_or_id, (int, long)):
            int_id = string_or_id
            oov_id = string_or_id
            if int_id < <uint64_t>self.size:
                return _decode(&self.c[int_id])
            else:
-                utf8str = <Utf8Str*>self._oov.get(int_id)
+                utf8str = <Utf8Str*>self._oov.get(oov_id)
                if utf8str is not NULL:
                    return _decode(utf8str)
                else:
@ -137,10 +144,12 @@ cdef class StringStore:
                raise TypeError(type(string_or_id))
            utf8str = self._intern_utf8(byte_string, len(byte_string))
            if utf8str is NULL:
-                # TODO: We could get unlucky here, and hash into a value that
+                # TODO: We need to use 32 bit here, for compatibility with the 
-                # collides with the 'real' strings. All we have to do is offset
+                # vocabulary values. This makes birthday paradox probabilities
-                # I think?
+                # pretty bad.
-                return _hash_utf8(byte_string, len(byte_string))
+                # We could also get unlucky here, and hash into a value that
                # collides with the 'real' strings. 
                return _hash32_utf8(byte_string, len(byte_string))
            else:
                return utf8str - self.c
@ -199,11 +208,13 @@ cdef class StringStore:
        if value is not NULL:
            return value
        if self.is_frozen:
            # OOV store uses 32 bit hashes. Pretty ugly :(
            key32 = _hash32_utf8(utf8_string, length)
            # Important: Make the OOV store own the memory. That way it's trivial
            # to flush them all.
            value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
            value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
-            self._oov.set(key, value)
+            self._oov.set(key32, value)
            return NULL
        if self.size == self._resize_at: