mirror of https://github.com/explosion/spaCy.git
parent
6977a2b8cd
commit
62fc6b1afa
|
@ -5,11 +5,12 @@ cimport cython
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64, hash32
|
||||||
|
|
||||||
from preshed.maps cimport map_iter, key_t
|
from preshed.maps cimport map_iter, key_t
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ujson as json
|
import ujson as json
|
||||||
|
@ -26,6 +27,10 @@ cdef hash_t _hash_utf8(char* utf8_string, int length):
|
||||||
return hash64(utf8_string, length, 1)
|
return hash64(utf8_string, length, 1)
|
||||||
|
|
||||||
|
|
||||||
|
cdef uint32_t _hash32_utf8(char* utf8_string, int length):
|
||||||
|
return hash32(utf8_string, length, 1)
|
||||||
|
|
||||||
|
|
||||||
cdef unicode _decode(const Utf8Str* string):
|
cdef unicode _decode(const Utf8Str* string):
|
||||||
cdef int i, length
|
cdef int i, length
|
||||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||||
|
@ -84,7 +89,7 @@ cdef class StringStore:
|
||||||
self._resize_at = 10000
|
self._resize_at = 10000
|
||||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||||
self.size = 1
|
self.size = 1
|
||||||
self.is_frozen = False
|
self.is_frozen = freeze
|
||||||
if strings is not None:
|
if strings is not None:
|
||||||
for string in strings:
|
for string in strings:
|
||||||
_ = self[string]
|
_ = self[string]
|
||||||
|
@ -118,12 +123,14 @@ cdef class StringStore:
|
||||||
cdef bytes byte_string
|
cdef bytes byte_string
|
||||||
cdef const Utf8Str* utf8str
|
cdef const Utf8Str* utf8str
|
||||||
cdef uint64_t int_id
|
cdef uint64_t int_id
|
||||||
|
cdef uint32_t oov_id
|
||||||
if isinstance(string_or_id, (int, long)):
|
if isinstance(string_or_id, (int, long)):
|
||||||
int_id = string_or_id
|
int_id = string_or_id
|
||||||
|
oov_id = string_or_id
|
||||||
if int_id < <uint64_t>self.size:
|
if int_id < <uint64_t>self.size:
|
||||||
return _decode(&self.c[int_id])
|
return _decode(&self.c[int_id])
|
||||||
else:
|
else:
|
||||||
utf8str = <Utf8Str*>self._oov.get(int_id)
|
utf8str = <Utf8Str*>self._oov.get(oov_id)
|
||||||
if utf8str is not NULL:
|
if utf8str is not NULL:
|
||||||
return _decode(utf8str)
|
return _decode(utf8str)
|
||||||
else:
|
else:
|
||||||
|
@ -137,10 +144,12 @@ cdef class StringStore:
|
||||||
raise TypeError(type(string_or_id))
|
raise TypeError(type(string_or_id))
|
||||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||||
if utf8str is NULL:
|
if utf8str is NULL:
|
||||||
# TODO: We could get unlucky here, and hash into a value that
|
# TODO: We need to use 32 bit here, for compatibility with the
|
||||||
# collides with the 'real' strings. All we have to do is offset
|
# vocabulary values. This makes birthday paradox probabilities
|
||||||
# I think?
|
# pretty bad.
|
||||||
return _hash_utf8(byte_string, len(byte_string))
|
# We could also get unlucky here, and hash into a value that
|
||||||
|
# collides with the 'real' strings.
|
||||||
|
return _hash32_utf8(byte_string, len(byte_string))
|
||||||
else:
|
else:
|
||||||
return utf8str - self.c
|
return utf8str - self.c
|
||||||
|
|
||||||
|
@ -199,11 +208,13 @@ cdef class StringStore:
|
||||||
if value is not NULL:
|
if value is not NULL:
|
||||||
return value
|
return value
|
||||||
if self.is_frozen:
|
if self.is_frozen:
|
||||||
|
# OOV store uses 32 bit hashes. Pretty ugly :(
|
||||||
|
key32 = _hash32_utf8(utf8_string, length)
|
||||||
# Important: Make the OOV store own the memory. That way it's trivial
|
# Important: Make the OOV store own the memory. That way it's trivial
|
||||||
# to flush them all.
|
# to flush them all.
|
||||||
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
|
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
|
||||||
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||||
self._oov.set(key, value)
|
self._oov.set(key32, value)
|
||||||
return NULL
|
return NULL
|
||||||
|
|
||||||
if self.size == self._resize_at:
|
if self.size == self._resize_at:
|
||||||
|
|
Loading…
Reference in New Issue