diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 7a4549b4e..1e7912907 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -9,7 +9,6 @@ from collections import OrderedDict from thinc.neural.util import get_array_module from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme -from .strings cimport hash_string from .typedefs cimport attr_t from .tokens.token cimport Token from .attrs cimport PROB, LANG, ORTH, TAG @@ -116,10 +115,11 @@ cdef class Vocab: if string == u'': return &EMPTY_LEXEME cdef LexemeC* lex - cdef hash_t key = hash_string(string) + cdef hash_t key = self.strings[string] lex = self._by_orth.get(key) cdef size_t addr if lex != NULL: + assert lex.orth in self.strings if lex.orth != key: raise KeyError(Errors.E064.format(string=lex.orth, orth=key, orth_id=string)) @@ -142,7 +142,6 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: - cdef hash_t key if len(string) < 3 or self.length < 10000: mem = self.mem cdef bint is_oov = mem is not self.mem @@ -180,9 +179,9 @@ cdef class Vocab: """ cdef hash_t int_key if isinstance(key, bytes): - int_key = hash_string(key.decode('utf8')) + int_key = self.strings[key.decode('utf8')] elif isinstance(key, unicode): - int_key = hash_string(key) + int_key = self.strings[key] else: int_key = key lex = self._by_orth.get(int_key) @@ -225,6 +224,7 @@ cdef class Vocab: cdef int i tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): + self.strings.add(props[ORTH]) props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) token = &tokens[i]