Fix lookup of symbols in vocab.

This commit is contained in:
Matthew Honnibal 2018-08-15 23:43:34 +02:00
parent b9f0588580
commit 8365226bf3
1 changed file with 5 additions and 5 deletions

View File

@@ -9,7 +9,6 @@ from collections import OrderedDict
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.token cimport Token from .tokens.token cimport Token
from .attrs cimport PROB, LANG, ORTH, TAG from .attrs cimport PROB, LANG, ORTH, TAG
@@ -116,10 +115,11 @@ cdef class Vocab:
if string == u'': if string == u'':
return &EMPTY_LEXEME return &EMPTY_LEXEME
cdef LexemeC* lex cdef LexemeC* lex
cdef hash_t key = hash_string(string) cdef hash_t key = self.strings[string]
lex = <LexemeC*>self._by_orth.get(key) lex = <LexemeC*>self._by_orth.get(key)
cdef size_t addr cdef size_t addr
if lex != NULL: if lex != NULL:
assert lex.orth in self.strings
if lex.orth != key: if lex.orth != key:
raise KeyError(Errors.E064.format(string=lex.orth, raise KeyError(Errors.E064.format(string=lex.orth,
orth=key, orth_id=string)) orth=key, orth_id=string))
@@ -142,7 +142,6 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth]) return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key
if len(string) < 3 or self.length < 10000: if len(string) < 3 or self.length < 10000:
mem = self.mem mem = self.mem
cdef bint is_oov = mem is not self.mem cdef bint is_oov = mem is not self.mem
@@ -180,9 +179,9 @@ cdef class Vocab:
""" """
cdef hash_t int_key cdef hash_t int_key
if isinstance(key, bytes): if isinstance(key, bytes):
int_key = hash_string(key.decode('utf8')) int_key = self.strings[key.decode('utf8')]
elif isinstance(key, unicode): elif isinstance(key, unicode):
int_key = hash_string(key) int_key = self.strings[key]
else: else:
int_key = key int_key = key
lex = self._by_orth.get(int_key) lex = self._by_orth.get(int_key)
@@ -225,6 +224,7 @@ cdef class Vocab:
cdef int i cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings): for i, props in enumerate(substrings):
self.strings.add(props[ORTH])
props = intify_attrs(props, strings_map=self.strings, props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True) _do_deprecated=True)
token = &tokens[i] token = &tokens[i]