mirror of https://github.com/explosion/spaCy.git
💫 Small efficiency fixes to tokenizer (#2587)
This patch improves tokenizer speed by about 10%, and reduces memory usage in the `Vocab` by removing a redundant index. The `vocab._by_orth` and `vocab._by_hash` maps were indexed on different data in v1, but in v2 the orth and the hash are identical, so only one of them is needed. The patch also fixes an uninitialized variable in the tokenizer, the `has_special` flag. This flag records whether a chunk we're tokenizing triggers a special-case rule. If it does, then we avoid caching within the chunk. Because the flag was uninitialized, this check led to incorrectly rejecting some chunks from the cache.

With the `en_core_web_md` model, we now tokenize the IMDB train data at 503,104 words per second. Prior to this patch, we had 465,764 words per second. Before switching to the `regex` library and supporting more languages, we had 1.3m words per second for the tokenizer. In order to recover the missing speed, we need to:

* Fix the variable-length lookarounds in the suffix, infix and `token_match` rules
* Improve the performance of the `token_match` regex
* Switch back from the `regex` library to the `re` library.

## Checklist

<!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] -->

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent
3c30d1763c
commit
82277f63a3
|
@ -150,7 +150,7 @@ cdef class Tokenizer:
|
|||
cdef vector[LexemeC*] prefixes
|
||||
cdef vector[LexemeC*] suffixes
|
||||
cdef int orig_size
|
||||
cdef int has_special
|
||||
cdef int has_special = 0
|
||||
orig_size = tokens.length
|
||||
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
|
||||
&has_special)
|
||||
|
@ -272,7 +272,7 @@ cdef class Tokenizer:
|
|||
int has_special, int n) except -1:
|
||||
cdef int i
|
||||
for i in range(n):
|
||||
if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
|
||||
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
|
||||
return 0
|
||||
# See https://github.com/explosion/spaCy/issues/1250
|
||||
if has_special:
|
||||
|
|
|
@ -42,5 +42,4 @@ cdef class Vocab:
|
|||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
|
||||
|
||||
cdef PreshMap _by_hash
|
||||
cdef PreshMap _by_orth
|
||||
|
|
|
@ -48,7 +48,6 @@ cdef class Vocab:
|
|||
lemmatizer = Lemmatizer({}, {}, {})
|
||||
self.cfg = {'oov_prob': oov_prob}
|
||||
self.mem = Pool()
|
||||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
self.length = 0
|
||||
|
@ -118,13 +117,12 @@ cdef class Vocab:
|
|||
return &EMPTY_LEXEME
|
||||
cdef LexemeC* lex
|
||||
cdef hash_t key = hash_string(string)
|
||||
lex = <LexemeC*>self._by_hash.get(key)
|
||||
lex = <LexemeC*>self._by_orth.get(key)
|
||||
cdef size_t addr
|
||||
if lex != NULL:
|
||||
if lex.orth != self.strings[string]:
|
||||
if lex.orth != key:
|
||||
raise KeyError(Errors.E064.format(string=lex.orth,
|
||||
orth=self.strings[string],
|
||||
orth_id=string))
|
||||
orth=key, orth_id=string))
|
||||
return lex
|
||||
else:
|
||||
return self._new_lexeme(mem, string)
|
||||
|
@ -165,14 +163,12 @@ cdef class Vocab:
|
|||
elif value is not None:
|
||||
Lexeme.set_struct_attr(lex, attr, value)
|
||||
if not is_oov:
|
||||
key = hash_string(string)
|
||||
self._add_lex_to_vocab(key, lex)
|
||||
self._add_lex_to_vocab(lex.orth, lex)
|
||||
if lex == NULL:
|
||||
raise ValueError(Errors.E085.format(string=string))
|
||||
return lex
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||
self._by_hash.set(key, <void*>lex)
|
||||
self._by_orth.set(lex.orth, <void*>lex)
|
||||
self.length += 1
|
||||
|
||||
|
@ -189,7 +185,7 @@ cdef class Vocab:
|
|||
int_key = hash_string(key)
|
||||
else:
|
||||
int_key = key
|
||||
lex = self._by_hash.get(int_key)
|
||||
lex = self._by_orth.get(int_key)
|
||||
return lex is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -461,7 +457,7 @@ cdef class Vocab:
|
|||
cdef LexemeC* lexeme = NULL
|
||||
cdef SerializedLexemeC lex_data
|
||||
cdef int size = 0
|
||||
for key, addr in self._by_hash.items():
|
||||
for key, addr in self._by_orth.items():
|
||||
if addr == 0:
|
||||
continue
|
||||
size += sizeof(lex_data.data)
|
||||
|
@ -469,7 +465,7 @@ cdef class Vocab:
|
|||
byte_ptr = <unsigned char*>byte_string
|
||||
cdef int j
|
||||
cdef int i = 0
|
||||
for key, addr in self._by_hash.items():
|
||||
for key, addr in self._by_orth.items():
|
||||
if addr == 0:
|
||||
continue
|
||||
lexeme = <LexemeC*>addr
|
||||
|
@ -504,17 +500,12 @@ cdef class Vocab:
|
|||
raise ValueError(Errors.E086.format(string=py_str,
|
||||
orth_id=lexeme.orth,
|
||||
hash_id=self.strings[py_str]))
|
||||
key = hash_string(py_str)
|
||||
self._by_hash.set(key, lexeme)
|
||||
self._by_orth.set(lexeme.orth, lexeme)
|
||||
self.length += 1
|
||||
|
||||
def _reset_cache(self, keys, strings):
|
||||
for k in keys:
|
||||
del self._by_hash[k]
|
||||
|
||||
if len(strings) != 0:
|
||||
self._by_orth = PreshMap()
|
||||
# I'm not sure this made sense. Disable it for now.
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def pickle_vocab(vocab):
|
||||
|
|
Loading…
Reference in New Issue