💫 Small efficiency fixes to tokenizer (#2587)

This patch improves tokenizer speed by about 10% and reduces memory usage in the `Vocab` by removing a redundant index. In v1, `vocab._by_orth` and `vocab._by_hash` indexed different data, but in v2 the orth ID and the hash are identical, so one of the two maps is redundant.
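
As a side note on why the second index became redundant: in v2 a lexeme's orth ID is the 64-bit hash of its text, so a hash-keyed table and an orth-keyed table hold exactly the same keys. A minimal check, assuming a spaCy v2.x install:

```python
# Minimal check, assuming spaCy v2.x: the orth ID of a lexeme is the same
# 64-bit value as the hash of its text, so a hash-keyed index and an
# orth-keyed index store identical keys.
import spacy
from spacy.strings import hash_string

nlp = spacy.blank("en")
lex = nlp.vocab["coffee"]

assert lex.orth == hash_string("coffee")        # orth ID == hash of the text
assert nlp.vocab.strings["coffee"] == lex.orth  # the StringStore agrees
```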

The patch also fixes an uninitialized variable in the tokenizer, the `has_special` flag. This flag records whether the chunk we're tokenizing triggers a special-case rule; if it does, we avoid caching within the chunk. Because the flag was never initialized, the check could incorrectly reject chunks from the cache.
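
For context, here's a hedged pure-Python sketch of the caching decision (illustrative names only, not the actual Cython in the diff below):

```python
# Illustrative sketch, not the real tokenizer code: a chunk is only written
# to the cache when no special-case rule fired while splitting it. If
# has_special starts from an arbitrary (uninitialised) value instead of 0,
# perfectly cacheable chunks get rejected.
def maybe_cache(cache, chunk_key, tokens, has_special):
    if has_special:            # a special-case rule fired inside this chunk
        return False           # don't cache: the split may not generalise
    cache[chunk_key] = tokens
    return True
```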

With the `en_core_web_md` model, we now tokenize the IMDB train data at 503,104 words per second. Prior to this patch, we had 465,764 words per second.
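
For reference, a minimal sketch of this kind of words-per-second measurement (not the exact benchmark script; assumes `en_core_web_md` is installed, and a dummy list of texts stands in for the IMDB train data):

```python
# Rough words-per-second measurement sketch; the real benchmark ran over the
# IMDB train data, which is stood in for here by a repeated dummy text.
import time
import spacy

nlp = spacy.load("en_core_web_md")
texts = ["This movie was surprisingly good."] * 10000  # stand-in corpus

start = time.time()
n_words = sum(len(doc) for doc in nlp.tokenizer.pipe(texts))
elapsed = time.time() - start
print(f"{n_words / elapsed:,.0f} words per second")
```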

Before switching to the `regex` library and supporting more languages, we had 1.3m words per second for the tokenizer. In order to recover the missing speed, we need to:

* Fix the variable-length lookarounds in the suffix, infix and `token_match` rules (see the sketch after this list)
* Improve the performance of the `token_match` regex
* Switch back from the `regex` library to the `re` library
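
On the first point: the stdlib `re` engine rejects variable-width lookbehind, which the `regex` package accepts, so those rules have to be rewritten before switching back. A small illustration with a made-up pattern, not one of spaCy's actual rules:

```python
# Variable-width lookbehind compiles with the third-party `regex` package but
# is rejected by the stdlib `re` module. The pattern is a toy example.
import re
import regex

pattern = r"(?<=\d+)\."   # lookbehind of variable width: one or more digits

regex.compile(pattern)    # fine with `regex`

try:
    re.compile(pattern)
except re.error as err:
    print("re rejects it:", err)  # "look-behind requires fixed-width pattern"
```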

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
Matthew Honnibal 2018-07-24 23:35:54 +02:00 committed by Ines Montani
parent 3c30d1763c
commit 82277f63a3
3 changed files with 11 additions and 21 deletions

```diff
@@ -150,7 +150,7 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
-        cdef int has_special
+        cdef int has_special = 0
         orig_size = tokens.length
         span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
                                    &has_special)
@@ -272,7 +272,7 @@ cdef class Tokenizer:
                           int has_special, int n) except -1:
         cdef int i
         for i in range(n):
-            if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
+            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
         # See https://github.com/explosion/spaCy/issues/1250
         if has_special:
```

```diff
@@ -42,5 +42,4 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
-    cdef PreshMap _by_hash
     cdef PreshMap _by_orth
```

```diff
@@ -48,7 +48,6 @@ cdef class Vocab:
             lemmatizer = Lemmatizer({}, {}, {})
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
-        self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
         self.length = 0
@@ -118,13 +117,12 @@ cdef class Vocab:
             return &EMPTY_LEXEME
         cdef LexemeC* lex
         cdef hash_t key = hash_string(string)
-        lex = <LexemeC*>self._by_hash.get(key)
+        lex = <LexemeC*>self._by_orth.get(key)
         cdef size_t addr
         if lex != NULL:
-            if lex.orth != self.strings[string]:
-                raise KeyError(Errors.E064.format(string=lex.orth,
-                                                  orth=self.strings[string],
-                                                  orth_id=string))
+            if lex.orth != key:
+                raise KeyError(Errors.E064.format(string=lex.orth,
+                                                  orth=key, orth_id=string))
             return lex
         else:
             return self._new_lexeme(mem, string)
```
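
For readers skimming the diff: a hedged pure-Python sketch of what the lookup above now amounts to (illustrative names, not the real Cython):

```python
# Illustrative sketch of the single-index lookup: hash the text, probe the one
# orth-keyed table, and check the stored lexeme's orth matches the key
# (the Errors.E064 condition in the hunk above).
def get_by_string(by_orth, string, hash_string):
    key = hash_string(string)
    lex = by_orth.get(key)
    if lex is not None:
        if lex["orth"] != key:
            raise KeyError(f"inconsistent orth for {string!r}")
        return lex
    return None  # the real code creates a new lexeme here
```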

```diff
@@ -165,14 +163,12 @@ cdef class Vocab:
         elif value is not None:
             Lexeme.set_struct_attr(lex, attr, value)
         if not is_oov:
-            key = hash_string(string)
-            self._add_lex_to_vocab(key, lex)
+            self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
-        self._by_hash.set(key, <void*>lex)
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
@@ -189,7 +185,7 @@ cdef class Vocab:
             int_key = hash_string(key)
         else:
             int_key = key
-        lex = self._by_hash.get(int_key)
+        lex = self._by_orth.get(int_key)
         return lex is not NULL
 
     def __iter__(self):
@@ -461,7 +457,7 @@ cdef class Vocab:
         cdef LexemeC* lexeme = NULL
         cdef SerializedLexemeC lex_data
         cdef int size = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             size += sizeof(lex_data.data)
@@ -469,7 +465,7 @@ cdef class Vocab:
         byte_ptr = <unsigned char*>byte_string
         cdef int j
         cdef int i = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             lexeme = <LexemeC*>addr
@@ -504,17 +500,12 @@ cdef class Vocab:
                 raise ValueError(Errors.E086.format(string=py_str,
                                                     orth_id=lexeme.orth,
                                                     hash_id=self.strings[py_str]))
-            key = hash_string(py_str)
-            self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)
             self.length += 1
 
     def _reset_cache(self, keys, strings):
-        for k in keys:
-            del self._by_hash[k]
-        if len(strings) != 0:
-            self._by_orth = PreshMap()
+        # I'm not sure this made sense. Disable it for now.
+        raise NotImplementedError
 
 def pickle_vocab(vocab):
```