mirror of https://github.com/explosion/spaCy.git
💫 Small efficiency fixes to tokenizer (#2587)
This patch improves tokenizer speed by about 10%, and reduces memory usage in the `Vocab` by removing a redundant index. The `vocab._by_orth` and `vocab._by_hash` indexed on different data in v1, but in v2 the orth and the hash are identical. The patch also fixes an uninitialized variable in the tokenizer, the `has_special` flag. This checks whether a chunk we're tokenizing triggers a special-case rule. If it does, then we avoid caching within the chunk. This check led to incorrectly rejecting some chunks from the cache. With the `en_core_web_md` model, we now tokenize the IMDB train data at 503,104k words per second. Prior to this patch, we had 465,764k words per second. Before switching to the regex library and supporting more languages, we had 1.3m words per second for the tokenizer. In order to recover the missing speed, we need to: * Fix the variable-length lookarounds in the suffix, infix and `token_match` rules * Improve the performance of the `token_match` regex * Switch back from the `regex` library to the `re` library. ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
parent 3c30d1763c
commit 82277f63a3
spacy/tokenizer.pyx

@@ -150,7 +150,7 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
-        cdef int has_special
+        cdef int has_special = 0
         orig_size = tokens.length
         span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
                                    &has_special)
@@ -272,7 +272,7 @@ cdef class Tokenizer:
                           int has_special, int n) except -1:
         cdef int i
         for i in range(n):
-            if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
+            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
         # See https://github.com/explosion/spaCy/issues/1250
         if has_special:
spacy/vocab.pxd

@@ -42,5 +42,4 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
 
-    cdef PreshMap _by_hash
     cdef PreshMap _by_orth
spacy/vocab.pyx

@@ -48,7 +48,6 @@ cdef class Vocab:
             lemmatizer = Lemmatizer({}, {}, {})
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
-        self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
         self.length = 0
@@ -118,13 +117,12 @@ cdef class Vocab:
             return &EMPTY_LEXEME
         cdef LexemeC* lex
         cdef hash_t key = hash_string(string)
-        lex = <LexemeC*>self._by_hash.get(key)
+        lex = <LexemeC*>self._by_orth.get(key)
         cdef size_t addr
         if lex != NULL:
-            if lex.orth != self.strings[string]:
+            if lex.orth != key:
                 raise KeyError(Errors.E064.format(string=lex.orth,
-                                                  orth=self.strings[string],
-                                                  orth_id=string))
+                                                  orth=key, orth_id=string))
             return lex
         else:
             return self._new_lexeme(mem, string)
@@ -165,14 +163,12 @@ cdef class Vocab:
             elif value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
         if not is_oov:
-            key = hash_string(string)
-            self._add_lex_to_vocab(key, lex)
+            self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
-        self._by_hash.set(key, <void*>lex)
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
 
@@ -189,7 +185,7 @@ cdef class Vocab:
             int_key = hash_string(key)
         else:
             int_key = key
-        lex = self._by_hash.get(int_key)
+        lex = self._by_orth.get(int_key)
         return lex is not NULL
 
     def __iter__(self):
@@ -461,7 +457,7 @@ cdef class Vocab:
         cdef LexemeC* lexeme = NULL
         cdef SerializedLexemeC lex_data
         cdef int size = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             size += sizeof(lex_data.data)
@@ -469,7 +465,7 @@ cdef class Vocab:
         byte_ptr = <unsigned char*>byte_string
         cdef int j
         cdef int i = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             lexeme = <LexemeC*>addr
@@ -504,17 +500,12 @@ cdef class Vocab:
                 raise ValueError(Errors.E086.format(string=py_str,
                                                     orth_id=lexeme.orth,
                                                     hash_id=self.strings[py_str]))
-            key = hash_string(py_str)
-            self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)
             self.length += 1
 
     def _reset_cache(self, keys, strings):
-        for k in keys:
-            del self._by_hash[k]
-
-        if len(strings) != 0:
-            self._by_orth = PreshMap()
+        # I'm not sure this made sense. Disable it for now.
+        raise NotImplementedError
 
 
 def pickle_vocab(vocab):