From 82277f63a349c0c44853e386e1df74298b89d282 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 24 Jul 2018 23:35:54 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Small=20efficiency=20fixes=20to?=
 =?UTF-8?q?=20tokenizer=20(#2587)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch improves tokenizer speed by about 10% and reduces memory usage
in the `Vocab` by removing a redundant index. The `vocab._by_orth` and
`vocab._by_hash` tables indexed different data in v1, but in v2 the orth
and the hash are identical, so a single table suffices (a quick check of
this is sketched in the first snippet after the diff).

The patch also fixes an uninitialized variable in the tokenizer, the
`has_special` flag. This flag records whether the chunk we're tokenizing
triggers a special-case rule (illustrated in the second snippet after the
diff); if it does, we avoid caching within the chunk. Because the flag was
uninitialized, the check incorrectly rejected some chunks from the cache.

With the `en_core_web_md` model, we now tokenize the IMDB train data at
503,104 words per second. Prior to this patch, we were at 465,764 words
per second. Before switching to the `regex` library and supporting more
languages, the tokenizer ran at 1.3m words per second. To recover the
missing speed, we need to:

* Fix the variable-length lookarounds in the suffix, infix and `token_match` rules
* Improve the performance of the `token_match` regex
* Switch back from the `regex` library to the `re` library.

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 spacy/tokenizer.pyx |  4 ++--
 spacy/vocab.pxd     |  1 -
 spacy/vocab.pyx     | 27 +++++++++------------------
 3 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 9f89636dd..6b247d7e5 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -150,7 +150,7 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
-        cdef int has_special
+        cdef int has_special = 0
         orig_size = tokens.length
         span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
                                    &has_special)
@@ -272,7 +272,7 @@ cdef class Tokenizer:
                           int has_special, int n) except -1:
         cdef int i
         for i in range(n):
-            if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
+            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
         # See https://github.com/explosion/spaCy/issues/1250
         if has_special:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index b12bccf38..2e4f3b105 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -42,5 +42,4 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
 
-    cdef PreshMap _by_hash
     cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a3eb08b32..7a4549b4e 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -48,7 +48,6 @@ cdef class Vocab:
             lemmatizer = Lemmatizer({}, {}, {})
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
-        self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
         self.length = 0
@@ -118,13 +117,12 @@ cdef class Vocab:
             return &EMPTY_LEXEME
         cdef LexemeC* lex
         cdef hash_t key = hash_string(string)
-        lex = <LexemeC*>self._by_hash.get(key)
+        lex = <LexemeC*>self._by_orth.get(key)
         cdef size_t addr
         if lex != NULL:
-            if lex.orth != self.strings[string]:
+            if lex.orth != key:
                 raise KeyError(Errors.E064.format(string=lex.orth,
-                                                  orth=self.strings[string],
-                                                  orth_id=string))
+                                                  orth=key, orth_id=string))
             return lex
         else:
             return self._new_lexeme(mem, string)
@@ -165,14 +163,12 @@ cdef class Vocab:
             elif value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
         if not is_oov:
-            key = hash_string(string)
-            self._add_lex_to_vocab(key, lex)
+            self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
-        self._by_hash.set(key, lex)
         self._by_orth.set(lex.orth, lex)
         self.length += 1
 
@@ -189,7 +185,7 @@ cdef class Vocab:
             int_key = hash_string(key)
         else:
             int_key = key
-        lex = self._by_hash.get(int_key)
+        lex = self._by_orth.get(int_key)
         return lex is not NULL
 
     def __iter__(self):
@@ -461,7 +457,7 @@ cdef class Vocab:
         cdef LexemeC* lexeme = NULL
         cdef SerializedLexemeC lex_data
         cdef int size = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             size += sizeof(lex_data.data)
@@ -469,7 +465,7 @@ cdef class Vocab:
         byte_ptr = <unsigned char*>byte_string
         cdef int j
         cdef int i = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             lexeme = <LexemeC*>addr
@@ -504,17 +500,12 @@ cdef class Vocab:
                 raise ValueError(Errors.E086.format(string=py_str,
                                                     orth_id=lexeme.orth,
                                                     hash_id=self.strings[py_str]))
-            key = hash_string(py_str)
-            self._by_hash.set(key, lexeme)
            self._by_orth.set(lexeme.orth, lexeme)
            self.length += 1
 
     def _reset_cache(self, keys, strings):
-        for k in keys:
-            del self._by_hash[k]
-
-        if len(strings) != 0:
-            self._by_orth = PreshMap()
+        # I'm not sure this made sense. Disable it for now.
+        raise NotImplementedError
 
 
 def pickle_vocab(vocab):
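
For reference, the redundancy removed here can be seen from the Python API: in v2 a lexeme's `orth` ID is the hash of its text, so `_by_hash` and `_by_orth` always held the same keys. A minimal sketch (using a blank English pipeline rather than `en_core_web_md`, purely to keep it self-contained; the word chosen is arbitrary):

```python
import spacy

# Any v2 pipeline works; a blank one avoids downloading a model.
nlp = spacy.blank("en")

word = "tokenizer"
lexeme = nlp.vocab[word]

# In v2 the orth ID *is* the string's hash: the StringStore key and the
# lexeme's orth are the same value, so one PreshMap index suffices.
assert lexeme.orth == nlp.vocab.strings[word]
print(lexeme.orth, nlp.vocab.strings[word])
```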
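
And a small illustration of the special-case rules that the `has_special` flag keeps out of the cache; the `gimme` rule follows the usage-docs example and is only for demonstration:

```python
from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()

# Built-in special case: the chunk "don't" matches a tokenizer exception
# and is split into two tokens instead of going through the affix rules.
print([t.text for t in nlp("don't")])        # ['do', "n't"]

# Custom special case: chunks that trigger it must not be cached as if
# they were ordinary chunks, which is what has_special tracks.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])   # ['gim', 'me', 'that']
```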