From 82277f63a349c0c44853e386e1df74298b89d282 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 24 Jul 2018 23:35:54 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Small=20efficiency=20fixes=20to?=
 =?UTF-8?q?=20tokenizer=20(#2587)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch improves tokenizer speed by about 10% and reduces memory usage
in the `Vocab` by removing a redundant index. The `vocab._by_orth` and
`vocab._by_hash` tables indexed different data in v1, but in v2 the orth
and the hash are identical, so a single table suffices (a quick check of
this is sketched in the first snippet after the diff).

The patch also fixes an uninitialized variable in the tokenizer, the
`has_special` flag. This flag records whether the chunk we're tokenizing
triggers a special-case rule (illustrated in the second snippet after the
diff); if it does, we avoid caching within the chunk. Because the flag was
uninitialized, the check incorrectly rejected some chunks from the cache.

With the `en_core_web_md` model, we now tokenize the IMDB train data at
503,104 words per second. Prior to this patch, we were at 465,764 words
per second. Before switching to the `regex` library and supporting more
languages, the tokenizer ran at 1.3m words per second. To recover the
missing speed, we need to:

* Fix the variable-length lookarounds in the suffix, infix and `token_match` rules
* Improve the performance of the `token_match` regex
* Switch back from the `regex` library to the `re` library.

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 spacy/tokenizer.pyx |  4 ++--
 spacy/vocab.pxd     |  1 -
 spacy/vocab.pyx     | 27 +++++++++------------------
 3 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 9f89636dd..6b247d7e5 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -150,7 +150,7 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
-        cdef int has_special
+        cdef int has_special = 0
         orig_size = tokens.length
         span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
                                    &has_special)
@@ -272,7 +272,7 @@ cdef class Tokenizer:
                           int has_special, int n) except -1:
         cdef int i
         for i in range(n):
-            if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
+            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
         # See https://github.com/explosion/spaCy/issues/1250
         if has_special:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index b12bccf38..2e4f3b105 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -42,5 +42,4 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
 
-    cdef PreshMap _by_hash
     cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a3eb08b32..7a4549b4e 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -48,7 +48,6 @@ cdef class Vocab:
             lemmatizer = Lemmatizer({}, {}, {})
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
-        self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
         self.length = 0
@@ -118,13 +117,12 @@ cdef class Vocab:
             return &EMPTY_LEXEME
         cdef LexemeC* lex
         cdef hash_t key = hash_string(string)
-        lex = <LexemeC*>self._by_hash.get(key)
+        lex = <LexemeC*>self._by_orth.get(key)
         cdef size_t addr
         if lex != NULL:
-            if lex.orth != self.strings[string]:
+            if lex.orth != key:
                 raise KeyError(Errors.E064.format(string=lex.orth,
-                                                  orth=self.strings[string],
-                                                  orth_id=string))
+                                                  orth=key, orth_id=string))
             return lex
         else:
             return self._new_lexeme(mem, string)
@@ -165,14 +163,12 @@ cdef class Vocab:
             elif value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
         if not is_oov:
-            key = hash_string(string)
-            self._add_lex_to_vocab(key, lex)
+            self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
-        self._by_hash.set(key, lex)
         self._by_orth.set(lex.orth, lex)
         self.length += 1
 
@@ -189,7 +185,7 @@ cdef class Vocab:
             int_key = hash_string(key)
         else:
             int_key = key
-        lex = self._by_hash.get(int_key)
+        lex = self._by_orth.get(int_key)
         return lex is not NULL
 
     def __iter__(self):
@@ -461,7 +457,7 @@ cdef class Vocab:
         cdef LexemeC* lexeme = NULL
         cdef SerializedLexemeC lex_data
         cdef int size = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             size += sizeof(lex_data.data)
@@ -469,7 +465,7 @@ cdef class Vocab:
         byte_ptr = <unsigned char*>byte_string
         cdef int j
         cdef int i = 0
-        for key, addr in self._by_hash.items():
+        for key, addr in self._by_orth.items():
             if addr == 0:
                 continue
             lexeme = <LexemeC*>addr
@@ -504,17 +500,12 @@ cdef class Vocab:
                 raise ValueError(Errors.E086.format(string=py_str,
                                                     orth_id=lexeme.orth,
                                                     hash_id=self.strings[py_str]))
-            key = hash_string(py_str)
-            self._by_hash.set(key, lexeme)
            self._by_orth.set(lexeme.orth, lexeme)
            self.length += 1
 
     def _reset_cache(self, keys, strings):
-        for k in keys:
-            del self._by_hash[k]
-
-        if len(strings) != 0:
-            self._by_orth = PreshMap()
+        # I'm not sure this made sense. Disable it for now.
+        raise NotImplementedError
 
 
 def pickle_vocab(vocab):
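
For reference, the redundancy removed here can be seen from the Python API: in v2 a lexeme's `orth` ID is the hash of its text, so `_by_hash` and `_by_orth` always held the same keys. A minimal sketch (using a blank English pipeline rather than `en_core_web_md`, purely to keep it self-contained; the word chosen is arbitrary):

```python
import spacy

# Any v2 pipeline works; a blank one avoids downloading a model.
nlp = spacy.blank("en")

word = "tokenizer"
lexeme = nlp.vocab[word]

# In v2 the orth ID *is* the string's hash: the StringStore key and the
# lexeme's orth are the same value, so one PreshMap index suffices.
assert lexeme.orth == nlp.vocab.strings[word]
print(lexeme.orth, nlp.vocab.strings[word])
```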
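
And a small illustration of the special-case rules that the `has_special` flag keeps out of the cache; the `gimme` rule follows the usage-docs example and is only for demonstration:

```python
from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()

# Built-in special case: the chunk "don't" matches a tokenizer exception
# and is split into two tokens instead of going through the affix rules.
print([t.text for t in nlp("don't")])        # ['do', "n't"]

# Custom special case: chunks that trigger it must not be cached as if
# they were ordinary chunks, which is what has_special tracks.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])   # ['gim', 'me', 'that']
```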