From 9298e36b363e676f962661a002cd8104c0940f45 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 19:43:14 +0200
Subject: [PATCH] * Move special tokenization into its own lookup table, away
 from the cache.

---
 spacy/en.pyx   |  1 +
 spacy/lang.pxd |  1 +
 spacy/lang.pyx | 17 ++++++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 6300fda25..9d9f4f596 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -237,6 +237,7 @@ cdef class English(Language):
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
+        self.specials = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index a498e6e0f..094b3d580 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -72,6 +72,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef unicode name
     cdef dict cache
+    cdef dict specials
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index b11e78921..9e57c63af 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -41,6 +41,7 @@ cdef class Language:
         string_features = []
         self.name = name
         self.cache = {}
+        self.specials = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -88,6 +89,7 @@ cdef class Language:
         cdef size_t i = 0
         cdef Py_UNICODE* characters = string
         cdef Py_UNICODE c
+        assert Py_UNICODE_ISSPACE(' ') == 1
         for i in range(length):
             c = characters[i]
             if Py_UNICODE_ISSPACE(c) == 1:
@@ -103,6 +105,11 @@ cdef class Language:
         cdef list lexemes
         cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        if hashed in self.specials:
+            for lex_addr in self.specials[hashed]:
+                tokens.push_back(lex_addr)
+            return 0
+
         if hashed in self.cache:
             for lex_addr in self.cache[hashed]:
                 tokens.push_back(lex_addr)
@@ -113,16 +120,16 @@ cdef class Language:
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
-            if hashed in self.cache:
-                lexemes.extend(self.cache[hashed])
+            piece_hash = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if piece_hash in self.specials:
+                lexemes.extend(self.specials[piece_hash])
             else:
                 lexeme = self.lexicon.get(&characters[start], split)
                 lexemes.append(lexeme)
             start += split
         for lex_addr in lexemes:
             tokens.push_back(lex_addr)
-        #self.cache[hashed] = lexemes
+        self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -146,7 +153,7 @@ cdef class Language:
         lexemes = []
         for substring in substrings:
            lexemes.append(self.lexicon.get(substring, len(substring)))
-        self.cache[hashed] = lexemes
+        self.specials[hashed] = lexemes
 
 
 cdef class Lexicon:
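
Note on the control flow this patch introduces: Language._tokenize now consults
three places in order: the new specials table, then the cache, then the lexicon
piece by piece via _split_one. With special-case rules moved out of the cache,
the previously commented-out write-back (self.cache[hashed] = lexemes) can be
safely re-enabled. The sketch below is a minimal pure-Python rendering of that
flow, not the Cython implementation: it keys both tables by string rather than
by the hash64 of the character span, and tokenize_span, lexicon_get and
split_one are hypothetical stand-ins for _tokenize, Lexicon.get and _split_one.

    def tokenize_span(span, specials, cache, lexicon_get, split_one):
        """Tokenize one whitespace-delimited span of text."""
        # 1. Special-case tokenizations (e.g. "don't" -> ["do", "n't"]) win
        #    outright; they live in their own table and are never overwritten.
        if span in specials:
            return list(specials[span])
        # 2. Previously computed general results come from the cache.
        if span in cache:
            return list(cache[span])
        # 3. Otherwise split the span piece by piece, checking each piece
        #    against the specials table, then memoize the whole result.
        lexemes = []
        start = 0
        while start < len(span):
            n = split_one(span[start:])  # assumed to return >= 1, like _split_one
            piece = span[start:start + n]
            if piece in specials:
                lexemes.extend(specials[piece])
            else:
                lexemes.append(lexicon_get(piece))
            start += n
        cache[span] = lexemes  # safe now that specials live in their own table
        return list(lexemes)

For example, with specials = {"don't": ["do", "n't"]}, lexicon_get = str and
split_one = len (mirroring the patch's _split_one, which returns the full
length), tokenize_span("don't", ...) yields ["do", "n't"] without touching the
cache, while an ordinary span is computed once and then memoized. Note also the
rename from hashed to piece_hash inside the splitting loop: in the old code the
per-piece hash overwrote the whole-span hash, so re-enabling the commented-out
write-back would have cached the result under the last piece's key; after the
rename, hashed still holds the whole span's hash when self.cache[hashed] =
lexemes runs.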