From 985bc68327c8e079700e301e6e2364f5a9e7a03d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 18:00:42 +0200
Subject: [PATCH] * Fix bug with trailing punct on contractions. Reduced
 efficiency, and slightly hacky implementation.

---
 spacy/en.pxd            |  2 ++
 spacy/en.pyx            |  2 +-
 spacy/lang.pxd          | 14 +++++-----
 spacy/lang.pyx          | 61 ++++++++++++++++-------------------------
 tests/test_tokenizer.py | 18 ++++++++++--
 5 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/spacy/en.pxd b/spacy/en.pxd
index 91d4db3af..f6dc782f0 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -40,3 +40,5 @@ cdef class EnglishTokens(Tokens):
 
 cdef class English(Language):
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+
+
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 5d4e6ef51..6300fda25 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -236,7 +236,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index c32cb0c41..a498e6e0f 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -9,6 +9,11 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
 
 
+cdef extern from "Python.h":
+    cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+
+
 cdef extern from "sparsehash/dense_hash_map" namespace "google":
     cdef cppclass dense_hash_map[K, D]:
         K& key_type
@@ -52,10 +57,6 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
-cdef struct LexList:
-    LexemeC* lex
-    LexList* tail
-
 cdef class Lexicon:
     cpdef readonly size_t size
 
@@ -70,13 +71,12 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
-    cdef size_t cache_size
+    cdef dict cache
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 8766eb86a..b11e78921 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -40,8 +40,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
-        self.cache_size = 0
+        self.cache = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -80,7 +79,6 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        print repr(string)
         cdef size_t length = len(string)
         cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
@@ -92,7 +90,7 @@ cdef class Language:
         cdef Py_UNICODE c
         for i in range(length):
            c = characters[i]
-            if c == ' ' or c == '\n' or c == '\t':
+            if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     self._tokenize(tokens, &characters[start], i - start)
                 start = i + 1
@@ -101,38 +99,30 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+        cdef list lexemes
+        cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexList* node = <LexList*>self.cache[hashed]
-        if node is not NULL:
-            while node != NULL:
-                tokens.push_back(node.lex)
-                node = node.tail
+        if hashed in self.cache:
+            for lex_addr in self.cache[hashed]:
+                tokens.push_back(lex_addr)
             return 0
-
-        node = <LexList*>calloc(1, sizeof(LexList))
-        self.cache[hashed] = <size_t>node
+
+        lexemes = []
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            node.lex = self.lexicon.get(&characters[start], split)
-            tokens.push_back(node.lex)
-            start += split
-            if start >= length:
-                break
-            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
-            node.tail = <LexList*>self.cache[hashed]
-            if node.tail == NULL:
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                self.cache[hashed] = <size_t>node.tail
-                node = node.tail
+            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if hashed in self.cache:
+                lexemes.extend(self.cache[hashed])
             else:
-                node = node.tail
-                while node != NULL:
-                    tokens.push_back(node.lex)
-                    node = node.tail
-                break
+                lexeme = self.lexicon.get(&characters[start], split)
+                lexemes.append(lexeme)
+            start += split
+        for lex_addr in lexemes:
+            tokens.push_back(lex_addr)
+        #self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -149,17 +139,14 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        cdef LexList* node
+        cdef list lexemes
         cdef uint64_t hashed
         for string, substrings in token_rules:
             hashed = hash64(string, len(string) * sizeof(Py_UNICODE), 0)
-            node = <LexList*>calloc(1, sizeof(LexList))
-            self.cache[hashed] = <size_t>node
-            for substring in substrings[:-1]:
-                node.lex = self.lexicon.get(substring, len(substring))
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                node = node.tail
-            node.lex = self.lexicon.get(substrings[-1], len(substrings[-1]))
+            lexemes = []
+            for substring in substrings:
+                lexemes.append(self.lexicon.get(substring, len(substring)))
+            self.cache[hashed] = lexemes
 
 
 cdef class Lexicon:
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 50b0dae71..12ae2595f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -26,6 +26,8 @@ def test_punct():
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
+    assert lex_ids.string(4) == "."
+    assert lex_ids.string(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -37,5 +39,17 @@ def test_contraction():
     assert len(lex_ids) == 3
     assert lex_ids[1].string == EN.lexicon.lookup("not").string
     lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 4
-    assert lex_ids[3].string == EN.lexicon.lookup('!').string
+    assert len(lex_ids) == 5
+    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+
+
+def test_contraction_punct():
+    tokens = EN.tokenize("(can't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize("`ain't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize('''"isn't''')
+    assert len(tokens) == 3
+    tokens = EN.tokenize("can't!")
+    assert len(tokens) == 3
+
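
Note: for reference, below is a small pure-Python sketch of the caching scheme the new _tokenize uses. It is illustrative only: special_rules, split_one, tokenize_chunk, and tokenize are toy stand-ins (plain strings instead of hashed Py_UNICODE spans, and a much-simplified _split_one), not spaCy's actual API. The point of the fix is visible in the loop: the cache is now consulted for each split-off piece (the hash is taken over `split` characters) instead of the whole remaining string, so a special-cased contraction such as "can't" still matches when trailing punctuation follows it. The trade-off, per the subject line, is that newly computed splits are no longer written back to the cache (the self.cache[hashed] = lexemes assignment is commented out in _tokenize).

# Toy model of the dict-based tokenizer cache in this patch (illustrative, not spaCy's API).
special_rules = {
    "don't": ["do", "n't"],
    "can't": ["ca", "n't"],
}

# The cache maps a chunk to its token strings; the special rules pre-populate
# it, mirroring _load_special_tokenization.
cache = dict(special_rules)

PUNCT = set("(`\"'!?.,;:)")


def split_one(chunk):
    # Stand-in for Language._split_one: peel off one leading punctuation
    # character, otherwise take the chunk minus any trailing punctuation.
    if len(chunk) > 1 and chunk[0] in PUNCT:
        return 1
    end = len(chunk)
    while end > 1 and chunk[end - 1] in PUNCT:
        end -= 1
    return end


def tokenize_chunk(chunk):
    # Mirrors the new _tokenize: whole-chunk cache hit first, otherwise split
    # greedily and check the cache for every piece that gets split off.
    if chunk in cache:
        return list(cache[chunk])
    tokens = []
    start = 0
    while start < len(chunk):
        split = split_one(chunk[start:])
        piece = chunk[start:start + split]
        if piece in cache:
            tokens.extend(cache[piece])   # e.g. "can't" -> ["ca", "n't"]
        else:
            tokens.append(piece)
        start += split
    return tokens


def tokenize(text):
    # Mirrors Language.tokenize: split on whitespace, then sub-tokenize each chunk.
    tokens = []
    for chunk in text.split():
        tokens.extend(tokenize_chunk(chunk))
    return tokens


assert tokenize("i said don't!") == ["i", "said", "do", "n't", "!"]
assert tokenize("(can't") == ["(", "ca", "n't"]
assert tokenize("can't!") == ["ca", "n't", "!"]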