From ff1869ff07f78606e1bbfea7e08a845c4585f7e9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 07:36:43 +0200
Subject: [PATCH 1/4] * Fixed major efficiency problem, from not quite grokking pass by reference in cython c++

---
 ext/murmurhash.pxd     | 10 +++--
 ext/murmurhash.pyx     |  1 +
 ext/sparsehash.pyx     |  1 +
 spacy/en.pxd           |  2 +-
 spacy/en.pyx           |  9 +++--
 spacy/en_ptb.pxd       |  2 +-
 spacy/en_ptb.pyx       |  4 +-
 spacy/lexeme.pxd       |  4 +-
 spacy/lexeme.pyx       |  8 ++--
 spacy/spacy.pxd        | 11 +++---
 spacy/spacy.pyx        | 85 ++++++++++++++++++++++++++++++------------
 spacy/string_tools.pyx |  2 +
 spacy/util.py          | 45 +---------------------
 13 files changed, 96 insertions(+), 88 deletions(-)

diff --git a/ext/murmurhash.pxd b/ext/murmurhash.pxd
index 67b0cd06c..9ded57240 100644
--- a/ext/murmurhash.pxd
+++ b/ext/murmurhash.pxd
@@ -1,11 +1,13 @@
+# cython profile=True
+
 from libc.stdint cimport uint64_t, int64_t
 
 
 cdef extern from "../include/MurmurHash3.h":
-    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out)
-    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out)
+    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out) nogil
+    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out) nogil
 
 
 cdef extern from "../include/MurmurHash2.h":
-    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed)
-    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed)
+    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed) nogil
+    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed) nogil
diff --git a/ext/murmurhash.pyx b/ext/murmurhash.pyx
index e69de29bb..54652d22a 100644
--- a/ext/murmurhash.pyx
+++ b/ext/murmurhash.pyx
@@ -0,0 +1 @@
+# cython: profile=True
diff --git a/ext/sparsehash.pyx b/ext/sparsehash.pyx
index e69de29bb..54f2811e1 100644
--- a/ext/sparsehash.pyx
+++ b/ext/sparsehash.pyx
@@ -0,0 +1 @@
+# cython profile=True
diff --git a/spacy/en.pxd b/spacy/en.pxd
index 183490102..efced3606 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
 
 
-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV
 
 
diff --git a/spacy/en.pyx b/spacy/en.pyx
index df8d30ff9..986468988 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
 so that strings can be retrieved from hashes. Use 64-bit hash values and
 boldly assume no collisions.
@@ -15,19 +16,18 @@ from . import util
 cimport spacy
 
 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)
 
 spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
 
-
 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
     return spacy.tokenize(VOCAB, BACOV, find_split, string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
 
 
 cpdef unicode unhash(StringHash hash_value):
@@ -72,3 +72,6 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
         return False
     return not word[i].isalnum()
+
+
+#spacy.load_browns(VOCAB, BACOV, find_split)
diff --git a/spacy/en_ptb.pxd b/spacy/en_ptb.pxd
index 183490102..efced3606 100644
--- a/spacy/en_ptb.pxd
+++ b/spacy/en_ptb.pxd
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
 
 
-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV
 
 
diff --git a/spacy/en_ptb.pyx b/spacy/en_ptb.pyx
index 2ad8f96b2..d950c2133 100644
--- a/spacy/en_ptb.pyx
+++ b/spacy/en_ptb.pyx
@@ -15,7 +15,7 @@ from . import util
 cimport spacy
 
 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)
 
 
@@ -27,7 +27,7 @@ cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
 
 
 cpdef unicode unhash(StringHash hash_value):
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 9d6be64b7..2cd38e709 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -25,9 +25,9 @@ cdef struct Lexeme:
 
 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
 
-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL
+                         int split, size_t length)
 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item.  This allows safe iteration
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 4d760f0a0..2bc56969b 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
@@ -13,9 +14,9 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 
-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL:
+                         int split, size_t length):
     assert split <= length
 
     cdef Lexeme* word = calloc(1, sizeof(Lexeme))
@@ -54,7 +55,8 @@ cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
 
 
     # Now recurse, and deal with the tail
     if tail_string:
-        word.tail = lookup(vocab, bacov, find_split, -1, tail_string)
+        word.tail = lookup(vocab, bacov, find_split, -1, tail_string,
+                           len(tail_string))
     return word
 
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index db3226d23..ac1132ca2 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -12,12 +12,13 @@ ctypedef int (*Splitter)(unicode word, size_t length)
 
 
 from spacy.lexeme cimport Lexeme
 
 
-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules)
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split)
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
-                        unicode string) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter splitter, int start,
+                        Py_UNICODE* string, size_t length) except 0
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil
 cdef unicode unhash(dict bacov, StringHash hash_value)
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index ca04ad82c..1cc73ac3c 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 from __future__ import unicode_literals
 
 from ext.murmurhash cimport MurmurHash64A
@@ -9,14 +10,16 @@ from spacy.lexeme cimport BLANK_WORD
 from spacy.string_tools cimport is_whitespace
 
 from . import util
+from os import path
+cimport cython
 
 
-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules):
     cdef Lexeme* word
     cdef StringHash hashed
     for chunk, lex, tokens in token_rules:
         hashed = hash_string(chunk, len(chunk))
-        assert vocab[hashed] == 0, chunk
+        assert vocab[0][hashed] == 0, chunk
         word = _add(vocab, bacov, NULL, hashed, lex, len(lex), len(lex))
         for i, lex in enumerate(tokens):
             token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
@@ -26,7 +29,29 @@ cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
             word = word.tail
 
 
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split):
+    cdef Lexeme* w
+    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
+    case_stats = util.load_case_stats(data_dir)
+    brown_loc = path.join(data_dir, 'clusters')
+    cdef size_t start
+    cdef int end
+    with util.utf8open(brown_loc) as browns_file:
+        for i, line in enumerate(browns_file):
+            cluster_str, token_string, freq_str = line.split()
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See redshift._parse_features.pyx
+            cluster = int(cluster_str[::-1], 2)
+            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
+            start = 0
+            end = -1
+            hashed = hash_string(token_string, len(token_string))
+
+            word = _add(vocab, bacov, find_split, hashed, token_string,
+                        len(token_string), len(token_string))
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *:
     cdef size_t length = len(string)
     cdef Py_UNICODE* characters = string
@@ -35,40 +60,54 @@ cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
     cdef Py_UNICODE c
     cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
 
-    cdef unicode current = u''
+    cdef Py_UNICODE[1000] current
+    for i in range(1000):
+        current[i] = 0
+    cdef size_t word_len = 0
     cdef Lexeme* token
     for i in range(length):
         c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = lookup(vocab, bacov, splitter, -1, current)
+        if _is_whitespace(c):
+            if word_len != 0:
+                token = lookup(vocab, bacov, splitter, -1, current, word_len)
                 while token != NULL:
                     tokens.push_back(token)
                     token = token.tail
-                current = u''
+                for j in range(word_len+1):
+                    current[j] = 0
+                word_len = 0
         else:
-            current += c
-    if current:
-        token = lookup(vocab, bacov, splitter, -1, current)
+            current[word_len] = c
+            word_len += 1
+    if word_len != 0:
+        token = lookup(vocab, bacov, splitter, -1, current, word_len)
         while token != NULL:
             tokens.push_back(token)
             token = token.tail
     return tokens
 
+cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
+    if c == ' ':
+        return True
+    elif c == '\n':
+        return True
+    elif c == '\t':
+        return True
+    else:
+        return False
 
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
-                        unicode string) except 0:
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter find_split, int start,
+                        Py_UNICODE* string, size_t length) except 0:
     '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, splitting off any attached punctuation or clitics.  A
    reference to BLANK_WORD is returned for the empty string.
 
    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
-    if string == '':
+    if length == 0:
         return &BLANK_WORD
-    cdef size_t length = len(string)
     cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = vocab[hashed]
+    cdef Lexeme* word_ptr = vocab[0][hashed]
     if word_ptr == NULL:
         start = find_split(string, length) if start == -1 else start
         word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
@@ -84,9 +123,8 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens
 
 
-cdef StringHash hash_string(unicode s, size_t length) except 0:
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil:
     '''Hash unicode with MurmurHash64A'''
-    assert length
     return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0)
 
 
@@ -95,11 +133,12 @@ cdef unicode unhash(dict bacov, StringHash hash_value):
     return bacov[hash_value]
 
 
-cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
-                  unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
+@cython.nonecheck(False)
+cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length):
     word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
-    vocab[hashed] = word
+    vocab[0][hashed] = word
     bacov[hashed] = string
     return word
+
+
diff --git a/spacy/string_tools.pyx b/spacy/string_tools.pyx
index 437fc152a..5397fd647 100644
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@@ -1,3 +1,5 @@
+# cython: profile=True
+
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
diff --git a/spacy/util.py b/spacy/util.py
index 64dee8877..4e080d0b3 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -10,7 +10,7 @@ def utf8open(loc, mode='r'):
 
 
 def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'english.case')
+    case_loc = path.join(data_dir, 'case')
     case_stats = {}
     with utf8open(case_loc) as cases_file:
         for line in cases_file:
@@ -42,46 +42,3 @@ def read_tokenization(lang):
             seen.add(chunk)
             entries.append((chunk, lex, pieces))
     return entries
-
-
-"""
-    def load_browns(self, data_dir):
-        cdef Lexeme* w
-        case_stats = load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'bllip-clusters')
-        assert path.exists(brown_loc)
-        cdef size_t start
-        cdef int end
-        with utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, word, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
-                start = 0
-                end = -1
-                find_slice(&start, &end, word)
-                print "Load", repr(word), start, end
-                w = init_word(word, start, end, cluster,
-                              upper_pc, title_pc, int(freq_str))
-                self.words[_hash_str(word)] = w
-                self.strings[w] = word
-
-    def load_clitics(self, data_dir):
-        cdef unicode orig_str
-        cdef unicode clitic
-        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
-            w = init_clitic(orig_str, self.lookup_slice(norm_form, 0, -1))
-            self.words[w.orig] = w
-            self.strings[w] = orig_str
-            assert len(clitic_strs) < MAX_CLITICS
-            assert clitic_strs
-            for i, clitic in enumerate(clitic_strs):
-                # If we write punctuation here, assume we want to keep it,
-                # so tell it the slice boundaries (the full string)
-                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
-                # Ensure we null terminate
-                w.clitics[i+1] = 0
-
-"""

From 0575f16ade8e067543ef4cb261c93c1a55ecf8ae Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 07:37:29 +0200
Subject: [PATCH 2/4] * Upd requirements

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 029a6618e..f6629e024 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1 @@
 cython
-sparsehash

From 0074ae2fc0b82153139fed16bf77ab215b38faab Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 08:05:29 +0200
Subject: [PATCH 3/4] * Switch to dynamically allocating array, based on the document length

---
 spacy/spacy.pyx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 1cc73ac3c..51ab59da2 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -1,6 +1,8 @@
 # cython: profile=True
 from __future__ import unicode_literals
 
+from libc.stdlib cimport calloc, free
+
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
 
@@ -60,9 +62,7 @@ cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
     cdef Py_UNICODE c
     cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
 
-    cdef Py_UNICODE[1000] current
-    for i in range(1000):
-        current[i] = 0
+    cdef Py_UNICODE* current = calloc(len(string), sizeof(Py_UNICODE))
     cdef size_t word_len = 0
     cdef Lexeme* token
     for i in range(length):
@@ -84,6 +84,7 @@ cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
         while token != NULL:
             tokens.push_back(token)
             token = token.tail
+    free(current)
     return tokens
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

From 6668e449614a2c29109e24bfa846a7d2402fb186 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 08:15:44 +0200
Subject: [PATCH 4/4] * Whitespace

---
 spacy/spacy.pyx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 51ab59da2..59cc2fd51 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -141,5 +141,3 @@ cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hash
     vocab[0][hashed] = word
     bacov[hashed] = string
     return word
-
-
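
A note for readers of PATCH 1/4: the commit attributes the slowdown to "not quite grokking pass by reference in cython c++". The old code declared the sparsehash-backed Vocab by value (cdef Vocab VOCAB) and took Vocab& parameters; the fix heap-allocates it (new Vocab(100000)) and passes Vocab* everywhere, indexing through vocab[0][hashed]. As a rough illustration of why by-value C++ arguments are costly in Cython, here is a minimal sketch. It is not part of the patches: it uses libcpp.map in place of sparsehash's dense_hash_map, and the names Counts, total_by_value, and total_by_pointer are hypothetical.

    # distutils: language = c++
    # Illustrative sketch only -- not part of the patches above. libcpp.map
    # stands in for the dense_hash_map behind Vocab; all names are hypothetical.
    from libcpp.map cimport map as cpp_map
    from libcpp.pair cimport pair

    ctypedef cpp_map[size_t, size_t] Counts


    cdef size_t total_by_value(Counts counts):
        # The parameter is a C++ object taken by value, so every call runs the
        # map's copy constructor and duplicates the whole table -- the kind of
        # hidden cost the PATCH 1/4 commit message describes.
        cdef size_t total = 0
        cdef pair[size_t, size_t] item
        for item in counts:
            total += item.second
        return total


    cdef size_t total_by_pointer(Counts* counts):
        # Taking a pointer (as the patch does with Vocab*) copies nothing; the
        # callee works on the caller's map, and counts[0] dereferences it in
        # the same way as vocab[0][hashed] in the diff.
        cdef size_t total = 0
        cdef pair[size_t, size_t] item
        for item in counts[0]:
            total += item.second
        return total

The later change in PATCH 3/4 follows the same theme of making costs explicit: the fixed Py_UNICODE[1000] buffer becomes a calloc sized to the input string, freed before the function returns.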