From 0c507bd80a2391cce44b900236130a40dc327d97 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 22 Jul 2015 14:10:30 +0200
Subject: [PATCH] * Fix tokenizer

---
 spacy/tokenizer.pyx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 0bf1aba6e..220a8a1de 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -10,6 +10,7 @@ from cpython cimport Py_UNICODE_ISSPACE
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
+from murmurhash.mrmr cimport hash64
 
 from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
@@ -91,11 +92,11 @@ cdef class Tokenizer:
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
-        for i range(1, length):
+        for i in range(1, length):
             uc = chars_ptr[i]
             if Py_UNICODE_ISSPACE(uc) != in_ws:
                 if start < i:
-                    key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
+                    key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
                         self._tokenize(tokens, string[start:i], key)
@@ -107,7 +108,7 @@ cdef class Tokenizer:
                 start = i
         i += 1
         if start < i:
-            key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
+            key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
                 self._tokenize(tokens, string[start:], key)
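
Review note: the patch fixes two things in Tokenizer.__call__. First, a syntax
error: "for i range(1, length)" was missing the "in". Second, a wrong cache
key: hash64() was called on chars_ptr, the start of the whole buffer, rather
than &chars_ptr[start], the start of the current span, so every span after
the first was keyed by the wrong characters and could hit or pollute the
wrong cache entry. Below is a minimal Python sketch of the span-splitting
loop with those fixes applied; hash() and the plain dict are stand-ins for
murmurhash's hash64 and the PreshMap cache (not spaCy's actual API), and the
single-space special-casing mentioned in the code comment is omitted.

    def split_spans(string, cache):
        # Split `string` into alternating runs of whitespace and
        # non-whitespace, keying each run into `cache`. Mirrors the loop
        # in Tokenizer.__call__ as patched above.
        if not string:
            return []
        spans = []
        start = 0
        in_ws = string[0].isspace()
        i = 1
        for i in range(1, len(string)):   # the patch adds the missing 'in'
            if string[i].isspace() != in_ws:
                if start < i:
                    # Hash the current span -- the Python analogue of
                    # hash64(&chars_ptr[start], ...). The buggy code hashed
                    # from chars_ptr (index 0), so every key was computed
                    # over the same leading characters of the buffer.
                    key = hash(string[start:i])
                    spans.append(cache.setdefault(key, string[start:i]))
                in_ws = not in_ws
                start = i
        i += 1                            # cover the final span
        if start < i:
            key = hash(string[start:])
            spans.append(cache.setdefault(key, string[start:]))
        return spans

For example, split_spans("hello world", {}) returns ["hello", " ", "world"];
cache.setdefault plays the role of the _try_cache / _tokenize pair, reusing
the stored value on a hit and computing and storing it on a miss.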