From 0c507bd80a2391cce44b900236130a40dc327d97 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 22 Jul 2015 14:10:30 +0200
Subject: [PATCH] * Fix tokenizer

---
 spacy/tokenizer.pyx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 0bf1aba6e..220a8a1de 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -10,6 +10,7 @@ from cpython cimport Py_UNICODE_ISSPACE
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
+from murmurhash.mrmr cimport hash64
 
 from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
@@ -91,11 +92,11 @@ cdef class Tokenizer:
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
-        for i range(1, length):
+        for i in range(1, length):
             uc = chars_ptr[i]
             if Py_UNICODE_ISSPACE(uc) != in_ws:
                 if start < i:
-                    key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
+                    key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
                         self._tokenize(tokens, string[start:i], key)
@@ -107,7 +108,7 @@ cdef class Tokenizer:
                 start = i
         i += 1
         if start < i:
-            key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
+            key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
                 self._tokenize(tokens, string[start:], key)
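
Review note: the patch fixes two things in Tokenizer.__call__. First, a syntax
error: "for i range(1, length)" was missing the "in". Second, a wrong cache
key: hash64() was called on chars_ptr, the start of the whole buffer, rather
than &chars_ptr[start], the start of the current span, so every span after
the first was keyed by the wrong characters and could hit or pollute the
wrong cache entry. Below is a minimal Python sketch of the span-splitting
loop with those fixes applied; hash() and the plain dict are stand-ins for
murmurhash's hash64 and the PreshMap cache (not spaCy's actual API), and the
single-space special-casing mentioned in the code comment is omitted.

    def split_spans(string, cache):
        # Split `string` into alternating runs of whitespace and
        # non-whitespace, keying each run into `cache`. Mirrors the loop
        # in Tokenizer.__call__ as patched above.
        if not string:
            return []
        spans = []
        start = 0
        in_ws = string[0].isspace()
        i = 1
        for i in range(1, len(string)):   # the patch adds the missing 'in'
            if string[i].isspace() != in_ws:
                if start < i:
                    # Hash the current span -- the Python analogue of
                    # hash64(&chars_ptr[start], ...). The buggy code hashed
                    # from chars_ptr (index 0), so every key was computed
                    # over the same leading characters of the buffer.
                    key = hash(string[start:i])
                    spans.append(cache.setdefault(key, string[start:i]))
                in_ws = not in_ws
                start = i
        i += 1                            # cover the final span
        if start < i:
            key = hash(string[start:])
            spans.append(cache.setdefault(key, string[start:]))
        return spans

For example, split_spans("hello world", {}) returns ["hello", " ", "world"];
cache.setdefault plays the role of the _try_cache / _tokenize pair, reusing
the stored value on a hit and computing and storing it on a miss.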