diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index eea7e5ad1..0bf1aba6e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -80,34 +80,37 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        chars = string
-        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
+        cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
         cdef unicode span
+        # Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
+        # But this is hard --- I need to acquire a pointer, but there's no
+        # Py_UCS4 API in Python 2.
+        cdef Py_UNICODE uc
+        cdef Py_UNICODE* chars_ptr = string
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
         for i in range(1, length):
-            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
+            uc = chars_ptr[i]
+            if Py_UNICODE_ISSPACE(uc) != in_ws:
                 if start < i:
-                    span = string[start:i]
-                    key = hash_string(span)
+                    key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
-                        self._tokenize(tokens, span, key)
+                        self._tokenize(tokens, string[start:i], key)
                 in_ws = not in_ws
-                start = i
-                if chars[i] == ' ':
+                if uc == ' ':
                     tokens.data[tokens.length - 1].spacy = True
-                    start += 1
+                    start = i + 1
+                else:
+                    start = i
         i += 1
         if start < i:
-            span = string[start:i]
-            key = hash_string(span)
+            key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
-                self._tokenize(tokens, span, key)
-
+                self._tokenize(tokens, string[start:], key)
         tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
 
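
For reference, the loop above is essentially a specialised string.split: it finds alternating spans of space and non-space characters, emits whitespace spans longer than one character as tokens, and records a lone ' ' separator by setting the spacy flag on the preceding token instead of emitting it. A rough pure-Python sketch follows; the helper name split_like_tokenizer is made up for illustration, and the real _try_cache/_tokenize calls are collapsed into simply collecting each span, so each span stands in for one token here.

    # Hypothetical helper, for illustration only -- not part of spaCy's API.
    def split_like_tokenizer(string):
        # Each entry is [span_text, trailing_space]; the flag mirrors the
        # `spacy` attribute the Cython code sets on the preceding token.
        spans = []
        start = 0
        in_ws = string[0].isspace()
        for i in range(1, len(string)):
            ch = string[i]
            if ch.isspace() != in_ws:
                if start < i:
                    spans.append([string[start:i], False])
                in_ws = not in_ws
                if ch == ' ':
                    # A single ' ' separator is consumed here, not emitted
                    # as a span; it becomes a flag on the previous span.
                    if spans:
                        spans[-1][1] = True
                    start = i + 1
                else:
                    start = i
        if start < len(string):
            spans.append([string[start:], False])
        if spans:
            spans[-1][1] = string[-1] == ' '
        return spans

    # split_like_tokenizer("Hello  world ")
    # => [['Hello', True], [' ', False], ['world', True]]

Note how the doubled space produces a ' ' span of its own (only spans that are exactly ' ' are dropped), which is what lets the original text be reconstructed losslessly from the tokens plus their spacy flags.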