From 519366f6775734d53b508b4b86a09e2069007964 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 4 May 2016 15:53:36 +0200
Subject: [PATCH] * Fix Issue #351: Indices off when leading whitespace

---
 spacy/tokenizer.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index a1a5c289c..229e70793 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -98,7 +98,7 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        cdef bint in_ws = False
+        cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
@@ -116,12 +116,12 @@ cdef class Tokenizer:
                 cache_hit = self._try_cache(key, tokens)
                 if not cache_hit:
                     self._tokenize(tokens, span, key)
-                in_ws = not in_ws
                 if uc == ' ':
                     tokens.c[tokens.length - 1].spacy = True
                     start = i + 1
                 else:
                     start = i
+                in_ws = not in_ws
             i += 1
         i += 1
         if start < i:
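
Note: below is a minimal pure-Python sketch of the span-scanning loop patched above, to show why `in_ws` has to start from `string[0].isspace()`. `scan_spans` is a hypothetical stand-in for the Cython `__call__` body and only collects (offset, text) spans instead of building a Doc; the real tokenizer derives token indices from the tokens it emits, so a leading-whitespace run that never becomes a span is what throws the indices off (Issue #351).

    # Hypothetical helper, not spaCy API: mirrors the whitespace/non-space
    # alternation of the patched loop.
    def scan_spans(string, in_ws):
        spans = []
        i = 0
        start = 0
        for uc in string:
            if uc.isspace() != in_ws:
                if start < i:
                    # Close the current span and record where it began.
                    spans.append((start, string[start:i]))
                if uc == ' ':
                    start = i + 1
                else:
                    start = i
                in_ws = not in_ws
            i += 1
        i += 1
        if start < i:
            spans.append((start, string[start:]))
        return spans

    text = " hello"
    # Old initialisation: the scan starts in "word" state, so the leading
    # space run never becomes a span and the accounting is short one char.
    print(scan_spans(text, in_ws=False))              # [(1, 'hello')]
    # Fixed initialisation: the leading whitespace is captured as its own
    # span, keeping every later offset aligned with the input string.
    print(scan_spans(text, in_ws=text[0].isspace()))  # [(0, ' '), (1, 'hello')]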