From 519366f6775734d53b508b4b86a09e2069007964 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 4 May 2016 15:53:36 +0200
Subject: [PATCH] * Fix Issue #351: Indices off when leading whitespace

---
 spacy/tokenizer.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index a1a5c289c..229e70793 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -98,7 +98,7 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        cdef bint in_ws = False
+        cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
@@ -116,12 +116,12 @@ cdef class Tokenizer:
                 cache_hit = self._try_cache(key, tokens)
                 if not cache_hit:
                     self._tokenize(tokens, span, key)
-                in_ws = not in_ws
                 if uc == ' ':
                     tokens.c[tokens.length - 1].spacy = True
                     start = i + 1
                 else:
                     start = i
+                in_ws = not in_ws
             i += 1
         i += 1
         if start < i:
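
Note: below is a minimal pure-Python sketch of the span-scanning loop patched above, to show why `in_ws` has to start from `string[0].isspace()`. `scan_spans` is a hypothetical stand-in for the Cython `__call__` body and only collects (offset, text) spans instead of building a Doc; the real tokenizer derives token indices from the tokens it emits, so a leading-whitespace run that never becomes a span is what throws the indices off (Issue #351).

    # Hypothetical helper, not spaCy API: mirrors the whitespace/non-space
    # alternation of the patched loop.
    def scan_spans(string, in_ws):
        spans = []
        i = 0
        start = 0
        for uc in string:
            if uc.isspace() != in_ws:
                if start < i:
                    # Close the current span and record where it began.
                    spans.append((start, string[start:i]))
                if uc == ' ':
                    start = i + 1
                else:
                    start = i
                in_ws = not in_ws
            i += 1
        i += 1
        if start < i:
            spans.append((start, string[start:]))
        return spans

    text = " hello"
    # Old initialisation: the scan starts in "word" state, so the leading
    # space run never becomes a span and the accounting is short one char.
    print(scan_spans(text, in_ws=False))              # [(1, 'hello')]
    # Fixed initialisation: the leading whitespace is captured as its own
    # span, keeping every later offset aligned with the input string.
    print(scan_spans(text, in_ws=text[0].isspace()))  # [(0, ' '), (1, 'hello')]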