From 0ac3d2768991521205a6d0e365303560521b6108 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 8 Mar 2017 15:01:40 +0100
Subject: [PATCH] Fix handling of trailing whitespace

Fix off-by-one error that meant trailing spaces were being dropped.
Closes #792
---
 spacy/tests/regression/test_issue792.py | 12 +++++++++---
 spacy/tokenizer.pyx                     |  1 -
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py
index 563e061a6..df8b5ef50 100644
--- a/spacy/tests/regression/test_issue792.py
+++ b/spacy/tests/regression/test_issue792.py
@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
     doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+    """Test base case for Issue #792: Text without a trailing space."""
+    doc = en_tokenizer(text)
+    assert ''.join([token.text_with_ws for token in doc]) == text
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 1b74431ff..5a4eb844a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -163,7 +163,6 @@ cdef class Tokenizer:
                     start = i
                 in_ws = not in_ws
             i += 1
-        i += 1
         if start < i:
             span = string[start:]
             key = hash_string(span)