Fix handling of trailing whitespace

Fix off-by-one error that meant trailing spaces were being dropped. Closes #792
2017-03-08 15:01:40 +01:00 · 2017-03-08 15:01:40 +01:00 · 0ac3d27689
parent 77f0594761
commit 0ac3d27689
2 changed files with 9 additions and 4 deletions
--- a/spacy/tests/regression/test_issue792.py
+++ b/spacy/tests/regression/test_issue792.py
@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
    doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
 def test_control_issue792(en_tokenizer, text):
    """Test base case for Issue #792: Non-trailing whitespace"""
    doc = en_tokenizer(text)
    assert ''.join([token.text_with_ws for token in doc]) == text
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -163,7 +163,6 @@ cdef class Tokenizer:
                    start = i
                in_ws = not in_ws
            i += 1
        i += 1
        if start < i:
            span = string[start:]
            key = hash_string(span)