Fix handling of trailing whitespace

Fix off-by-one error that meant trailing spaces were being dropped.
Closes #792
This commit is contained in:
Matthew Honnibal 2017-03-08 15:01:40 +01:00
parent 77f0594761
commit 0ac3d27689
2 changed files with 9 additions and 4 deletions

View File

@ -4,9 +4,15 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text): def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after parsing.""" """Test for Issue #792: Trailing whitespace is removed after tokenization."""
doc = en_tokenizer(text) doc = en_tokenizer(text)
assert doc.text_with_ws == text assert ''.join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Control cases for Issue #792.

    Tokenizes a string with no trailing whitespace and one that ends in a
    newline, then checks that concatenating every token's ``text_with_ws``
    reproduces the input exactly.
    """
    tokens = en_tokenizer(text)
    # Rebuild the text from the per-token pieces, whitespace included.
    rebuilt = ''.join(token.text_with_ws for token in tokens)
    assert rebuilt == text

View File

@ -163,7 +163,6 @@ cdef class Tokenizer:
start = i start = i
in_ws = not in_ws in_ws = not in_ws
i += 1 i += 1
i += 1
if start < i: if start < i:
span = string[start:] span = string[start:]
key = hash_string(span) key = hash_string(span)