From 0ac3d2768991521205a6d0e365303560521b6108 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 8 Mar 2017 15:01:40 +0100
Subject: [PATCH] Fix handling of trailing whitespace

Fix off-by-one error that meant trailing spaces were being dropped.

Closes #792
---
 spacy/tests/regression/test_issue792.py | 12 +++++++++---
 spacy/tokenizer.pyx                     |  1 -
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py
index 563e061a6..df8b5ef50 100644
--- a/spacy/tests/regression/test_issue792.py
+++ b/spacy/tests/regression/test_issue792.py
@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
     doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+    """Test base case for Issue #792: Non-trailing whitespace"""
+    doc = en_tokenizer(text)
+    assert ''.join([token.text_with_ws for token in doc]) == text
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 1b74431ff..5a4eb844a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -163,7 +163,6 @@ cdef class Tokenizer:
                     start = i
                 in_ws = not in_ws
             i += 1
-        i += 1
         if start < i:
             span = string[start:]
             key = hash_string(span)