mirror of https://github.com/explosion/spaCy.git
Fix handling of trailing whitespace
Fix off-by-one error that meant trailing spaces were being dropped. Closes #792
This commit is contained in:
parent
77f0594761
commit
0ac3d27689
|
@@ -4,9 +4,15 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
||||||
def test_issue792(en_tokenizer, text):
|
def test_issue792(en_tokenizer, text):
|
||||||
"""Test for Issue #792: Trailing whitespace is removed after parsing."""
|
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
assert doc.text_with_ws == text
|
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
|
||||||
|
def test_control_issue792(en_tokenizer, text):
|
||||||
|
"""Test base case for Issue #792: Non-trailing whitespace"""
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||||
|
|
|
@@ -163,7 +163,6 @@ cdef class Tokenizer:
|
||||||
start = i
|
start = i
|
||||||
in_ws = not in_ws
|
in_ws = not in_ws
|
||||||
i += 1
|
i += 1
|
||||||
i += 1
|
|
||||||
if start < i:
|
if start < i:
|
||||||
span = string[start:]
|
span = string[start:]
|
||||||
key = hash_string(span)
|
key = hash_string(span)
|
||||||
|
|
Loading…
Reference in New Issue