2017-01-31 13:47:42 +00:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2017-01-31 14:18:30 +00:00
|
|
|
import pytest
|
2017-01-31 13:47:42 +00:00
|
|
|
|
2017-01-31 14:19:33 +00:00
|
|
|
|
2017-01-31 14:14:42 +00:00
|
|
|
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Regression test for Issue #792: trailing whitespace lost on tokenization.

    Tokenizes each parametrized string (both end in a single space, one
    written literally and one as an escape) and checks that concatenating
    every token's ``text_with_ws`` reproduces the input exactly.
    """
    tokens = en_tokenizer(text)
    # Rebuild the input from the tokens, including their trailing whitespace.
    reconstructed = ''.join(token.text_with_ws for token in tokens)
    assert reconstructed == text
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Control case for Issue #792: inputs that do not end in a space.

    Runs the same round-trip check as the regression test on a string with
    no trailing whitespace and on one ending in a newline: joining each
    token's ``text_with_ws`` must give back the original text.
    """
    tokens = en_tokenizer(text)
    # Collect each token's text together with its trailing whitespace.
    pieces = [token.text_with_ws for token in tokens]
    assert ''.join(pieces) == text
|