spaCy/spacy/tests/regression/test_issue886.py

14 lines
426 B
Python
Raw Normal View History

2017-03-13 10:44:44 +00:00
# coding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
    """Test that token.idx matches the original text index for texts with newlines.

    The parametrized text contains an embedded newline. Every token produced
    from it must point back, via ``token.idx``, at the exact position in
    ``text`` where the token's own text is found.

    ``en_tokenizer`` is not parametrized here, so it is presumably supplied
    by a pytest fixture defined elsewhere in the test suite.
    """
    doc = en_tokenizer(text)
    for token in doc:
        # For this input no token is expected to own trailing whitespace:
        # the text with and without whitespace must have the same length.
        assert len(token.text) == len(token.text_with_ws)
        # Compare the whole token span rather than only its first character,
        # so an offset that is right for one character but wrong for the
        # rest of the token is still caught.
        start = token.idx
        end = start + len(token.text)
        assert text[start:end] == token.text