spaCy/spacy/tests/pipeline/test_functions.py

25 lines
882 B
Python
Raw Normal View History

# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.pipeline.functions import merge_subtokens
from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
# fmt: off
text = "This is a sentence. This is another sentence. And a third."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
"subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
# fmt: on
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_merge_subtokens(doc):
doc = merge_subtokens(doc)
# get_doc() doesn't set spaces, so the result is "And a third ."
assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]