spaCy/spacy/tests/doc/test_doc_split.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc

from ..util import get_doc


def test_doc_split(en_vocab):
    """Split "LosAngeles" into "Los" + "Angeles" and inspect the result.

    Checks the token count, token texts, heads and character offsets after
    the split, and that the length of the doc's text is left unchanged.
    """
    doc = get_doc(en_vocab, words=["LosAngeles", "start", "."], heads=[1, 1, 0])
    text_length = 19
    assert len(doc) == 3
    assert len(str(doc)) == text_length
    assert [doc[i].head.text for i in range(2)] == ["start", "."]
    with doc.retokenize() as retokenizer:
        target = doc[0]
        # First piece is headed by the second piece (sub-token index 1);
        # the second piece is headed by the existing "start" token.
        piece_heads = [(target, 1), doc[1]]
        piece_attrs = {
            "tag": ["NNP", "NNP"],
            "lemma": ["Los", "Angeles"],
            "ent_type": ["GPE", "GPE"],
        }
        retokenizer.split(target, ["Los", "Angeles"], piece_heads, attrs=piece_attrs)
    assert len(doc) == 4
    assert [token.text for token in doc] == ["Los", "Angeles", "start", "."]
    assert [token.head.text for token in doc] == ["Angeles", "start", ".", "."]
    # Character offsets of the two new tokens within the doc text.
    assert [doc[0].idx, doc[1].idx] == [0, 3]
    assert len(str(doc)) == text_length


def test_split_dependencies(en_vocab):
    """Check that "dep" IDs passed through attrs end up on the split tokens."""
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Register the labels in the string store and keep the returned IDs.
    expected_deps = [
        doc.vocab.strings.add("amod"),
        doc.vocab.strings.add("subject"),
    ]
    with doc.retokenize() as retokenizer:
        piece_heads = [(doc[0], 1), doc[1]]
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            piece_heads,
            attrs={"dep": list(expected_deps)},
        )
    assert [doc[0].dep, doc[1].dep] == expected_deps


def test_split_heads_error(en_vocab):
    """Expect a ValueError when the number of heads differs from the pieces."""
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    pieces = ["Los", "Angeles"]
    # Two pieces need two heads: try one head (too few), then three (too many).
    for head_count in (1, 3):
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                wrong_heads = [doc[1] for _ in range(head_count)]
                retokenizer.split(doc[0], pieces, wrong_heads)


def test_spans_entity_merge_iob():
    """Check that entity IOB tags stay consistent after splitting a token.

    Despite the "merge" in the name, this test splits the first token of a
    two-token entity and expects every resulting piece to stay in the entity.
    """
    doc = Doc(Vocab(), words=["abc", "d", "e"])
    label = doc.vocab.strings.add("ent-abcd")
    # The entity covers tokens 0 and 1 ("abc", "d").
    doc.ents = [(label, 0, 2)]
    assert [doc[i].ent_iob_ for i in range(2)] == ["B", "I"]
    with doc.retokenize() as retokenizer:
        piece_heads = [(doc[0], 1), (doc[0], 2), doc[1]]
        retokenizer.split(doc[0], ["a", "b", "c"], piece_heads)
    # "a" begins the entity; "b", "c" and the untouched "d" continue it.
    assert [doc[i].ent_iob_ for i in range(4)] == ["B", "I", "I", "I"]


def test_spans_sentence_update_after_merge(en_vocab):
    """Check that sentence spans grow by one token after a split in each.

    Despite the "merge" in the name, this test splits one token in each of
    the two sentences and compares sentence lengths before and after.
    """
    # fmt: off
    words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
             "lives", "in", "England", "and", "loves", "JoePasquale", "."]
    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
    deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
            "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    first, second = doc.sents
    lengths_before = (len(first), len(second))
    with doc.retokenize() as retokenizer:
        # One split in the first sentence ...
        retokenizer.split(
            doc[0],
            ["Stewart", "Lee"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": ["compound", "nsubj"]},
        )
        # ... and one in the second sentence.
        retokenizer.split(
            doc[13],
            ["Joe", "Pasquale"],
            [(doc[13], 1), doc[12]],
            attrs={"dep": ["compound", "dobj"]},
        )
    first, second = doc.sents
    assert len(first) == lengths_before[0] + 1
    assert len(second) == lengths_before[1] + 1


def test_split_orths_mismatch(en_vocab):
    """Test that retokenizer.split rejects orths that don't match the token.

    The pieces "L" + "A" do not spell out the original "LosAngeles", so the
    regular split is expected to raise a ValueError. A separate method might
    permit this some day, but for the default use cases merging and splitting
    should always follow spaCy's non-destructive tokenization policy;
    otherwise the results can be very confusing and unexpected.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    mismatched_orths = ["L", "A"]
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            piece_heads = [(doc[0], 0), (doc[0], 0)]
            retokenizer.split(doc[0], mismatched_orths, piece_heads)
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import pytest`
Auto-formatting 2019-02-14 18:56:38 +00:00			`from spacy.vocab import Vocab`
			`from spacy.tokens import Doc`

			`from ..util import get_doc`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00

Tidy up tests 2019-02-15 11:56:51 +00:00			`def test_doc_split(en_vocab):`
			`words = ["LosAngeles", "start", "."]`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`heads = [1, 1, 0]`
Tidy up tests 2019-02-15 11:56:51 +00:00			`doc = get_doc(en_vocab, words=words, heads=heads)`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert len(doc) == 3`
			`assert len(str(doc)) == 19`
Auto-formatting 2019-02-14 18:56:38 +00:00			`assert doc[0].head.text == "start"`
			`assert doc[1].head.text == "."`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`with doc.retokenize() as retokenizer:`
Refinements to retokenize.split() function (#3282) * Change retokenize.split() API for heads * Pass lists as values for attrs in split * Fix test_doc_split filename * Add error for mismatched tokens after split * Raise error if new tokens don't match text * Fix doc test * Fix error * Move deps under attrs * Fix split tests * Fix retokenize.split 2019-02-15 16:32:31 +00:00			`retokenizer.split(`
			`doc[0],`
			`["Los", "Angeles"],`
			`[(doc[0], 1), doc[1]],`
			`attrs={`
Auto-format 2019-02-17 11:22:07 +00:00			`"tag": ["NNP"] * 2,`
Refinements to retokenize.split() function (#3282) * Change retokenize.split() API for heads * Pass lists as values for attrs in split * Fix test_doc_split filename * Add error for mismatched tokens after split * Raise error if new tokens don't match text * Fix doc test * Fix error * Move deps under attrs * Fix split tests * Fix retokenize.split 2019-02-15 16:32:31 +00:00			`"lemma": ["Los", "Angeles"],`
Auto-format 2019-02-17 11:22:07 +00:00			`"ent_type": ["GPE"] * 2,`
Refinements to retokenize.split() function (#3282) * Change retokenize.split() API for heads * Pass lists as values for attrs in split * Fix test_doc_split filename * Add error for mismatched tokens after split * Raise error if new tokens don't match text * Fix doc test * Fix error * Move deps under attrs * Fix split tests * Fix retokenize.split 2019-02-15 16:32:31 +00:00			`},`
			`)`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert len(doc) == 4`
Auto-formatting 2019-02-14 18:56:38 +00:00			`assert doc[0].text == "Los"`
			`assert doc[0].head.text == "Angeles"`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert doc[0].idx == 0`
			`assert doc[1].idx == 3`
Auto-formatting 2019-02-14 18:56:38 +00:00			`assert doc[1].text == "Angeles"`
			`assert doc[1].head.text == "start"`
			`assert doc[2].text == "start"`
			`assert doc[2].head.text == "."`
			`assert doc[3].text == "."`
			`assert doc[3].head.text == "."`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert len(str(doc)) == 19`

Auto-formatting 2019-02-14 18:56:38 +00:00
Tidy up tests 2019-02-15 11:56:51 +00:00			`def test_split_dependencies(en_vocab):`
			`doc = Doc(en_vocab, words=["LosAngeles", "start", "."])`
Auto-formatting 2019-02-14 18:56:38 +00:00			`dep1 = doc.vocab.strings.add("amod")`
			`dep2 = doc.vocab.strings.add("subject")`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`with doc.retokenize() as retokenizer:`
Auto-format 2019-02-17 11:22:07 +00:00			`retokenizer.split(`
			`doc[0],`
			`["Los", "Angeles"],`
			`[(doc[0], 1), doc[1]],`
			`attrs={"dep": [dep1, dep2]},`
			`)`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert doc[0].dep == dep1`
			`assert doc[1].dep == dep2`


Tidy up tests 2019-02-15 11:56:51 +00:00			`def test_split_heads_error(en_vocab):`
			`doc = Doc(en_vocab, words=["LosAngeles", "start", "."])`
Auto-formatting 2019-02-14 18:56:38 +00:00			`# Not enough heads`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`with pytest.raises(ValueError):`
			`with doc.retokenize() as retokenizer:`
Refinements to retokenize.split() function (#3282) * Change retokenize.split() API for heads * Pass lists as values for attrs in split * Fix test_doc_split filename * Add error for mismatched tokens after split * Raise error if new tokens don't match text * Fix doc test * Fix error * Move deps under attrs * Fix split tests * Fix retokenize.split 2019-02-15 16:32:31 +00:00			`retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00
Auto-formatting 2019-02-14 18:56:38 +00:00			`# Too many heads`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`with pytest.raises(ValueError):`
			`with doc.retokenize() as retokenizer:`
Refinements to retokenize.split() function (#3282) * Change retokenize.split() API for heads * Pass lists as values for attrs in split * Fix test_doc_split filename * Add error for mismatched tokens after split * Raise error if new tokens don't match text * Fix doc test * Fix error * Move deps under attrs * Fix split tests * Fix retokenize.split 2019-02-15 16:32:31 +00:00			`retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])`
Add xfailing test for out-of-bounds heads 2019-02-15 12:09:07 +00:00

Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`def test_spans_entity_merge_iob():`
			`# Test entity IOB stays consistent after merging`
			`words = ["abc", "d", "e"]`
			`doc = Doc(Vocab(), words=words)`
Auto-formatting 2019-02-14 18:56:38 +00:00			`doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert doc[0].ent_iob_ == "B"`
			`assert doc[1].ent_iob_ == "I"`
			`with doc.retokenize() as retokenizer:`
Auto-format 2019-02-17 11:22:07 +00:00			`retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`assert doc[0].ent_iob_ == "B"`
			`assert doc[1].ent_iob_ == "I"`
			`assert doc[2].ent_iob_ == "I"`
			`assert doc[3].ent_iob_ == "I"`

Auto-formatting 2019-02-14 18:56:38 +00:00
Tidy up tests 2019-02-15 11:56:51 +00:00			`def test_spans_sentence_update_after_merge(en_vocab):`
Auto-formatting 2019-02-14 18:56:38 +00:00			`# fmt: off`
Tidy up tests 2019-02-15 11:56:51 +00:00			`words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",`
			`"lives", "in", "England", "and", "loves", "JoePasquale", "."]`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]`
Auto-formatting 2019-02-14 18:56:38 +00:00			`deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",`
			`"ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]`
			`# fmt: on`
Tidy up tests 2019-02-15 11:56:51 +00:00			`doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`sent1, sent2 = list(doc.sents)`
			`init_len = len(sent1)`
			`init_len2 = len(sent2)`
			`with doc.retokenize() as retokenizer:`
Auto-format 2019-02-17 11:22:07 +00:00			`retokenizer.split(`
			`doc[0],`
			`["Stewart", "Lee"],`
			`[(doc[0], 1), doc[1]],`
			`attrs={"dep": ["compound", "nsubj"]},`
			`)`
			`retokenizer.split(`
			`doc[13],`
			`["Joe", "Pasquale"],`
			`[(doc[13], 1), doc[12]],`
			`attrs={"dep": ["compound", "dobj"]},`
			`)`
Add split one token into several (resolves #2838) (#3253) * Add split one token into several (resolves #2838) * Improve error message for token splitting * Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. * Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. * Fix token.idx in retokenize.split() * Test that token.idx is correct after split * Fix token.idx for split tokens * Fix retokenize.split() * Fix retokenize.split * Fix retokenize.split() test 2019-02-14 14:27:13 +00:00			`sent1, sent2 = list(doc.sents)`
			`assert len(sent1) == init_len + 1`
			`assert len(sent2) == init_len2 + 1`
Add xfailing test for orth mismatch in retokenizer.split 2019-02-15 12:55:04 +00:00

			`def test_split_orths_mismatch(en_vocab):`
			`"""Test that the regular retokenizer.split raises an error if the orths`
			`don't match the original token text. There might still be a method that`
			`allows this, but for the default use cases, merging and splitting should`
			`always conform with spaCy's non-destructive tokenization policy. Otherwise,`
			`it can lead to very confusing and unexpected results.`
			`"""`
			`doc = Doc(en_vocab, words=["LosAngeles", "start", "."])`
			`with pytest.raises(ValueError):`
			`with doc.retokenize() as retokenizer:`
Refinements to retokenize.split() function (#3282) * Change retokenize.split() API for heads * Pass lists as values for attrs in split * Fix test_doc_split filename * Add error for mismatched tokens after split * Raise error if new tokens don't match text * Fix doc test * Fix error * Move deps under attrs * Fix split tests * Fix retokenize.split 2019-02-15 16:32:31 +00:00			`retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])`