spaCy/spacy/tests/doc/test_doc_api.py

689 lines
24 KiB
Python

import weakref
import pytest
import numpy
import logging
import mock
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span, Token
from spacy.vocab import Vocab
from spacy.lexeme import Lexeme
from spacy.lang.en import English
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
from .test_underscore import clean_underscore # noqa: F401
def test_doc_api_init(en_vocab):
    """Check which sentence starts a Doc reports for different init arguments."""
    tokens = ["a", "b", "c", "d"]
    head_indices = [0, 0, 2, 2]
    expected_starts = [True, False, True, False]

    def sent_start_flags(doc):
        return [tok.is_sent_start for tok in doc]

    # sentence starts passed explicitly through sent_starts
    explicit = Doc(en_vocab, words=tokens, sent_starts=[True, False, True, False])
    assert sent_start_flags(explicit) == expected_starts

    # sentence starts obtained from the heads
    from_heads = Doc(en_vocab, words=tokens, heads=head_indices, deps=["dep"] * 4)
    assert sent_start_flags(from_heads) == expected_starts

    # when both are given, the heads win over sent_starts
    both = Doc(
        en_vocab,
        words=tokens,
        sent_starts=[True] * 4,
        heads=head_indices,
        deps=["dep"] * 4,
    )
    assert sent_start_flags(both) == expected_starts
@pytest.mark.parametrize("text", [["one", "two", "three"]])
def test_doc_api_compare_by_string_position(en_vocab, text):
    """Check that tokens of one doc order themselves by position in the text.

    The tokens are fetched back-to-front so that the order in which the
    Token objects are obtained does not match their order in the doc.
    """
    doc = Doc(en_vocab, words=text)
    # Get the tokens in this order, so their ID ordering doesn't match the idx
    token3 = doc[-1]
    token2 = doc[-2]
    token1 = doc[-3]
    # Make sure the negative indices picked the tokens we think they did.
    assert [token1.text, token2.text, token3.text] == text
    assert token1 < token2 < token3
    assert not token1 > token2
    assert token2 > token1
    assert token2 <= token3
    assert token3 >= token1
def test_doc_api_getitem(en_tokenizer):
    """Exercise integer indexing and slicing on a Doc and on a Span."""
    doc = en_tokenizer("Give it back! He pleaded.")
    assert doc[0].text == "Give"
    assert doc[-1].text == "."
    with pytest.raises(IndexError):
        doc[len(doc)]

    def joined(tokens):
        return "/".join([tok.text for tok in tokens])

    def check_text(container, key, expected):
        assert joined(container[key]) == expected

    def check_empty(container, key, position):
        piece = container[key]
        assert piece.start == position
        assert piece.end == position
        assert joined(piece) == ""

    check_text(doc, slice(1, 1), "")
    check_text(doc, slice(1, 4), "it/back/!")
    check_text(doc, slice(1, 4, 1), "it/back/!")
    # steps other than 1 are rejected
    for bad_step in (2, -1):
        with pytest.raises(ValueError):
            doc[1:4:bad_step]

    check_text(doc, slice(-3, 6), "He/pleaded")
    check_text(doc, slice(4, -1), "He/pleaded")
    check_text(doc, slice(-5, -3), "back/!")
    check_empty(doc, slice(5, 4), 5)
    check_empty(doc, slice(4, -3), 4)

    check_text(doc, slice(None, None), "Give/it/back/!/He/pleaded/.")
    check_text(doc, slice(4, None), "He/pleaded/.")
    check_text(doc, slice(None, 4), "Give/it/back/!")
    check_text(doc, slice(None, -3), "Give/it/back/!")
    check_text(doc, slice(-3, None), "He/pleaded/.")

    # out-of-range bounds
    check_text(doc, slice(4, 50), "He/pleaded/.")
    check_text(doc, slice(-50, 4), "Give/it/back/!")
    check_empty(doc, slice(-50, -40), 0)
    check_empty(doc, slice(40, 50), 7)

    # the same again, this time slicing a span
    span = doc[1:4]
    assert span[0].orth_ == "it"
    check_text(span, slice(None, None), "it/back/!")
    check_text(span, slice(None, 2), "it/back")
    check_text(span, slice(1, None), "back/!")
    check_text(span, slice(None, -1), "it/back")
    check_text(span, slice(-2, None), "back/!")
    check_text(span, slice(1, 2), "back")
    check_text(span, slice(-2, -1), "back")
    check_text(span, slice(-50, 50), "it/back/!")
    check_empty(span, slice(50, -50), 4)
@pytest.mark.parametrize(
    "text", ["Give it back! He pleaded.", " Give it back! He pleaded. "]
)
def test_doc_api_serialize(en_tokenizer, text):
    """Round-trip a Doc through to_bytes/from_bytes and check what comes back."""
    doc = en_tokenizer(text)
    first = doc[0]
    first.lemma_ = "lemma"
    first.norm_ = "norm"
    doc.ents = [(doc.vocab.strings["PRODUCT"], 0, 1)]
    doc[0].ent_kb_id_ = "ent_kb_id"
    doc[0].ent_id_ = "ent_id"

    def assert_same_tokens(restored):
        assert doc.text == restored.text
        assert [t.text for t in doc] == [t.text for t in restored]
        assert [t.orth for t in doc] == [t.orth for t in restored]

    # plain round trip keeps the token attributes set above
    restored = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert_same_tokens(restored)
    restored_first = restored[0]
    assert restored_first.lemma_ == "lemma"
    assert restored_first.norm_ == "norm"
    assert restored_first.ent_kb_id_ == "ent_kb_id"
    assert restored_first.ent_id_ == "ent_id"

    # round trips with a field excluded on both sides
    for field in ("tensor", "sentiment"):
        payload = doc.to_bytes(exclude=[field])
        restored = Doc(doc.vocab).from_bytes(payload, exclude=[field])
        assert_same_tokens(restored)

    def inner_func(d1, d2):
        return "hello!"

    # logger.warning is called once a user hook is set, and not before
    spacy_logger = logging.getLogger("spacy")
    with mock.patch.object(spacy_logger, "warning") as mock_warning:
        doc.to_bytes()
        mock_warning.assert_not_called()
        doc.user_hooks["similarity"] = inner_func
        doc.to_bytes()
        mock_warning.assert_called_once()
def test_doc_api_set_ents(en_tokenizer):
    """Assign one entity as a (label, start, end) tuple and read it back."""
    doc = en_tokenizer("I use goggle chrone to surf the web")
    assert len(doc.ents) == 0

    product_label = doc.vocab.strings["PRODUCT"]
    doc.ents = [(product_label, 2, 4)]

    assert len(list(doc.ents)) == 1
    iob_values = [tok.ent_iob for tok in doc]
    assert iob_values == [2, 2, 3, 1, 2, 2, 2, 2]
    assert doc.ents[0].label_ == "PRODUCT"
    assert doc.ents[0].start == 2
    assert doc.ents[0].end == 4
def test_doc_api_sents_empty_string(en_tokenizer):
    """An empty text yields a doc with no sentences."""
    empty_doc = en_tokenizer("")
    assert len(list(empty_doc.sents)) == 0
def test_doc_api_runtime_error(en_tokenizer):
    """Merge trimmed noun chunks on a text that once triggered a run-time error."""
    # Example that caused run-time error while parsing Reddit
    # fmt: off
    text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
            "amod", "pobj", "acl", "prep", "prep", "pobj",
            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
    # fmt: on
    tokenized = en_tokenizer(text)
    doc = Doc(tokenized.vocab, words=[tok.text for tok in tokenized], deps=deps)

    modifier_deps = ("advmod", "amod", "compound")
    chunks_to_merge = []
    for chunk in doc.noun_chunks:
        # drop leading tokens until the chunk starts with a modifier
        while True:
            if len(chunk) <= 1 or chunk[0].dep_ in modifier_deps:
                break
            chunk = chunk[1:]
        if len(chunk) > 1:
            chunks_to_merge.append(chunk)

    with doc.retokenize() as retokenizer:
        for chunk in chunks_to_merge:
            root = chunk.root
            retokenizer.merge(
                chunk,
                attrs={
                    "tag": root.tag_,
                    "lemma": chunk.text,
                    "ent_type": root.ent_type_,
                },
            )
def test_doc_api_right_edge(en_vocab):
    """Test for bug occurring from Unshift action, causing incorrect right edge"""
    # fmt: off
    words = [
        "I", "have", "proposed", "to", "myself", ",", "for", "the", "sake",
        "of", "such", "as", "live", "under", "the", "government", "of", "the",
        "Romans", ",", "to", "translate", "those", "books", "into", "the",
        "Greek", "tongue", "."
    ]
    heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2]
    expected_subtree = ["for", "the", "sake", "of", "such", "as", "live", "under", "the", "government", "of", "the", "Romans", ","]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))

    for_token = doc[6]
    assert for_token.text == "for"
    assert [tok.text for tok in doc[6].subtree] == expected_subtree
    assert doc[6].right_edge.text == ","
def test_doc_api_has_vector():
    """A doc whose only word has a vector in the vocab reports has_vector."""
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    kitten_vector = numpy.asarray([0.0, 2.0], dtype="f")
    vocab.set_vector("kitten", vector=kitten_vector)
    kitten_doc = Doc(vocab, words=["kitten"])
    assert kitten_doc.has_vector
def test_doc_api_similarity_match():
    """Compare a one-word doc against a token, a lexeme, a span and a doc."""
    single = Doc(Vocab(), words=["a"])
    first_token = single[0]
    assert single.similarity(first_token) == 1.0
    lexeme = single.vocab["a"]
    assert single.similarity(lexeme) == 1.0

    longer = Doc(single.vocab, words=["a", "b", "c"])
    # the comparisons below are expected to emit a UserWarning
    with pytest.warns(UserWarning):
        assert single.similarity(longer[:1]) == 1.0
        assert single.similarity(longer) == 0.0
@pytest.mark.parametrize(
    "words,heads,lca_matrix",
    [
        (
            ["the", "lazy", "dog", "slept"],
            [2, 2, 3, 3],
            numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
        ),
        (
            ["The", "lazy", "dog", "slept", ".", "The", "quick", "fox", "jumped"],
            [2, 2, 3, 3, 3, 7, 7, 8, 8],
            numpy.array(
                [
                    [0, 2, 2, 3, 3, -1, -1, -1, -1],
                    [2, 1, 2, 3, 3, -1, -1, -1, -1],
                    [2, 2, 2, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 4, -1, -1, -1, -1],
                    [-1, -1, -1, -1, -1, 5, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 6, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 8, 8, 8, 8],
                ]
            ),
        ),
    ],
)
def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix):
    """Compare Doc.get_lca_matrix() against the expected matrix for each parse."""
    dep_labels = ["dep"] * len(heads)
    doc = Doc(en_vocab, words, heads=heads, deps=dep_labels)
    computed = doc.get_lca_matrix()
    matches = computed == lca_matrix
    assert matches.all()
    # spot checks on individual cells
    assert computed[1, 1] == 1
    assert computed[0, 1] == 2
    assert computed[1, 2] == 2
def test_doc_is_nered(en_vocab):
    """Check has_annotation("ENT_IOB") after setting ents, from_array and from_bytes."""
    words = ["I", "live", "in", "New", "York"]

    plain = Doc(en_vocab, words=words)
    assert not plain.has_annotation("ENT_IOB")
    plain.ents = [Span(plain, 3, 5, label="GPE")]
    assert plain.has_annotation("ENT_IOB")

    # Test creating doc from array with unknown values
    ent_columns = numpy.array(
        [[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64"
    )
    from_array = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], ent_columns)
    assert from_array.has_annotation("ENT_IOB")

    # Test serialization
    payload = from_array.to_bytes()
    restored = Doc(en_vocab).from_bytes(payload)
    assert restored.has_annotation("ENT_IOB")
def test_doc_from_array_sent_starts(en_vocab):
    """Check how Doc.from_array handles the SENT_START and HEAD columns.

    Covers loading SENT_START together with HEAD, loading the doc's default
    array attrs without any warning, and loading SENT_START or HEAD/DEP alone.
    """
    # Imported locally: the warning-free check below relies on the stdlib
    # warnings module instead of pytest.warns(None), which newer pytest
    # versions no longer accept.
    import warnings

    # fmt: off
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with warnings.catch_warnings():
        # Turn any warning into an exception so that it fails the test.
        warnings.simplefilter("error")
        new_doc.from_array(attrs, arr)
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.has_annotation("DEP")
    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.has_annotation("DEP")
def test_doc_from_array_morph(en_vocab):
    """Copy the MORPH column between docs via to_array/from_array."""
    # fmt: off
    words = ["I", "live", "in", "New", "York", "."]
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
    # fmt: on
    source = Doc(en_vocab, words=words, morphs=morphs)
    morph_column = source.to_array([MORPH])

    target = Doc(en_vocab, words=words)
    target.from_array([MORPH], morph_column)

    target_morphs = [str(tok.morph) for tok in target]
    assert target_morphs == morphs
    assert [str(tok.morph) for tok in source] == [str(tok.morph) for tok in target]
@pytest.mark.usefixtures("clean_underscore")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    """Merge several docs with Doc.from_docs and inspect the merged doc.

    Checked for the default call, for ensure_whitespace=False and for an
    explicit attrs list: the text, the token count, the character offset of
    "think" and the "group" span group. The default call additionally checks
    the custom ``is_ambiguous`` token extension.
    """
    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    # Combined length of the inputs, not counting any delimiter.
    en_texts_total_len = sum(len(t) for t in en_texts)
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
    en_docs[0].spans["group"] = [en_docs[0][1:4]]
    en_docs[2].spans["group"] = [en_docs[2][1:4]]
    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
    de_doc = de_tokenizer(de_text)
    Token.set_extension("is_ambiguous", default=False)
    en_docs[0][2]._.is_ambiguous = True  # docs
    en_docs[2][3]._.is_ambiguous = True  # think
    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))

    # docs from the English and the German tokenizer cannot be merged
    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    # default: a space is inserted between the merged docs
    m_doc = Doc.from_docs(en_docs)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(str(m_doc)) > en_texts_total_len
    assert str(m_doc) == " ".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[2]._.is_ambiguous is True
    assert m_doc[9].idx == think_idx
    assert m_doc[9]._.is_ambiguous is True
    assert not any(t._.is_ambiguous for t in m_doc[3:8])
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    # ensure_whitespace=False: the texts are concatenated as they are
    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(str(m_doc)) == en_texts_total_len
    assert str(m_doc) == "".join(en_texts)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    # explicit attrs list
    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
    assert len(str(m_doc)) > en_texts_total_len
    # space delimiter considered, although spacy attribute was missing
    assert str(m_doc) == " ".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    # can merge empty docs
    Doc.from_docs([en_tokenizer("")] * 10)
def test_doc_api_from_docs_ents(en_tokenizer):
    """Merging a doc without ents and a doc with one ent yields one ent."""
    first = en_tokenizer("Merging the docs is fun.")
    second = en_tokenizer("They don't think alike.")
    first.ents = ()
    second.ents = (Span(second, 0, 1, label="foo"),)
    merged = Doc.from_docs([first, second])
    assert len(merged.ents) == 1
def test_doc_lang(en_vocab):
    """Check lang/lang_ on a doc and its first token, for two ways of building a doc."""

    def assert_english(doc):
        first = doc[0]
        assert doc.lang_ == "en"
        assert doc.lang == en_vocab.strings["en"]
        assert first.lang_ == "en"
        assert first.lang == en_vocab.strings["en"]

    # doc built directly on the English vocab
    assert_english(Doc(en_vocab, words=["Hello", "world"]))

    # doc produced by an English pipeline
    nlp = English()
    assert_english(nlp("Hello world"))
def test_token_lexeme(en_vocab):
    """Test that tokens expose their lexeme."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    hello = doc[0]
    lexeme = hello.lex
    assert isinstance(lexeme, Lexeme)
    assert hello.lex.text == hello.text
    assert en_vocab[hello.orth] == hello.lex
def test_has_annotation(en_vocab):
    """Check has_annotation on an unannotated, partly and fully annotated doc."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    attr_names = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")

    # nothing set yet
    for name in attr_names:
        assert not doc.has_annotation(name)

    # annotate the first token only
    hello = doc[0]
    hello.tag_ = "A"
    hello.pos_ = "X"
    hello.set_morph("Feat=Val")
    hello.lemma_ = "a"
    hello.dep_ = "dep"
    hello.head = doc[1]
    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")

    for name in attr_names:
        assert doc.has_annotation(name)
        assert not doc.has_annotation(name, require_complete=True)

    # annotate the second token as well
    world = doc[1]
    world.tag_ = "A"
    world.pos_ = "X"
    world.set_morph("")
    world.lemma_ = "a"
    world.dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]

    for name in attr_names:
        assert doc.has_annotation(name)
        assert doc.has_annotation(name, require_complete=True)
def test_is_flags_deprecated(en_tokenizer):
    """Reading any of the old is_* flags on a Doc triggers a deprecation warning."""
    doc = en_tokenizer("test")
    for flag_name in ("is_tagged", "is_parsed", "is_nered", "is_sentenced"):
        with pytest.deprecated_call():
            getattr(doc, flag_name)
def test_doc_set_ents(en_tokenizer):
    """Walk through Doc.set_ents with its different keyword arguments."""

    def fresh_doc():
        return en_tokenizer("a b c d e")

    def iobs(d):
        return [tok.ent_iob for tok in d]

    def ent_types(d):
        return [tok.ent_type for tok in d]

    def two_ents(d):
        return [Span(d, 0, 1, 10), Span(d, 1, 3, 11)]

    # set ents
    doc = fresh_doc()
    doc.set_ents(two_ents(doc))
    assert iobs(doc) == [3, 3, 1, 2, 2]
    assert ent_types(doc) == [10, 11, 11, 0, 0]

    # add ents, invalid IOB repaired
    doc = fresh_doc()
    doc.set_ents(two_ents(doc))
    doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
    assert iobs(doc) == [3, 1, 3, 2, 2]
    assert ent_types(doc) == [12, 12, 11, 0, 0]

    # missing ents
    doc = fresh_doc()
    doc.set_ents(two_ents(doc), missing=[doc[4:5]])
    assert iobs(doc) == [3, 3, 1, 2, 0]
    assert ent_types(doc) == [10, 11, 11, 0, 0]

    # outside ents
    doc = fresh_doc()
    doc.set_ents(two_ents(doc), outside=[doc[4:5]], default="missing")
    assert iobs(doc) == [3, 3, 1, 0, 2]
    assert ent_types(doc) == [10, 11, 11, 0, 0]

    # blocked ents
    doc = fresh_doc()
    doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
    assert iobs(doc) == [0, 3, 0, 3, 3]
    assert ent_types(doc) == [0] * 5
    assert doc.ents == ()

    # invalid IOB repaired after blocked (continues with the same doc)
    doc.ents = [Span(doc, 3, 5, "ENT")]
    assert iobs(doc) == [2, 2, 2, 3, 1]
    doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
    assert iobs(doc) == [2, 2, 2, 3, 3]

    # all types
    doc = fresh_doc()
    doc.set_ents(
        [Span(doc, 0, 1, 10)],
        blocked=[doc[1:2]],
        missing=[doc[2:3]],
        outside=[doc[3:4]],
        default="unmodified",
    )
    assert iobs(doc) == [3, 3, 0, 2, 0]
    assert ent_types(doc) == [10, 0, 0, 0, 0]

    doc = fresh_doc()
    # single span instead of a list
    with pytest.raises(ValueError):
        doc.set_ents([], missing=doc[1:2])
    # invalid default mode
    with pytest.raises(ValueError):
        doc.set_ents([], missing=[doc[1:2]], default="none")
    # conflicting/overlapping specifications
    with pytest.raises(ValueError):
        doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
def test_doc_ents_setter():
    """Test that entities can be set both in tuple format via doc.ents, using
    either strings or integers as labels, and as IOB tags via Doc.__init__."""
    words = ["a", "b", "c", "d", "e"]

    # tuple format: one label given as a string, one as an integer ID
    doc = Doc(Vocab(), words=words)
    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]

    # IOB tags passed to the constructor, on a fresh vocab
    vocab = Vocab()
    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
    doc = Doc(vocab, words=words, ents=ents)
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
    """Assign Token.morph from other tokens, within and across docs."""
    en_doc = en_tokenizer("a b")
    other_en_doc = en_tokenizer("c d")
    de_doc = de_tokenizer("a b")

    # unset values can be copied
    en_doc[0].morph = en_doc[1].morph
    for tok in (en_doc[0], en_doc[1]):
        assert tok.morph.key == 0

    # morph values from the same vocab can be copied
    en_doc[0].set_morph("Feat=Val")
    en_doc[1].morph = en_doc[0].morph
    assert en_doc[0].morph == en_doc[1].morph

    # ... also across docs
    other_en_doc[0].morph = en_doc[0].morph
    assert en_doc[0].morph == other_en_doc[0].morph

    de_doc[0].set_morph("Feat2=Val2")
    # the morph value must come from the same vocab
    with pytest.raises(ValueError):
        en_doc[0].morph = de_doc[0].morph
def test_doc_init_iob():
    """Test ents validation/normalization in Doc.__init__"""
    words = ["a", "b", "c", "d", "e"]

    all_outside = Doc(Vocab(), words=words, ents=["O"] * len(words))
    assert all_outside.ents == ()

    # (IOB tags, number of resulting entities)
    accepted = [
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"], 2),
        (["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"], 3),
        # None is missing
        (["B-PERSON", "I-PERSON", "O", None, "I-GPE"], 2),
        # empty tag is missing
        (["", "B-PERSON", "O", "B-PERSON", "I-PERSON"], 2),
    ]
    for tags, n_ents in accepted:
        doc = Doc(Vocab(), words=words, ents=tags)
        assert len(doc.ents) == n_ents

    rejected = [
        # invalid IOB
        ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no dash
        ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"],
        # no ent type
        ["O", "B-", "O", "I-PERSON", "I-GPE"],
        # not strings or None
        [0, "B-", "O", "I-PERSON", "I-GPE"],
    ]
    for tags in rejected:
        with pytest.raises(ValueError):
            Doc(Vocab(), words=words, ents=tags)
def test_doc_set_ents_invalid_spans(en_tokenizer):
    """Spans created before a retokenizer merge are rejected by the ents setter."""
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    colombia = Span(doc, 3, 4, label="GPE")
    czech_republic = Span(doc, 6, 8, label="GPE")
    stale_spans = [colombia, czech_republic]

    with doc.retokenize() as retokenizer:
        for stale in stale_spans:
            retokenizer.merge(stale)

    with pytest.raises(IndexError):
        doc.ents = stale_spans
def test_doc_noun_chunks_not_implemented():
    """Test that a language without noun_chunk iterator, throws a NotImplementedError"""
    czech_text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
    pipeline = MultiLanguage()
    parsed = pipeline(czech_text)
    with pytest.raises(NotImplementedError):
        list(parsed.noun_chunks)
def test_span_groups(en_tokenizer):
    """Create, grow and delete a span group stored under doc.spans."""
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    group_key = "hi"

    # the group is stored under its key, not under the span label
    doc.spans[group_key] = [Span(doc, 3, 4, label="bye")]
    assert group_key in doc.spans
    assert "bye" not in doc.spans
    assert len(doc.spans[group_key]) == 1
    assert doc.spans[group_key][0].label_ == "bye"

    # append one more span
    doc.spans[group_key].append(doc[0:3])
    assert len(doc.spans[group_key]) == 2
    assert doc.spans[group_key][1].text == "Some text about"
    group_texts = [span.text for span in doc.spans[group_key]]
    assert group_texts == ["Colombia", "Some text about"]
    assert not doc.spans[group_key].has_overlap

    # extend with the doc's entities
    doc.ents = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
    doc.spans[group_key].extend(doc.ents)
    assert len(doc.spans[group_key]) == 4
    group_labels = [span.label_ for span in doc.spans[group_key]]
    assert group_labels == ["bye", "", "GPE", "GPE"]
    assert doc.spans[group_key].has_overlap

    del doc.spans[group_key]
    assert group_key not in doc.spans
def test_doc_spans_copy(en_tokenizer):
    """The spans container of a doc and of its copy each reference their own doc."""
    original = en_tokenizer("Some text about Colombia and the Czech Republic")
    assert weakref.ref(original) == original.spans.doc_ref

    duplicate = original.copy()
    assert weakref.ref(duplicate) == duplicate.spans.doc_ref