spaCy/spacy/tests/serialize/test_serialize_doc.py

226 lines
7.2 KiB
Python

import copy
import pickle
import numpy
import pytest
from spacy.attrs import DEP, HEAD
from spacy.lang.en import English
from spacy.language import Language
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from ..util import make_tempdir
@pytest.mark.issue(1727)
def test_issue1727():
    """Check that a tagger saved after vectors were attached to its vocab
    can be loaded again and still reports no pretrained dimensions."""
    nlp = Language(Vocab())
    tagger = nlp.create_pipe("tagger")
    tagger.add_label("PRP")
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    # Attach the vectors only after the component has been created.
    vector_table = numpy.ones((3, 300), dtype="f")
    tagger.vocab.vectors = Vectors(data=vector_table, keys=["I", "am", "Matt"])
    with make_tempdir() as path:
        tagger.to_disk(path)
        # Load into a freshly created component, not the one just saved.
        tagger = nlp.create_pipe("tagger").from_disk(path)
        assert tagger.cfg.get("pretrained_dims", 0) == 0
@pytest.mark.issue(1799)
def test_issue1799():
    """Check that loading HEAD/DEP values from an array yields a single
    sentence, even when the parse is non-projective."""
    tokens = "Just what I was looking for .".split()
    # One row per token; columns follow the [HEAD, DEP] order passed to
    # from_array below. 18446744073709551615 and 18446744073709551614 are
    # -1 and -2 wrapped around to uint64.
    head_dep_rows = [
        [1, 397],
        [4, 436],
        [2, 426],
        [1, 402],
        [0, 8206900633647566924],
        [18446744073709551615, 440],
        [18446744073709551614, 442],
    ]
    heads_deps = numpy.asarray(head_dep_rows, dtype="uint64")
    doc = Doc(Vocab(), words=tokens)
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)
    sentences = list(doc.sents)
    assert len(sentences) == 1
@pytest.mark.issue(1834)
def test_issue1834():
    """Check that sentence starts and the presence or absence of DEP/TAG
    annotation survive a to_bytes/from_bytes round trip."""
    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]

    # Case 1: only a manual sentence start, no tags and no parse.
    plain_doc = Doc(Vocab(), words=words)
    plain_doc[6].is_sent_start = True
    restored = Doc(plain_doc.vocab).from_bytes(plain_doc.to_bytes())
    assert restored[6].sent_start
    assert not restored.has_annotation("DEP")
    assert not restored.has_annotation("TAG")

    # Case 2: tags, heads and dependency labels supplied at construction.
    annotated_doc = Doc(
        Vocab(),
        words=words,
        tags=["TAG" for _ in words],
        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
        deps=["dep" for _ in words],
    )
    restored = Doc(annotated_doc.vocab).from_bytes(annotated_doc.to_bytes())
    assert restored[6].sent_start
    assert restored.has_annotation("DEP")
    assert restored.has_annotation("TAG")
@pytest.mark.issue(1883)
def test_issue1883():
    """Check that a deep copy of a Matcher still finds the pattern that was
    added to the original."""
    matcher = Matcher(Vocab())
    matcher.add("pat1", [[{"orth": "hello"}]])
    original_matches = matcher(Doc(matcher.vocab, words=["hello"]))
    assert len(original_matches) == 1

    # The copy is matched against a Doc built on the copy's own vocab.
    new_matcher = copy.deepcopy(matcher)
    copied_matches = new_matcher(Doc(new_matcher.vocab, words=["hello"]))
    assert len(copied_matches) == 1
@pytest.mark.issue(2564)
def test_issue2564():
    """Check that has_annotation("TAG") is true for docs produced both by
    calling the pipeline directly and by Language.pipe."""
    nlp = Language()
    nlp.add_pipe("tagger").add_label("A")
    nlp.initialize()

    direct_doc = nlp("hello world")
    assert direct_doc.has_annotation("TAG")

    # Only the first doc yielded by the pipe is inspected.
    piped_doc = next(nlp.pipe(["hello", "world"]))
    assert piped_doc.has_annotation("TAG")
@pytest.mark.issue(3248)
def test_issue3248_2():
    """Check that a PhraseMatcher keeps its number of entries across a
    pickle round trip."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    first_patterns = [nlp(text) for text in ("a", "b", "c")]
    matcher.add("TEST1", first_patterns)
    matcher.add("TEST2", [nlp("d")])
    new_matcher = pickle.loads(pickle.dumps(matcher))
    assert len(new_matcher) == len(matcher)
@pytest.mark.issue(3289)
def test_issue3289():
    """Check that a pipeline whose textcat component was never initialized
    can be serialized with to_bytes and loaded with from_bytes."""
    source_nlp = English()
    source_nlp.add_pipe("textcat")
    payload = source_nlp.to_bytes()

    # No assertion: the test passes as long as loading does not raise.
    target_nlp = English()
    target_nlp.add_pipe("textcat")
    target_nlp.from_bytes(payload)
@pytest.mark.issue(3468)
def test_issue3468():
    """Check that sentence boundaries set by the sentencizer, and with them
    Doc.has_annotation("SENT_START"), are restored after serialization."""
    nlp = English()
    nlp.add_pipe("sentencizer")

    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.has_annotation("SENT_START")
    original_sents = list(doc.sents)
    assert len(original_sents) == 1

    # The same three checks must hold for the deserialized copy.
    new_doc = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert new_doc[0].is_sent_start
    assert new_doc.has_annotation("SENT_START")
    restored_sents = list(new_doc.sents)
    assert len(restored_sents) == 1
@pytest.mark.issue(3959)
def test_issue3959():
    """Check that a manually modified pos attribute is written to disk and
    read back correctly."""
    nlp = English()
    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    doc = nlp(text)
    # No tagger is added to the pipeline here, so pos_ is expected to start
    # out empty before it is set by hand.
    first_token = doc[0]
    assert first_token.pos_ == ""
    first_token.pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
        # Load into a doc created from empty text; from_disk fills it in place.
        doc2 = nlp("")
        doc2.from_disk(file_path)
        assert doc2[0].pos_ == "NOUN"
def test_serialize_empty_doc(en_vocab):
    """Check that a Doc built without words round-trips through bytes with
    the same length and the same token texts."""
    original = Doc(en_vocab)
    restored = Doc(en_vocab)
    restored.from_bytes(original.to_bytes())
    assert len(original) == len(restored)
    for source_token, loaded_token in zip(original, restored):
        assert source_token.text == loaded_token.text
def test_serialize_doc_roundtrip_bytes(en_vocab):
    """Check that re-serializing a Doc loaded from bytes produces the same
    bytes it was loaded from."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.cats = {"A": 0.5}
    serialized = doc.to_bytes()
    reloaded = Doc(en_vocab).from_bytes(serialized)
    assert reloaded.to_bytes() == serialized
def test_serialize_doc_roundtrip_disk(en_vocab):
    """Check that a Doc written to disk and read back serializes to the same
    bytes as the original."""
    doc = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "doc"
        doc.to_disk(target)
        loaded = Doc(en_vocab).from_disk(target)
        assert doc.to_bytes() == loaded.to_bytes()
def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
    """Check the disk round trip when the path is passed as a plain string
    instead of the path object built from the temp directory."""
    doc = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        # Convert to str so to_disk/from_disk are exercised with a string path.
        target = str(tmp_dir / "doc")
        doc.to_disk(target)
        loaded = Doc(en_vocab).from_disk(target)
        assert doc.to_bytes() == loaded.to_bytes()
def test_serialize_doc_exclude(en_vocab):
    """Check that user_data round-trips by default and is dropped when it is
    excluded on either the loading or the saving side."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"

    # Default: user_data is carried across.
    restored = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert restored.user_data["foo"] == "bar"

    # Excluded while loading.
    restored = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not restored.user_data

    # Excluded while saving.
    trimmed_bytes = doc.to_bytes(exclude=["user_data"])
    restored = Doc(en_vocab).from_bytes(trimmed_bytes)
    assert not restored.user_data
def test_serialize_doc_span_groups(en_vocab):
    """Check that a span stored in doc.spans keeps its label, id and kb_id
    after a bytes round trip."""
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    span = doc[0:2]
    span.label_ = "test_serialize_doc_span_groups_label"
    span.id_ = "test_serialize_doc_span_groups_id"
    span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
    doc.spans["content"] = [span]

    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert len(new_doc.spans["content"]) == 1
    # Each attribute is read through a fresh lookup of the restored group.
    assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
    assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
    assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"