spaCy/spacy/tests/serialize/test_serialize_doc.py

import copy
import pickle

import numpy
import pytest

from spacy.attrs import DEP, HEAD
from spacy.lang.en import English
from spacy.language import Language
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vectors import Vectors
from spacy.vocab import Vocab

from ..util import make_tempdir


@pytest.mark.issue(1727)
def test_issue1727():
    """Check that a tagger created without pretrained vectors keeps
    ``pretrained_dims`` at 0 after vectors are attached and it is reloaded
    from disk."""
    pipeline = Language(Vocab())
    ones = numpy.ones((3, 300), dtype="f")
    word_vectors = Vectors(data=ones, keys=["I", "am", "Matt"])
    original = pipeline.create_pipe("tagger")
    original.add_label("PRP")
    # Before any vectors are attached the config reports no pretrained dims.
    assert original.cfg.get("pretrained_dims", 0) == 0
    original.vocab.vectors = word_vectors
    with make_tempdir() as tmp_path:
        original.to_disk(tmp_path)
        restored = pipeline.create_pipe("tagger")
        restored = restored.from_disk(tmp_path)
        # The reloaded component must still report zero pretrained dims.
        assert restored.cfg.get("pretrained_dims", 0) == 0


@pytest.mark.issue(1799)
def test_issue1799():
    """Check that loading HEAD/DEP values from an array yields a single
    sentence, even for a non-projective parse."""
    words = "Just what I was looking for .".split()
    # One [HEAD, DEP] row per token. The last two HEAD values are 2**64 - 1
    # and 2**64 - 2; presumably negative relative offsets (-1, -2) stored as
    # unsigned 64-bit integers -- TODO confirm.
    rows = [
        [1, 397],
        [4, 436],
        [2, 426],
        [1, 402],
        [0, 8206900633647566924],
        [18446744073709551615, 440],
        [18446744073709551614, 442],
    ]
    annotations = numpy.asarray(rows, dtype="uint64")
    doc = Doc(Vocab(), words=words)
    doc.vocab.strings.add("ROOT")
    parsed = doc.from_array([HEAD, DEP], annotations)
    sentences = list(parsed.sents)
    assert len(sentences) == 1


@pytest.mark.issue(1834)
def test_issue1834():
    """Check that sentence starts and the presence/absence of DEP and TAG
    annotation survive a to_bytes/from_bytes round trip."""
    tokens = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
    n_tokens = len(tokens)

    # Case 1: only a manually set sentence boundary, no parse or tags.
    plain = Doc(Vocab(), words=tokens)
    plain[6].is_sent_start = True
    plain_payload = plain.to_bytes()
    plain_restored = Doc(plain.vocab).from_bytes(plain_payload)
    assert plain_restored[6].sent_start
    for attr in ("DEP", "TAG"):
        assert not plain_restored.has_annotation(attr)

    # Case 2: tags, heads and deps are supplied at construction time.
    annotated = Doc(
        Vocab(),
        words=tokens,
        tags=["TAG"] * n_tokens,
        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
        deps=["dep"] * n_tokens,
    )
    annotated_payload = annotated.to_bytes()
    annotated_restored = Doc(annotated.vocab).from_bytes(annotated_payload)
    assert annotated_restored[6].sent_start
    for attr in ("DEP", "TAG"):
        assert annotated_restored.has_annotation(attr)


@pytest.mark.issue(1883)
def test_issue1883():
    """Check that a deep-copied Matcher still produces exactly one match."""
    original = Matcher(Vocab())
    original.add("pat1", [[{"orth": "hello"}]])
    original_doc = Doc(original.vocab, words=["hello"])
    original_matches = original(original_doc)
    assert len(original_matches) == 1

    # The copy is exercised with a Doc built on the copy's own vocab.
    clone = copy.deepcopy(original)
    clone_doc = Doc(clone.vocab, words=["hello"])
    clone_matches = clone(clone_doc)
    assert len(clone_matches) == 1


@pytest.mark.issue(2564)
def test_issue2564():
    """Check that has_annotation("TAG") is true both for a direct call to the
    pipeline and for the first Doc yielded by Language.pipe."""
    pipeline = Language()
    pipeline.add_pipe("tagger").add_label("A")
    pipeline.initialize()

    # Direct call on a single text.
    direct = pipeline("hello world")
    assert direct.has_annotation("TAG")

    # Streaming call: only the first yielded Doc is inspected.
    stream = pipeline.pipe(["hello", "world"])
    first_streamed = next(stream)
    assert first_streamed.has_annotation("TAG")


@pytest.mark.issue(3248)
def test_issue3248_2():
    """Check that a PhraseMatcher keeps its length across a pickle round trip."""
    nlp = English()
    original = PhraseMatcher(nlp.vocab)
    # Pattern key -> texts to turn into pattern Docs (added in this order).
    pattern_texts = {"TEST1": ["a", "b", "c"], "TEST2": ["d"]}
    for key, texts in pattern_texts.items():
        original.add(key, [nlp(text) for text in texts])
    restored = pickle.loads(pickle.dumps(original))
    assert len(restored) == len(original)


@pytest.mark.issue(3289)
def test_issue3289():
    """Check that a pipeline whose textcat component was never initialized
    can be serialized with to_bytes and loaded with from_bytes."""

    def make_pipeline():
        # English pipeline with a textcat component added but not initialized.
        pipeline = English()
        pipeline.add_pipe("textcat")
        return pipeline

    source = make_pipeline()
    payload = source.to_bytes()
    target = make_pipeline()
    # No assertion: the test passes as long as loading does not raise.
    target.from_bytes(payload)


@pytest.mark.issue(3468)
def test_issue3468():
    """Check that sentence-start annotation set by the sentencizer is still
    present, via has_annotation("SENT_START"), after a bytes round trip."""

    def check_single_sentence(candidate):
        # Same three checks are applied before and after serialization.
        assert candidate[0].is_sent_start
        assert candidate.has_annotation("SENT_START")
        assert len(list(candidate.sents)) == 1

    nlp = English()
    nlp.add_pipe("sentencizer")
    original = nlp("Hello world")
    check_single_sentence(original)

    payload = original.to_bytes()
    restored = Doc(nlp.vocab).from_bytes(payload)
    check_single_sentence(restored)


@pytest.mark.issue(3959)
def test_issue3959():
    """Check that a manually assigned pos_ value survives to_disk/from_disk."""
    nlp = English()
    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    doc = nlp(text)
    first_token = doc[0]
    # The blank pipeline leaves pos_ empty, so the value is set by hand.
    assert first_token.pos_ == ""
    first_token.pos_ = "NOUN"
    assert first_token.pos_ == "NOUN"
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "my_doc"
        doc.to_disk(target)
        # Load into a Doc made from empty text, using the same pipeline.
        reloaded = nlp("")
        reloaded.from_disk(target)
        assert reloaded[0].pos_ == "NOUN"


def test_serialize_empty_doc(en_vocab):
    """Check that a Doc with no tokens round-trips through bytes with the
    same length and the same token texts."""
    empty = Doc(en_vocab)
    payload = empty.to_bytes()
    restored = Doc(en_vocab)
    restored.from_bytes(payload)
    assert len(empty) == len(restored)
    # Pairwise comparison of token texts, in document order.
    for original_token, restored_token in zip(empty, restored):
        assert original_token.text == restored_token.text


def test_serialize_doc_roundtrip_bytes(en_vocab):
    """Check that re-serializing a Doc loaded from bytes reproduces the
    original bytes; the Doc carries a ``cats`` entry."""
    original = Doc(en_vocab, words=["hello", "world"])
    original.cats = {"A": 0.5}
    payload = original.to_bytes()
    restored = Doc(en_vocab).from_bytes(payload)
    reserialized = restored.to_bytes()
    assert reserialized == payload


def test_serialize_doc_roundtrip_disk(en_vocab):
    """Check that a Doc written to disk and read back serializes to the same
    bytes as the original; the location is the ``tmp_dir / "doc"`` object."""
    original = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "doc"
        original.to_disk(target)
        restored = Doc(en_vocab).from_disk(target)
        expected = original.to_bytes()
        assert expected == restored.to_bytes()


def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
    """Check that the disk round trip also works when the location is passed
    as a plain string."""
    original = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        # Convert to str so to_disk/from_disk receive a string location.
        target = str(tmp_dir / "doc")
        original.to_disk(target)
        restored = Doc(en_vocab).from_disk(target)
        expected = original.to_bytes()
        assert expected == restored.to_bytes()


def test_serialize_doc_exclude(en_vocab):
    """Check that ``exclude=["user_data"]`` drops user data whether it is
    passed to from_bytes or to to_bytes, and that it is kept otherwise."""
    source = Doc(en_vocab, words=["hello", "world"])
    source.user_data["foo"] = "bar"

    # Baseline: nothing excluded, user data is preserved.
    kept = Doc(en_vocab).from_bytes(source.to_bytes())
    assert kept.user_data["foo"] == "bar"

    # Excluded on the loading side.
    dropped_on_load = Doc(en_vocab).from_bytes(
        source.to_bytes(), exclude=["user_data"]
    )
    assert not dropped_on_load.user_data

    # Excluded on the saving side.
    trimmed_payload = source.to_bytes(exclude=["user_data"])
    dropped_on_save = Doc(en_vocab).from_bytes(trimmed_payload)
    assert not dropped_on_save.user_data


def test_serialize_doc_span_groups(en_vocab):
    """Check that a span group stored under ``doc.spans`` keeps its single
    span after a bytes round trip."""
    original = Doc(en_vocab, words=["hello", "world", "!"])
    first_two_tokens = original[0:2]
    original.spans["content"] = [first_two_tokens]
    payload = original.to_bytes()
    restored = Doc(en_vocab).from_bytes(payload)
    restored_group = restored.spans["content"]
    assert len(restored_group) == 1
Migrate regression tests into the main test suite (#9655) * Migrate regressions 1-1000 * Move serialize test to correct file * Remove tests that won't work in v3 * Migrate regressions 1000-1500 Removed regression test 1250 because v3 doesn't support the old LEX scheme anymore. * Add missing imports in serializer tests * Migrate tests 1500-2000 * Migrate regressions from 2000-2500 * Migrate regressions from 2501-3000 * Migrate regressions from 3000-3501 * Migrate regressions from 3501-4000 * Migrate regressions from 4001-4500 * Migrate regressions from 4501-5000 * Migrate regressions from 5001-5501 * Migrate regressions from 5501 to 7000 * Migrate regressions from 7001 to 8000 * Migrate remaining regression tests * Fixing missing imports * Update docs with new system [ci skip] * Update CONTRIBUTING.md - Fix formatting - Update wording * Remove lemmatizer tests in el lang * Move a few tests into the general tokenizer * Separate Doc and DocBin tests 2021-12-04 19:34:48 +00:00			`import copy`
			`import pickle`

			`import numpy`
small UX fix for DocBin (#6167) * add informative warning when messing up store_user_data DocBin flags * add informative warning when messing up store_user_data DocBin flags * cleanup test * rename to patterns_path 2020-10-02 13:43:32 +00:00			`import pytest`

Migrate regression tests into the main test suite (#9655) * Migrate regressions 1-1000 * Move serialize test to correct file * Remove tests that won't work in v3 * Migrate regressions 1000-1500 Removed regression test 1250 because v3 doesn't support the old LEX scheme anymore. * Add missing imports in serializer tests * Migrate tests 1500-2000 * Migrate regressions from 2000-2500 * Migrate regressions from 2501-3000 * Migrate regressions from 3000-3501 * Migrate regressions from 3501-4000 * Migrate regressions from 4001-4500 * Migrate regressions from 4501-5000 * Migrate regressions from 5001-5501 * Migrate regressions from 5501 to 7000 * Migrate regressions from 7001 to 8000 * Migrate remaining regression tests * Fixing missing imports * Update docs with new system [ci skip] * Update CONTRIBUTING.md - Fix formatting - Update wording * Remove lemmatizer tests in el lang * Move a few tests into the general tokenizer * Separate Doc and DocBin tests 2021-12-04 19:34:48 +00:00			`from spacy.attrs import DEP, HEAD`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 12:48:45 +00:00			`from spacy.lang.en import English`
Migrate regression tests into the main test suite (#9655) * Migrate regressions 1-1000 * Move serialize test to correct file * Remove tests that won't work in v3 * Migrate regressions 1000-1500 Removed regression test 1250 because v3 doesn't support the old LEX scheme anymore. * Add missing imports in serializer tests * Migrate tests 1500-2000 * Migrate regressions from 2000-2500 * Migrate regressions from 2501-3000 * Migrate regressions from 3000-3501 * Migrate regressions from 3501-4000 * Migrate regressions from 4001-4500 * Migrate regressions from 4501-5000 * Migrate regressions from 5001-5501 * Migrate regressions from 5501 to 7000 * Migrate regressions from 7001 to 8000 * Migrate remaining regression tests * Fixing missing imports * Update docs with new system [ci skip] * Update CONTRIBUTING.md - Fix formatting - Update wording * Remove lemmatizer tests in el lang * Move a few tests into the general tokenizer * Separate Doc and DocBin tests 2021-12-04 19:34:48 +00:00			`from spacy.language import Language`
			`from spacy.matcher import Matcher, PhraseMatcher`
			`from spacy.tokens import Doc`
			`from spacy.vectors import Vectors`
			`from spacy.vocab import Vocab`
Ensure path in Doc.to_disk/from_disk (resolves #1521). Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-24 21:38:44 +00:00			`from ..util import make_tempdir`


Migrate regression tests into the main test suite (#9655) * Migrate regressions 1-1000 * Move serialize test to correct file * Remove tests that won't work in v3 * Migrate regressions 1000-1500 Removed regression test 1250 because v3 doesn't support the old LEX scheme anymore. * Add missing imports in serializer tests * Migrate tests 1500-2000 * Migrate regressions from 2000-2500 * Migrate regressions from 2501-3000 * Migrate regressions from 3000-3501 * Migrate regressions from 3501-4000 * Migrate regressions from 4001-4500 * Migrate regressions from 4501-5000 * Migrate regressions from 5001-5501 * Migrate regressions from 5501 to 7000 * Migrate regressions from 7001 to 8000 * Migrate remaining regression tests * Fixing missing imports * Update docs with new system [ci skip] * Update CONTRIBUTING.md - Fix formatting - Update wording * Remove lemmatizer tests in el lang * Move a few tests into the general tokenizer * Separate Doc and DocBin tests 2021-12-04 19:34:48 +00:00			`@pytest.mark.issue(1727)`
			`def test_issue1727():`
			`"""Test that models with no pretrained vectors can be deserialized`
			`correctly after vectors are added."""`
			`nlp = Language(Vocab())`
			`data = numpy.ones((3, 300), dtype="f")`
			`vectors = Vectors(data=data, keys=["I", "am", "Matt"])`
			`tagger = nlp.create_pipe("tagger")`
			`tagger.add_label("PRP")`
			`assert tagger.cfg.get("pretrained_dims", 0) == 0`
			`tagger.vocab.vectors = vectors`
			`with make_tempdir() as path:`
			`tagger.to_disk(path)`
			`tagger = nlp.create_pipe("tagger").from_disk(path)`
			`assert tagger.cfg.get("pretrained_dims", 0) == 0`


			`@pytest.mark.issue(1799)`
			`def test_issue1799():`
			`"""Test sentence boundaries are deserialized correctly, even for`
			`non-projective sentences."""`
			`heads_deps = numpy.asarray(`
			`[`
			`[1, 397],`
			`[4, 436],`
			`[2, 426],`
			`[1, 402],`
			`[0, 8206900633647566924],`
			`[18446744073709551615, 440],`
			`[18446744073709551614, 442],`
			`],`
			`dtype="uint64",`
			`)`
			`doc = Doc(Vocab(), words="Just what I was looking for .".split())`
			`doc.vocab.strings.add("ROOT")`
			`doc = doc.from_array([HEAD, DEP], heads_deps)`
			`assert len(list(doc.sents)) == 1`


			`@pytest.mark.issue(1834)`
			`def test_issue1834():`
			`"""Test that sentence boundaries & parse/tag flags are not lost`
			`during serialization."""`
			`words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]`
			`doc = Doc(Vocab(), words=words)`
			`doc[6].is_sent_start = True`
			`new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())`
			`assert new_doc[6].sent_start`
			`assert not new_doc.has_annotation("DEP")`
			`assert not new_doc.has_annotation("TAG")`
			`doc = Doc(`
			`Vocab(),`
			`words=words,`
			`tags=["TAG"] * len(words),`
			`heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],`
			`deps=["dep"] * len(words),`
			`)`
			`new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())`
			`assert new_doc[6].sent_start`
			`assert new_doc.has_annotation("DEP")`
			`assert new_doc.has_annotation("TAG")`


			`@pytest.mark.issue(1883)`
			`def test_issue1883():`
			`matcher = Matcher(Vocab())`
			`matcher.add("pat1", [[{"orth": "hello"}]])`
			`doc = Doc(matcher.vocab, words=["hello"])`
			`assert len(matcher(doc)) == 1`
			`new_matcher = copy.deepcopy(matcher)`
			`new_doc = Doc(new_matcher.vocab, words=["hello"])`
			`assert len(new_matcher(new_doc)) == 1`


			`@pytest.mark.issue(2564)`
			`def test_issue2564():`
			`"""Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""`
			`nlp = Language()`
			`tagger = nlp.add_pipe("tagger")`
			`tagger.add_label("A")`
			`nlp.initialize()`
			`doc = nlp("hello world")`
			`assert doc.has_annotation("TAG")`
			`docs = nlp.pipe(["hello", "world"])`
			`piped_doc = next(docs)`
			`assert piped_doc.has_annotation("TAG")`


			`@pytest.mark.issue(3248)`
			`def test_issue3248_2():`
			`"""Test that the PhraseMatcher can be pickled correctly."""`
			`nlp = English()`
			`matcher = PhraseMatcher(nlp.vocab)`
			`matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])`
			`matcher.add("TEST2", [nlp("d")])`
			`data = pickle.dumps(matcher)`
			`new_matcher = pickle.loads(data)`
			`assert len(new_matcher) == len(matcher)`


			`@pytest.mark.issue(3289)`
			`def test_issue3289():`
			`"""Test that Language.to_bytes handles serializing a pipeline component`
			`with an uninitialized model."""`
			`nlp = English()`
			`nlp.add_pipe("textcat")`
			`bytes_data = nlp.to_bytes()`
			`new_nlp = English()`
			`new_nlp.add_pipe("textcat")`
			`new_nlp.from_bytes(bytes_data)`


			`@pytest.mark.issue(3468)`
			`def test_issue3468():`
			`"""Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can`
			`be restored after serialization."""`
			`nlp = English()`
			`nlp.add_pipe("sentencizer")`
			`doc = nlp("Hello world")`
			`assert doc[0].is_sent_start`
			`assert doc.has_annotation("SENT_START")`
			`assert len(list(doc.sents)) == 1`
			`doc_bytes = doc.to_bytes()`
			`new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)`
			`assert new_doc[0].is_sent_start`
			`assert new_doc.has_annotation("SENT_START")`
			`assert len(list(new_doc.sents)) == 1`


			`@pytest.mark.issue(3959)`
			`def test_issue3959():`
			`"""Ensure that a modified pos attribute is serialized correctly."""`
			`nlp = English()`
			`doc = nlp(`
			`"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"`
			`)`
			`assert doc[0].pos_ == ""`
			`doc[0].pos_ = "NOUN"`
			`assert doc[0].pos_ == "NOUN"`
			`# usually this is already True when starting from proper models instead of blank English`
			`with make_tempdir() as tmp_dir:`
			`file_path = tmp_dir / "my_doc"`
			`doc.to_disk(file_path)`
			`doc2 = nlp("")`
			`doc2.from_disk(file_path)`
			`assert doc2[0].pos_ == "NOUN"`


💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-24 21:38:44 +00:00			`def test_serialize_empty_doc(en_vocab):`
			`doc = Doc(en_vocab)`
			`data = doc.to_bytes()`
			`doc2 = Doc(en_vocab)`
			`doc2.from_bytes(data)`
			`assert len(doc) == len(doc2)`
			`for token1, token2 in zip(doc, doc2):`
			`assert token1.text == token2.text`
Ensure path in Doc.to_disk/from_disk (resolves #1521). Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00

			`def test_serialize_doc_roundtrip_bytes(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`doc = Doc(en_vocab, words=["hello", "world"])`
Include Doc.cats in serialization of Doc and DocBin (#4774) * Include Doc.cats in to_bytes() * Include Doc.cats in DocBin serialization * Add tests for serialization of cats Test serialization of cats for Doc and DocBin. 2019-12-06 13:07:39 +00:00			`doc.cats = {"A": 0.5}`
Ensure path in Doc.to_disk/from_disk (resolves ##1521) Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00			`doc_b = doc.to_bytes()`
			`new_doc = Doc(en_vocab).from_bytes(doc_b)`
			`assert new_doc.to_bytes() == doc_b`


			`def test_serialize_doc_roundtrip_disk(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`doc = Doc(en_vocab, words=["hello", "world"])`
Ensure path in Doc.to_disk/from_disk (resolves ##1521) Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00			`with make_tempdir() as d:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`file_path = d / "doc"`
Ensure path in Doc.to_disk/from_disk (resolves ##1521) Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00			`doc.to_disk(file_path)`
			`doc_d = Doc(en_vocab).from_disk(file_path)`
			`assert doc.to_bytes() == doc_d.to_bytes()`


			`def test_serialize_doc_roundtrip_disk_str_path(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`doc = Doc(en_vocab, words=["hello", "world"])`
Ensure path in Doc.to_disk/from_disk (resolves ##1521) Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00			`with make_tempdir() as d:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 00:09:36 +00:00			`file_path = d / "doc"`
Drop Python 2.7 and 3.5 (#4828) * Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip] 2019-12-22 00:53:56 +00:00			`file_path = str(file_path)`
Ensure path in Doc.to_disk/from_disk (resolves ##1521) Also add Doc serialization tests with both Path and string path options 2017-11-09 01:29:03 +00:00			`doc.to_disk(file_path)`
			`doc_d = Doc(en_vocab).from_disk(file_path)`
			`assert doc.to_bytes() == doc_d.to_bytes()`
💫 Make serialization methods consistent (#3385) * Make serialization methods consistent exclude keyword argument instead of random named keyword arguments and deprecation handling * Update docs and add section on serialization fields 2019-03-10 18:16:45 +00:00

			`def test_serialize_doc_exclude(en_vocab):`
			`doc = Doc(en_vocab, words=["hello", "world"])`
			`doc.user_data["foo"] = "bar"`
			`new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())`
			`assert new_doc.user_data["foo"] == "bar"`
			`new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])`
			`assert not new_doc.user_data`
			`new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))`
			`assert not new_doc.user_data`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 12:48:45 +00:00

Add SpanGroup and Graph container types to represent arbitrary annotations (#6696) * Draft out initial Spans data structure * Initial span group commit * Basic span group support on Doc * Basic test for span group * Compile span_group.pyx * Draft addition of SpanGroup to DocBin * Add deserialization for SpanGroup * Add tests for serializing SpanGroup * Fix serialization of SpanGroup * Add EdgeC and GraphC structs * Add draft Graph data structure * Compile graph * More work on Graph * Update GraphC * Upd graph * Fix walk functions * Let Graph take nodes and edges on construction * Fix walking and getting * Add graph tests * Fix import * Add module with the SpanGroups dict thingy * Update test * Rename 'span_groups' attribute * Try to fix c++11 compilation * Fix test * Update DocBin * Try to fix compilation * Try to fix graph * Improve SpanGroup docstrings * Add doc.spans to documentation * Fix serialization * Tidy up and add docs * Update docs [ci skip] * Add SpanGroup.has_overlap * WIP updated Graph API * Start testing new Graph API * Update Graph tests * Update Graph * Add docstring Co-authored-by: Ines Montani <ines@ines.io> 2021-01-14 06:30:41 +00:00			`def test_serialize_doc_span_groups(en_vocab):`
			`doc = Doc(en_vocab, words=["hello", "world", "!"])`
			`doc.spans["content"] = [doc[0:2]]`
			`new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())`
			`assert len(new_doc.spans["content"]) == 1`