# spaCy/spacy/tests/serialize/test_serialization.py

# coding: utf-8
from __future__ import unicode_literals

from ...serialize.packer import Packer
from ..util import get_doc, assert_docs_equal

import pytest


# Shared fixtures for the parametrized round-trip tests below.
# TEXT supplies the words handed to get_doc; TAGS and DEPS are passed as its
# `tags=` and `deps=` keyword arguments (one entry per word in TEXT).
TEXT = "This is a test sentence .".split()
TAGS = 'DT VBZ DT NN NN .'.split()
DEPS = 'nsubj ROOT det compound attr punct'.split()
# Passed as get_doc's `ents=` argument.
# NOTE(review): tuple layout looks like (text, label, start, end) -- confirm
# against get_doc in ..util.
ENTS = [
    ('hi', 'PERSON', 0, 1),
]


def test_serialize_empty_doc(en_vocab):
    """Check that an empty doc packs to ``b''`` and loads back with no tokens.

    A doc created from the vocab alone is packed with a ``Packer`` built from
    an empty second argument; the resulting bytes must be empty, and feeding
    them to ``from_bytes`` on a fresh doc must yield a doc of length zero.
    """
    empty_doc = get_doc(en_vocab)
    packed = Packer(en_vocab, {}).pack(empty_doc)
    assert packed == b''

    # Deserialize into a brand-new doc rather than reusing the packed one.
    restored = get_doc(en_vocab).from_bytes(packed)
    assert len(restored) == 0


@pytest.mark.parametrize('text', [TEXT])
def test_serialize_tokens(en_vocab, text):
    """Round-trip a doc built from plain words through to_bytes/from_bytes.

    The doc built from a copy of ``text`` is serialized, loaded into a fresh
    doc on the same vocab, and the two docs are compared with
    ``assert_docs_equal``.
    """
    words = list(text)  # copy so the shared module-level list is not handed out
    original = get_doc(en_vocab, words)
    target = get_doc(en_vocab)
    restored = target.from_bytes(original.to_bytes())
    assert_docs_equal(original, restored)


@pytest.mark.models
@pytest.mark.parametrize('text', [TEXT])
@pytest.mark.parametrize('tags', [TAGS, []])
@pytest.mark.parametrize('deps', [DEPS, []])
@pytest.mark.parametrize('ents', [ENTS, []])
def test_serialize_tokens_ner(EN, text, tags, deps, ents):
    """Round-trip an annotated doc through to_bytes/from_bytes.

    Parametrized over every combination of tags, deps and ents being either
    the module-level fixture or an empty list. The annotated doc is
    serialized, loaded into a fresh doc on ``EN.vocab``, and compared with
    ``assert_docs_equal``.
    """
    vocab = EN.vocab
    words = list(text)  # copy so the shared module-level list is not handed out
    original = get_doc(vocab, words, tags=tags, deps=deps, ents=ents)
    target = get_doc(vocab)
    restored = target.from_bytes(original.to_bytes())
    assert_docs_equal(original, restored)
Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`# coding: utf-8`
add tests for serialization bug 2016-05-02 09:01:56 +00:00			`from __future__ import unicode_literals`

Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`from ...serialize.packer import Packer`
			`from ..util import get_doc, assert_docs_equal`
add tests for serialization bug 2016-05-02 09:01:56 +00:00
Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`import pytest`
add tests for serialization bug 2016-05-02 09:01:56 +00:00

Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`TEXT = ["This", "is", "a", "test", "sentence", "."]`
			`TAGS = ['DT', 'VBZ', 'DT', 'NN', 'NN', '.']`
			`DEPS = ['nsubj', 'ROOT', 'det', 'compound', 'attr', 'punct']`
			`ENTS = [('hi', 'PERSON', 0, 1)]`
Test Issue #459: Fail to deserialize empty doc 2016-10-23 14:30:22 +00:00

Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`def test_serialize_empty_doc(en_vocab):`
			`doc = get_doc(en_vocab)`
			`packer = Packer(en_vocab, {})`
Test Issue #459: Fail to deserialize empty doc 2016-10-23 14:30:22 +00:00			`b = packer.pack(doc)`
			`assert b == b''`
Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`loaded = get_doc(en_vocab).from_bytes(b)`
Test Issue #459: Fail to deserialize empty doc 2016-10-23 14:30:22 +00:00			`assert len(loaded) == 0`
Test Issue #514: Serialization fails after adding a new entity label. 2016-10-23 14:40:27 +00:00

Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`@pytest.mark.parametrize('text', [TEXT])`
			`def test_serialize_tokens(en_vocab, text):`
			`doc1 = get_doc(en_vocab, [t for t in text])`
			`doc2 = get_doc(en_vocab).from_bytes(doc1.to_bytes())`
			`assert_docs_equal(doc1, doc2)`
Test Issue #514: Serializer fails when new entity type has been added. 2016-10-23 15:41:32 +00:00

			`@pytest.mark.models`
Modernise and merge serialization tests 2017-01-12 20:57:19 +00:00			`@pytest.mark.parametrize('text', [TEXT])`
			`@pytest.mark.parametrize('tags', [TAGS, []])`
			`@pytest.mark.parametrize('deps', [DEPS, []])`
			`@pytest.mark.parametrize('ents', [ENTS, []])`
			`def test_serialize_tokens_ner(EN, text, tags, deps, ents):`
			`doc1 = get_doc(EN.vocab, [t for t in text], tags=tags, deps=deps, ents=ents)`
			`doc2 = get_doc(EN.vocab).from_bytes(doc1.to_bytes())`
			`assert_docs_equal(doc1, doc2)`