# coding: utf-8
# spaCy/spacy/tests/serialize/test_serialization.py
from __future__ import unicode_literals
from ..util import get_doc, assert_docs_equal
from ...tokens import Doc
from ...vocab import Vocab

import pytest
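
# Shared annotations for the round-trip tests below: token texts, POS tags,
# dependency labels, and one entity annotation.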
TEXT = ["This", "is", "a", "test", "sentence", "."]
TAGS = ['DT', 'VBZ', 'DT', 'NN', 'NN', '.']
DEPS = ['nsubj', 'ROOT', 'det', 'compound', 'attr', 'punct']
ENTS = [('hi', 'PERSON', 0, 1)]
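

# An empty Doc should survive a to_bytes()/from_bytes() round trip:
# same length and identical token texts.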
def test_serialize_empty_doc(en_vocab):
    doc = get_doc(en_vocab)
    data = doc.to_bytes()
    doc2 = Doc(en_vocab)
    doc2.from_bytes(data)
    assert len(doc) == len(doc2)
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text
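

# Serializing the vocab should preserve its string store: a string added
# before to_bytes() must still resolve from its hash after from_bytes().
# Marked xfail because this round trip does not currently succeed.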
@pytest.mark.xfail
@pytest.mark.parametrize('text', ['rat'])
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)
    vocab_bytes = en_vocab.to_bytes()
    new_vocab = Vocab().from_bytes(vocab_bytes)
    assert new_vocab.strings[text_hash] == text
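

# The round-trip tests below are currently disabled.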
# @pytest.mark.parametrize('text', [TEXT])
# def test_serialize_tokens(en_vocab, text):
#     doc1 = get_doc(en_vocab, [t for t in text])
#     doc2 = get_doc(en_vocab).from_bytes(doc1.to_bytes())
#     assert_docs_equal(doc1, doc2)


# @pytest.mark.models
# @pytest.mark.parametrize('text', [TEXT])
# @pytest.mark.parametrize('tags', [TAGS, []])
# @pytest.mark.parametrize('deps', [DEPS, []])
# @pytest.mark.parametrize('ents', [ENTS, []])
# def test_serialize_tokens_ner(EN, text, tags, deps, ents):
#     doc1 = get_doc(EN.vocab, [t for t in text], tags=tags, deps=deps, ents=ents)
#     doc2 = get_doc(EN.vocab).from_bytes(doc1.to_bytes())
#     assert_docs_equal(doc1, doc2)