spaCy/spacy/tests/serialize/test_io.py

# coding: utf-8
from __future__ import unicode_literals

from ...tokens import Doc
from ..util import get_doc

import pytest


def test_serialize_io_read_write(en_vocab, text_file_b):
    """Round-trip two docs through a single binary stream.

    Both docs are serialized with ``to_bytes()`` and written back to back
    into ``text_file_b``. After rewinding, ``Doc.read_bytes`` is expected to
    hand back exactly two payloads, each of which must deserialize to a doc
    with the same ``text_with_ws`` as the doc it came from.
    """
    word_lists = (
        ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."],
        ["This", "is", "another", "test", "document", "."],
    )
    originals = [get_doc(en_vocab, words) for words in word_lists]

    for original in originals:
        text_file_b.write(original.to_bytes())
    # Rewind so that reading starts from the first serialized doc.
    text_file_b.seek(0)

    # Unpacking into two names doubles as a check on the number of payloads.
    first_payload, second_payload = Doc.read_bytes(text_file_b)
    restored = [
        get_doc(en_vocab).from_bytes(payload)
        for payload in (first_payload, second_payload)
    ]

    for loaded, original in zip(restored, originals):
        assert loaded.text_with_ws == original.text_with_ws


def test_serialize_io_left_right(en_vocab):
    """Check that head/child links are consistent after a bytes round trip.

    For every token of the deserialized doc:
    * its head index matches the head index of the same token in the
      original doc;
    * if the token is not its own head, it is listed among its head's
      children;
    * every token in its ``lefts`` and ``rights`` names it as head.
    """
    words = ["This", "is", "a", "simple", "test", ".", "With", "a",  "couple", "of", "sentences", "."]
    original = get_doc(en_vocab, words)
    serialized = original.to_bytes()
    restored = Doc(en_vocab).from_bytes(serialized)

    for tok in restored:
        expected_head_index = original[tok.i].head.i
        assert tok.head.i == expected_head_index

        # A token that is its own head is skipped for the children check.
        if tok.head is not tok:
            child_indices = [child.i for child in tok.head.children]
            assert tok.i in child_indices

        assert all(dependent.head.i == tok.i for dependent in tok.lefts)
        assert all(dependent.head.i == tok.i for dependent in tok.rights)


@pytest.mark.models
def test_lemmas(EN):
    """Check that lemmas are still available after a bytes round trip.

    The sentence is processed with the ``EN`` fixture, serialized with
    ``to_bytes()`` and loaded into a fresh ``Doc`` built on the same vocab.
    Tokens 1-3 of the reloaded doc must carry the expected lemmas.
    """
    original = EN("The geese are flying")
    payload = original.to_bytes()
    restored = Doc(original.vocab).from_bytes(payload)

    # (token index, expected lemma) pairs, checked in token order.
    expected = ((1, 'goose'), (2, 'be'), (3, 'fly'))
    for index, lemma in expected:
        assert restored[index].lemma_ == lemma
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`
* Add test_io 2015-07-22 23:19:59 +00:00
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`from ...tokens import Doc`
			`from ..util import get_doc`
* Add test_io 2015-07-22 23:19:59 +00:00
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`import pytest`
* Add test_io 2015-07-22 23:19:59 +00:00
avoid writing to /tmp (not cross-platform compatible) 2015-12-16 18:53:06 +00:00
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`def test_serialize_io_read_write(en_vocab, text_file_b):`
			`text1 = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]`
			`text2 = ["This", "is", "another", "test", "document", "."]`
avoid writing to /tmp (not cross-platform compatible) 2015-12-16 18:53:06 +00:00
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`doc1 = get_doc(en_vocab, text1)`
			`doc2 = get_doc(en_vocab, text2)`
			`text_file_b.write(doc1.to_bytes())`
			`text_file_b.write(doc2.to_bytes())`
			`text_file_b.seek(0)`
			`bytes1, bytes2 = Doc.read_bytes(text_file_b)`
			`result1 = get_doc(en_vocab).from_bytes(bytes1)`
			`result2 = get_doc(en_vocab).from_bytes(bytes2)`
			`assert result1.text_with_ws == doc1.text_with_ws`
			`assert result2.text_with_ws == doc2.text_with_ws`
* Add test to check parse is being deserialized properly 2015-07-28 19:04:00 +00:00

Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`def test_serialize_io_left_right(en_vocab):`
			`text = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]`
			`doc = get_doc(en_vocab, text)`
			`result = Doc(en_vocab).from_bytes(doc.to_bytes())`
* Add test to check parse is being deserialized properly 2015-07-28 19:04:00 +00:00
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`for token in result:`
			`assert token.head.i == doc[token.i].head.i`
			`if token.head is not token:`
			`assert token.i in [w.i for w in token.head.children]`
			`for child in token.lefts:`
			`assert child.head.i == token.i`
			`for child in token.rights:`
			`assert child.head.i == token.i`
* Add test to check parse is being deserialized properly 2015-07-28 19:04:00 +00:00
* Ensure morphological features and lemmas are loaded in from_array, re Issue #152 2015-11-03 06:56:50 +00:00
			`@pytest.mark.models`
			`def test_lemmas(EN):`
Modernise serializer I/O tests and don't depend on models where possible 2017-01-13 01:24:56 +00:00			`text = "The geese are flying"`
			`doc = EN(text)`
			`result = Doc(doc.vocab).from_bytes(doc.to_bytes())`
			`assert result[1].lemma_ == 'goose'`
			`assert result[2].lemma_ == 'be'`
			`assert result[3].lemma_ == 'fly'`