spaCy/spacy/tests/serialize/test_packer.py

from __future__ import unicode_literals

import re

import pytest
import numpy

from spacy.language import Language
from spacy.en import English
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
from os import path
import os

from spacy import util
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer

from spacy.serialize.bits import BitArray


@pytest.fixture
def vocab():
    """Build the English vocabulary shared by the packer tests.

    The data directory is taken from the ``SPACY_DATA`` environment variable
    when it is set and from ``util.get_data_path()`` otherwise. Either way it
    is resolved through ``util.match_best_version('en', None, ...)`` before
    being handed to ``English.Defaults``.
    """
    data_dir = os.environ.get('SPACY_DATA')
    if data_dir is None:
        data_dir = util.get_data_path()
    model_dir = util.match_best_version('en', None, data_dir)

    en_vocab = English.Defaults('en', model_dir).Vocab()
    # The lookups below discard their results; presumably they exist only to
    # make sure the entries are present in the vocab -- TODO confirm.
    en_vocab['dog']
    assert en_vocab[en_vocab.strings['dog']].orth_ == 'dog'
    for word in ('the', 'quick', 'jumped'):
        en_vocab[word]
    return en_vocab


@pytest.fixture
def tokenizer(vocab):
    """Return a ``Tokenizer`` over ``vocab`` driven by a single regex.

    The pattern is nine exclamation marks, which none of the strings used in
    this module contain. Its ``search`` is passed twice and its ``finditer``
    once, alongside an empty rules dict.
    """
    bang_run = re.compile(r'!!!!!!!!!')
    return Tokenizer(vocab, {}, bang_run.search, bang_run.search,
                     bang_run.finditer)


def test_char_packer(vocab):
    """Round-trip a byte string through the packer's ``char_codec``."""
    packer = Packer(vocab, [])
    buf = BitArray()
    buf.seek(0)

    original = bytearray(b'the dog jumped')
    packer.char_codec.encode(original, buf)

    # Seek back to position 0 so decoding starts where encoding started.
    buf.seek(0)
    decoded = [b''] * len(original)
    packer.char_codec.decode(buf, decoded)

    assert bytearray(decoded) == original


def test_packer_unannotated(tokenizer):
    """Pack and unpack a tokenized doc using a packer with no attribute
    frequencies, and check that the text is unchanged at each step."""
    # The packer is created before tokenizing, as in the original ordering.
    packer = Packer(tokenizer.vocab, [])

    text = u'the dog jumped'
    doc = tokenizer(text)
    assert doc.string == text

    restored = packer.unpack(packer.pack(doc))
    assert restored.string == text


@pytest.mark.models
def test_packer_annotated(tokenizer):
    """Pack and unpack a doc carrying TAG, DEP and HEAD values.

    The doc is annotated through ``from_array`` and the same text, tag,
    dependency-label and head-offset assertions are run both before packing
    and on the unpacked result.
    """
    vocab = tokenizer.vocab
    # Resolve the label strings to ids, in the same order as before.
    nn, dt, vbd, jj, det, nsubj, adj, root = [
        vocab.strings[label]
        for label in ('NN', 'DT', 'VBD', 'JJ', 'det', 'nsubj', 'adj', 'ROOT')
    ]

    tag_freqs = [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]
    dep_freqs = {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}
    head_freqs = {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}
    packer = Packer(vocab, [
        (TAG, tag_freqs),
        (DEP, dep_freqs.items()),
        (HEAD, head_freqs.items()),
    ])

    def check_annotations(doc):
        # Shared expectations for the annotated doc and its round-trip copy.
        assert doc.string == 'the dog jumped'
        assert [t.tag_ for t in doc] == ['DT', 'NN', 'VBD']
        assert [t.dep_ for t in doc] == ['det', 'nsubj', 'ROOT']
        assert [(t.head.i - t.i) for t in doc] == [1, 1, 0]

    msg = tokenizer(u'the dog jumped')
    # One row per token; columns follow the [TAG, DEP, HEAD] attribute list.
    annotations = numpy.array(
        [
            [dt, det, 1],
            [nn, nsubj, 1],
            [vbd, root, 0],
        ],
        dtype=numpy.int32)
    msg.from_array([TAG, DEP, HEAD], annotations)
    check_annotations(msg)

    result = packer.unpack(packer.pack(msg))
    check_annotations(result)


def test_packer_bad_chars(tokenizer):
    """Round-trip text containing a non-ASCII character (o-umlaut) and URL-like
    tokens through pack/unpack, and compare the resulting strings."""
    # Packer first, then tokenization -- same ordering as the other tests.
    packer = Packer(tokenizer.vocab, [])
    text = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'

    original = tokenizer(text)
    restored = packer.unpack(packer.pack(original))

    assert restored.string == original.string


@pytest.mark.models
def test_packer_bad_chars_tagged(EN):
    """Serialize a doc produced by the ``EN`` pipeline and check its tags.

    The doc is converted with ``to_bytes``, loaded into a fresh ``Doc`` on the
    same vocab with ``from_bytes``, and the per-token ``tag_`` values are
    compared with the original's.

    This test has its own name rather than reusing ``test_packer_bad_chars``:
    when two module-level functions share a name, the later ``def`` rebinds
    it and pytest only ever collects that one.
    """
    string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'
    doc = EN(string)
    byte_string = doc.to_bytes()
    result = Doc(EN.vocab).from_bytes(byte_string)
    assert [t.tag_ for t in result] == [t.tag_ for t in doc]
* Tests for serializer 2015-07-17 19:21:10 +00:00			`from __future__ import unicode_literals`

* Update test_packer 2015-07-19 23:38:29 +00:00			`import re`

* Tests for serializer 2015-07-17 19:21:10 +00:00			`import pytest`
			`import numpy`

* Fix serializer tests for new attr scheme 2015-08-26 17:22:26 +00:00			`from spacy.language import Language`
* Fix test of serializer 2015-11-03 08:45:16 +00:00			`from spacy.en import English`
* Tests for serializer 2015-07-17 19:21:10 +00:00			`from spacy.vocab import Vocab`
			`from spacy.tokens.doc import Doc`
* Update test_packer 2015-07-19 23:38:29 +00:00			`from spacy.tokenizer import Tokenizer`
			`from os import path`
Use util.Package class for io. Previous Sputnik integration caused an API change: Vocab, Tagger, etc. were loaded via a from_package classmethod that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod that accepts either util.Package objects or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download. 2015-12-29 17:00:48 +00:00			`import os`
* Update test_packer 2015-07-19 23:38:29 +00:00
untangle data_path/via 2016-01-16 11:23:45 +00:00			`from spacy import util`
* Update serializer tests 2015-07-18 20:46:40 +00:00			`from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD`
* Tests for serializer 2015-07-17 19:21:10 +00:00			`from spacy.serialize.packer import Packer`

			`from spacy.serialize.bits import BitArray`


			`@pytest.fixture`
			`def vocab():`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`path = os.environ.get('SPACY_DATA')`
			`if path is None:`
			`path = util.match_best_version('en', None, util.get_data_path())`
Use util.Package class for io. Previous Sputnik integration caused an API change: Vocab, Tagger, etc. were loaded via a from_package classmethod that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod that accepts either util.Package objects or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download. 2015-12-29 17:00:48 +00:00			`else:`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`path = util.match_best_version('en', None, path)`
untangle data_path/via 2016-01-16 11:23:45 +00:00
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`vocab = English.Defaults('en', path).Vocab()`
* Fix serializer tests for new attr scheme 2015-08-26 17:22:26 +00:00			`lex = vocab['dog']`
* Update serializer tests 2015-07-18 20:46:40 +00:00			`assert vocab[vocab.strings['dog']].orth_ == 'dog'`
* Fix serializer tests for new attr scheme 2015-08-26 17:22:26 +00:00			`lex = vocab['the']`
			`lex = vocab['quick']`
			`lex = vocab['jumped']`
* Tests for serializer 2015-07-17 19:21:10 +00:00			`return vocab`


* Update test_packer 2015-07-19 23:38:29 +00:00			`@pytest.fixture`
			`def tokenizer(vocab):`
			`null_re = re.compile(r'!!!!!!!!!')`
Finish refactoring data loading 2016-09-24 18:26:17 +00:00			`tokenizer = Tokenizer(vocab, {}, null_re.search, null_re.search, null_re.finditer)`
* Update test_packer 2015-07-19 23:38:29 +00:00			`return tokenizer`


			`def test_char_packer(vocab):`
			`packer = Packer(vocab, [])`
			`bits = BitArray()`
			`bits.seek(0)`

* Py 2/3 compatibility of serialize tests 2015-07-24 02:51:53 +00:00			`byte_str = bytearray(b'the dog jumped')`
* Update test_packer 2015-07-19 23:38:29 +00:00			`packer.char_codec.encode(byte_str, bits)`
			`bits.seek(0)`
			`result = [b''] * len(byte_str)`
			`packer.char_codec.decode(bits, result)`
* Update tests for python3 2015-07-24 01:47:59 +00:00			`assert bytearray(result) == byte_str`
* Tests for serializer 2015-07-17 19:21:10 +00:00
* Update test_packer 2015-07-19 23:38:29 +00:00
			`def test_packer_unannotated(tokenizer):`
			`packer = Packer(tokenizer.vocab, [])`

			`msg = tokenizer(u'the dog jumped')`
* Tests for serializer 2015-07-17 19:21:10 +00:00
			`assert msg.string == 'the dog jumped'`
* Update test_packer 2015-07-19 23:38:29 +00:00
* Tests for serializer 2015-07-17 19:21:10 +00:00
			`bits = packer.pack(msg)`

			`result = packer.unpack(bits)`

			`assert result.string == 'the dog jumped'`

* Update test_packer 2015-07-19 23:38:29 +00:00
* Mark serializer test as requiring models 2015-11-03 07:07:08 +00:00			`@pytest.mark.models`
* Update test_packer 2015-07-19 23:38:29 +00:00			`def test_packer_annotated(tokenizer):`
			`vocab = tokenizer.vocab`
* Tests for serializer 2015-07-17 19:21:10 +00:00			`nn = vocab.strings['NN']`
			`dt = vocab.strings['DT']`
			`vbd = vocab.strings['VBD']`
			`jj = vocab.strings['JJ']`
			`det = vocab.strings['det']`
			`nsubj = vocab.strings['nsubj']`
			`adj = vocab.strings['adj']`
			`root = vocab.strings['ROOT']`

			`attr_freqs = [`
			`(TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),`
			`(DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),`
			`(HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())`
			`]`

			`packer = Packer(vocab, attr_freqs)`

* Update test_packer 2015-07-19 23:38:29 +00:00			`msg = tokenizer(u'the dog jumped')`

* Tests for serializer 2015-07-17 19:21:10 +00:00			`msg.from_array(`
			`[TAG, DEP, HEAD],`
			`numpy.array([`
			`[dt, det, 1],`
			`[nn, nsubj, 1],`
			`[vbd, root, 0]`
			`], dtype=numpy.int32))`

			`assert msg.string == 'the dog jumped'`
			`assert [t.tag_ for t in msg] == ['DT', 'NN', 'VBD']`
			`assert [t.dep_ for t in msg] == ['det', 'nsubj', 'ROOT']`
			`assert [(t.head.i - t.i) for t in msg] == [1, 1, 0]`

			`bits = packer.pack(msg)`
			`result = packer.unpack(bits)`

			`assert result.string == 'the dog jumped'`
			`assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']`
			`assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']`
			`assert [(t.head.i - t.i) for t in result] == [1, 1, 0]`
* Upd serialization tests 2015-07-27 19:25:48 +00:00

			`def test_packer_bad_chars(tokenizer):`
			`string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'`
			`packer = Packer(tokenizer.vocab, [])`

			`doc = tokenizer(string)`
			`bits = packer.pack(doc)`
			`result = packer.unpack(bits)`
			`assert result.string == doc.string`


			`@pytest.mark.models`
			`def test_packer_bad_chars(EN):`
			`string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'`
			`doc = EN(string)`
			`byte_string = doc.to_bytes()`
			`result = Doc(EN.vocab).from_bytes(byte_string)`
			`assert [t.tag_ for t in result] == [t.tag_ for t in doc]`