spaCy/tests/serialize/test_packer.py

98 lines
2.7 KiB
Python

from __future__ import unicode_literals
import pytest
import numpy
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
def get_lex_props(string, prob=-22):
    """Build the lexeme property dict for ``string``.

    This function is handed to ``Vocab`` as its ``get_lex_props`` callback
    and is also used to create explicit vocabulary entries with a chosen
    probability.

    Args:
        string: The word form. It is reused verbatim for the ``orth``,
            ``lower``, ``norm`` and ``shape`` fields.
        prob: Value stored under ``'prob'``. Defaults to -22.

    Returns:
        A dict with the keys ``flags``, ``length``, ``orth``, ``lower``,
        ``norm``, ``shape``, ``prefix``, ``suffix``, ``cluster``, ``prob``
        and ``sentiment``.
    """
    return {
        'flags': 0,
        'length': len(string),
        'orth': string,
        'lower': string,
        'norm': string,
        'shape': string,
        # Slice rather than index so an empty string yields an empty prefix
        # instead of raising IndexError (mirrors the suffix slice below).
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': prob,
        'sentiment': 0,
    }
@pytest.fixture
def vocab():
    """Provide a ``Vocab`` pre-populated with the words used by these tests.

    Entries are created through ``get_lex_props`` with explicit ``prob``
    values for 'dog', 'the', 'quick' and 'jumped'.
    """
    lexicon = Vocab(get_lex_props=get_lex_props)

    lexicon['dog'] = get_lex_props('dog', 0.001)
    # Sanity check: the freshly stored entry is reachable via its string id.
    assert lexicon[lexicon.strings['dog']].orth_ == 'dog'

    remaining = (('the', 0.01), ('quick', 0.005), ('jumped', 0.007))
    for word, prob in remaining:
        lexicon[word] = get_lex_props(word, prob)
    return lexicon
def test_packer_unannotated(vocab):
    """Round-trip a Doc without annotations through the Packer.

    The packer is built with ORTH frequencies read from the vocab and an
    empty SPACY frequency list; the test checks that the document text is
    unchanged after pack -> unpack.
    """
    text = 'the dog jumped'

    orth_freqs = [(lex.orth, lex.prob) for lex in vocab]
    packer = Packer(vocab, [(ORTH, orth_freqs), (SPACY, [])])

    word_ids = [vocab[word].orth for word in text.split()]
    # NOTE(review): the third argument looks like per-token trailing-space
    # flags (1, 1, 0 matches the spacing of the text) -- confirm against
    # Doc.from_ids.
    doc = Doc.from_ids(vocab, word_ids, [1, 1, 0])
    assert doc.string == text

    packed = packer.pack(doc)
    unpacked = packer.unpack(packed)
    assert unpacked.string == text
def test_packer_annotated(vocab):
    """Round-trip a Doc carrying TAG, DEP and HEAD annotations.

    The document is annotated via ``from_array`` and then packed and
    unpacked; text, tags, dependency labels and head offsets are checked
    both before packing and on the unpacked result.
    """
    text = 'the dog jumped'
    expected_tags = ['DT', 'NN', 'VBD']
    expected_deps = ['det', 'nsubj', 'ROOT']
    expected_head_offsets = [1, 1, 0]

    def check_annotations(doc):
        # The same four checks are applied to the input and the output doc.
        assert doc.string == text
        assert [t.tag_ for t in doc] == expected_tags
        assert [t.dep_ for t in doc] == expected_deps
        assert [(t.head.i - t.i) for t in doc] == expected_head_offsets

    # Look the labels up in the same order as before, in case the string
    # store hands out ids in lookup order.
    strings = vocab.strings
    nn, dt, vbd, jj = (strings[tag] for tag in ('NN', 'DT', 'VBD', 'JJ'))
    det, nsubj, adj, root = (
        strings[label] for label in ('det', 'nsubj', 'adj', 'ROOT')
    )

    dep_freqs = {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}
    head_freqs = {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}
    packer = Packer(vocab, [
        (ORTH, [(lex.orth, lex.prob) for lex in vocab]),
        (SPACY, []),
        (TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
        (DEP, dep_freqs.items()),
        (HEAD, head_freqs.items()),
    ])

    word_ids = [vocab[word].orth for word in text.split()]
    doc = Doc.from_ids(vocab, word_ids, [1, 1, 0])

    # One row per token; columns follow the attribute list given to
    # from_array: TAG, DEP, HEAD.
    columns = [TAG, DEP, HEAD]
    rows = numpy.array(
        [
            [dt, det, 1],
            [nn, nsubj, 1],
            [vbd, root, 0],
        ],
        dtype=numpy.int32,
    )
    doc.from_array(columns, rows)
    check_annotations(doc)

    unpacked = packer.unpack(packer.pack(doc))
    check_annotations(unpacked)