from __future__ import unicode_literals import pytest import numpy from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.attrs import ID, SPACY, TAG, DEP, HEAD from spacy.serialize.packer import Packer from spacy.serialize.bits import BitArray def get_lex_props(string, prob=-22): return { 'flags': 0, 'length': len(string), 'orth': string, 'lower': string, 'norm': string, 'shape': string, 'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': prob, 'sentiment': 0 } @pytest.fixture def vocab(): vocab = Vocab(get_lex_props=get_lex_props) vocab['dog'] = get_lex_props('dog', 0.001) vocab['the'] = get_lex_props('the', 0.01) vocab['quick'] = get_lex_props('quick', 0.005) vocab['jumped'] = get_lex_props('jumped', 0.007) return vocab def test_packer_unannotated(vocab): packer = Packer(vocab, [(ID, {}), (SPACY, {})]) ids = [vocab[w].id for w in 'the dog jumped'.split()] msg = Doc.from_ids(vocab, ids, [1, 1, 0]) assert msg.string == 'the dog jumped' bits = packer.pack(msg) result = packer.unpack(bits) assert result.string == 'the dog jumped' def test_packer_annotated(vocab): nn = vocab.strings['NN'] dt = vocab.strings['DT'] vbd = vocab.strings['VBD'] jj = vocab.strings['JJ'] det = vocab.strings['det'] nsubj = vocab.strings['nsubj'] adj = vocab.strings['adj'] root = vocab.strings['ROOT'] attr_freqs = [ (ID, []), (SPACY, []), (TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]), (DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()), (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items()) ] packer = Packer(vocab, attr_freqs) ids = [vocab[w].id for w in 'the dog jumped'.split()] msg = Doc.from_ids(vocab, ids, [1, 1, 0]) msg.from_array( [TAG, DEP, HEAD], numpy.array([ [dt, det, 1], [nn, nsubj, 1], [vbd, root, 0] ], dtype=numpy.int32)) assert msg.string == 'the dog jumped' assert [t.tag_ for t in msg] == ['DT', 'NN', 'VBD'] assert [t.dep_ for t in msg] == ['det', 'nsubj', 'ROOT'] assert [(t.head.i - t.i) for t in msg] == [1, 1, 0] bits = packer.pack(msg) result = packer.unpack(bits) assert result.string == 'the dog jumped' assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD'] assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT'] assert [(t.head.i - t.i) for t in result] == [1, 1, 0]