Work on to/from bytes/disk serialization methods

Matthew Honnibal 2017-05-29 11:45:45 +02:00
parent 6b019b0540
commit ff26aa6c37
6 changed files with 205 additions and 65 deletions

View File

@@ -366,20 +366,22 @@ class Language(object):
>>> nlp.to_disk('/path/to/models')
"""
path = util.ensure_path(path)
        with path.open('wb') as file_:
            file_.write(self.to_bytes(disable))
#serializers = {
# 'vocab': lambda p: self.vocab.to_disk(p),
# 'tokenizer': lambda p: self.tokenizer.to_disk(p, vocab=False),
        # 'meta.json': lambda p: ujson.dump(self.meta, p.open('w'))
#}
#for proc in self.pipeline:
# if not hasattr(proc, 'name'):
# continue
# if proc.name in disable:
# continue
# if not hasattr(proc, 'to_disk'):
# continue
# serializers[proc.name] = lambda p: proc.to_disk(p, vocab=False)
#util.to_disk(serializers, path)
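        # For reference, a sketch of the on-disk layout the commented-out
        # scaffolding above is working towards (directory and component names
        # are illustrative, not part of this commit):
        #
        #     /path/to/models/
        #         vocab/          <- self.vocab.to_disk(...)
        #         tokenizer       <- self.tokenizer.to_disk(..., vocab=False)
        #         meta.json       <- ujson-serialized self.meta
        #         tagger/         <- one entry per named pipeline component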
def from_disk(self, path, disable=[]):
"""Loads state from a directory. Modifies the object in place and
@@ -396,13 +398,24 @@ class Language(object):
>>> nlp = Language().from_disk('/path/to/models')
"""
path = util.ensure_path(path)
        with path.open('rb') as file_:
            bytes_data = file_.read()
        return self.from_bytes(bytes_data, disable)
#deserializers = {
# 'vocab': lambda p: self.vocab.from_disk(p),
# 'tokenizer': lambda p: self.tokenizer.from_disk(p, vocab=False),
        # 'meta.json': lambda p: self.meta.update(ujson.load(p.open()))
#}
#for proc in self.pipeline:
# if not hasattr(proc, 'name'):
# continue
# if proc.name in disable:
# continue
        # if not hasattr(proc, 'from_disk'):
# continue
# deserializers[proc.name] = lambda p: proc.from_disk(p, vocab=False)
#util.from_disk(deserializers, path)
#return self
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
@@ -411,11 +424,20 @@ class Language(object):
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
serializers = {
'vocab': lambda: self.vocab.to_bytes(),
'tokenizer': lambda: self.tokenizer.to_bytes(vocab=False),
'meta': lambda: ujson.dumps(self.meta)
}
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
            serializers[proc.name] = lambda proc=proc: proc.to_bytes(vocab=False)
        return util.to_bytes(serializers, disable)
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
@@ -424,12 +446,23 @@ class Language(object):
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
deserializers = {
'vocab': lambda b: self.vocab.from_bytes(b),
'tokenizer': lambda b: self.tokenizer.from_bytes(b, vocab=False),
'meta': lambda b: self.meta.update(ujson.loads(b))
}
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
continue
            if not hasattr(proc, 'from_bytes'):
                continue
            deserializers[proc.name] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
        util.from_bytes(bytes_data, deserializers, disable)
return self
def _pipe(func, docs):
for doc in docs:
func(doc)
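# A minimal usage sketch of the byte-level round-trip above (illustrative
# only; assumes an English subclass of Language is importable):
#
#     nlp = English()
#     data = nlp.to_bytes(disable=['tagger'])    # skip a component by name
#     nlp2 = English().from_bytes(data, disable=['tagger'])
#
# from_bytes() modifies the instance in place and returns it, so loading
# chains onto a freshly constructed pipeline of the same class.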

View File

@@ -35,7 +35,6 @@ from .syntax import nonproj
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from .parts_of_speech import X
@@ -160,36 +159,33 @@ class TokenVectorEncoder(object):
def to_bytes(self, **exclude):
serialize = {
            'model': lambda: util.model_to_bytes(self.model),
'vocab': lambda: self.vocab.to_bytes()
}
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
deserialize = {
            'model': lambda b: util.model_from_bytes(self.model, b),
'vocab': lambda b: self.vocab.from_bytes(b)
}
        util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
        serialize = {
            'model': lambda p: p.open('wb').write(util.model_to_bytes(self.model)),
            'vocab': lambda p: self.vocab.to_disk(p)
        }
        util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
        deserialize = {
            'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
            'vocab': lambda p: self.vocab.from_disk(p)
        }
        util.from_disk(path, deserialize, exclude)
return self
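    # Disk round-trip sketch for the encoder (illustrative; assumes a Vocab
    # instance). Keyword arguments name the entries to skip, since the
    # helpers only test key membership in `exclude`:
    #
    #     tok2vec = TokenVectorEncoder(vocab)
    #     tok2vec.to_disk('/tmp/tok2vec')                # writes model + vocab
    #     tok2vec.from_disk('/tmp/tok2vec', vocab=True)  # reload weights only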
class NeuralTagger(object):
@@ -291,19 +287,33 @@ class NeuralTagger(object):
def to_bytes(self, **exclude):
serialize = {
            'model': lambda: util.model_to_bytes(self.model),
'vocab': lambda: self.vocab.to_bytes()
}
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
deserialize = {
            'model': lambda b: util.model_from_bytes(self.model, b),
'vocab': lambda b: self.vocab.from_bytes(b)
}
        util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = {
            'model': lambda p: p.open('wb').write(util.model_to_bytes(self.model)),
'vocab': lambda p: self.vocab.to_disk(p)
}
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
deserialize = {
'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
'vocab': lambda p: self.vocab.from_disk(p)
}
util.from_disk(path, deserialize, exclude)
return self
class NeuralLabeller(NeuralTagger):

View File

@@ -631,37 +631,53 @@ cdef class Parser:
with self.model[1].use_params(params):
yield
    def to_disk(self, path, **exclude):
        serializers = {
            'model': lambda p: p.open('wb').write(
                util.model_to_bytes(self.model)),
            'vocab': lambda p: self.vocab.to_disk(p),
            'moves': lambda p: self.moves.to_disk(p, strings=False),
            'cfg': lambda p: ujson.dump(self.cfg, p.open('w'))
}
util.to_disk(path, serializers, exclude)
    def from_disk(self, path, **exclude):
        deserializers = {
            'vocab': lambda p: self.vocab.from_disk(p),
            'moves': lambda p: self.moves.from_disk(p, strings=False),
            'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
            'model': lambda p: None
        }
        util.from_disk(path, deserializers, exclude)
        if 'model' not in exclude:
            path = util.ensure_path(path)
            if self.model is True:
                self.model, cfg = self.Model(**self.cfg)
                self.cfg.update(cfg)
            with (path / 'model').open('rb') as file_:
                util.model_from_bytes(self.model, file_.read())
return self
def to_bytes(self, **exclude):
        serializers = {
'model': lambda: util.model_to_bytes(self.model),
'vocab': lambda: self.vocab.to_bytes(),
            'moves': lambda: self.moves.to_bytes(vocab=False),
'cfg': lambda: ujson.dumps(self.cfg)
}
        return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
        deserializers = {
'vocab': lambda b: self.vocab.from_bytes(b),
'moves': lambda b: self.moves.from_bytes(b),
'cfg': lambda b: self.cfg.update(ujson.loads(b)),
'model': lambda b: None
}
        msg = util.from_bytes(bytes_data, deserializers, exclude)
        if 'model' not in exclude:
            if self.model is True:
                self.model, cfg = self.Model(self.moves.n_moves)
                self.cfg.update(cfg)
            util.model_from_bytes(self.model, msg['model'])
return self
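    # The `self.model is True` check implements deferred construction: a
    # freshly created parser stores True as a placeholder, and
    # from_bytes()/from_disk() build the real model before restoring weights.
    # Sketch (this mirrors the test further below):
    #
    #     parser = NeuralDependencyParser(vocab)
    #     assert parser.model is True        # placeholder, no weights yet
    #     parser.from_bytes(bytes_data)      # builds model, then loads weights
    #     assert parser.model is not True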

View File

@@ -6,7 +6,9 @@ from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict, OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
@@ -153,3 +155,48 @@ cdef class TransitionSystem:
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1
def to_disk(self, path, **exclude):
actions = list(self.move_names)
        serializers = {
            'actions': lambda p: ujson.dump(actions, p.open('w')),
            'strings': lambda p: self.strings.to_disk(p)
        }
        util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
actions = []
deserializers = {
'strings': lambda p: self.strings.from_disk(p),
'actions': lambda p: actions.extend(ujson.load(p.open()))
}
util.from_disk(path, deserializers, exclude)
for move, label in actions:
self.add_action(move, label)
return self
def to_bytes(self, **exclude):
transitions = []
for trans in self.c[:self.n_moves]:
transitions.append({
'clas': trans.clas,
'move': trans.move,
'label': self.strings[trans.label],
'name': self.move_name(trans.move, trans.label)
})
serializers = {
'transitions': lambda: ujson.dumps(transitions),
'strings': lambda: self.strings.to_bytes()
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
transitions = []
deserializers = {
'transitions': lambda b: transitions.extend(ujson.loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
for trans in transitions:
self.add_action(trans['move'], trans['label'])
return self
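    # Shape of the 'transitions' payload, with illustrative values:
    #
    #     [{'clas': 0, 'move': 2, 'label': 'nsubj', 'name': 'L-nsubj'}, ...]
    #
    # from_bytes() re-adds each action by (move, label), so class indices
    # are reassigned locally rather than trusted from the payload.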

View File

@@ -0,0 +1,34 @@
import pytest
from ...pipeline import NeuralDependencyParser
from ...vocab import Vocab
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
parser.add_label('nsubj')
parser.model, cfg = parser.Model(parser.moves.n_moves)
parser.cfg.update(cfg)
return parser
@pytest.fixture
def blank_parser(vocab):
parser = NeuralDependencyParser(vocab)
return parser
def test_to_from_bytes(parser, blank_parser):
assert parser.model is not True
assert blank_parser.model is True
assert blank_parser.moves.n_moves != parser.moves.n_moves
bytes_data = parser.to_bytes()
blank_parser.from_bytes(bytes_data)
assert blank_parser.model is not True
assert blank_parser.moves.n_moves == parser.moves.n_moves
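def test_to_from_disk(parser, tmpdir):
    # Hypothetical companion test, not part of this commit: round-trip the
    # trained parser through a directory via pytest's tmpdir fixture.
    parser.to_disk(str(tmpdir))
    loaded = NeuralDependencyParser(parser.vocab).from_disk(str(tmpdir))
    assert loaded.moves.n_moves == parser.moves.n_moves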

View File

@@ -417,11 +417,11 @@ def to_bytes(getters, exclude):
for key, getter in getters.items():
if key not in exclude:
serialized[key] = getter()
    return msgpack.dumps(serialized)
def from_bytes(bytes_data, setters, exclude):
    msg = msgpack.loads(bytes_data)
    for key, setter in setters.items():
        if key not in exclude:
            setter(msg[key])
    return msg
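# The disk-side counterparts are not shown in this hunk. Given how callers
# above use them (util.to_disk(path, serializers, exclude) and
# util.from_disk(path, deserializers, exclude)), they would need to look
# roughly like this sketch (an assumption, not the committed code):
def to_disk(path, writers, exclude):
    path = ensure_path(path)
    if not path.exists():
        path.mkdir()
    for key, writer in writers.items():
        if key not in exclude:
            writer(path / key)    # each writer owns its own file or subdir
    return path

def from_disk(path, readers, exclude):
    path = ensure_path(path)
    for key, reader in readers.items():
        if key not in exclude:
            reader(path / key)
    return path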