Work on serialization

This commit is contained in:
Matthew Honnibal 2017-05-29 08:40:45 -05:00
parent deac7eb01c
commit aa4c33914b
4 changed files with 50 additions and 45 deletions

View File

@@ -9,6 +9,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
from .tokenizer import Tokenizer
from .vocab import Vocab
@@ -154,7 +155,7 @@ class Language(object):
if make_doc is True:
factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {}))
self.make_doc = make_doc
self.tokenizer = make_doc
if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self)
elif pipeline:
@@ -196,6 +197,9 @@ class Language(object):
doc = proc(doc)
return doc
def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Update the models in the pipeline.
@@ -425,19 +429,17 @@ class Language(object):
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
serializers = {
'vocab': lambda: self.vocab.to_bytes(),
'tokenizer': lambda: self.tokenizer.to_bytes(vocab=False),
'meta': lambda: ujson.dumps(self.meta)
}
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
serializers[proc.name] = lambda: proc.to_bytes(vocab=False)
serializers[i] = lambda: proc.to_bytes(vocab=False)
return util.to_bytes(serializers, {})
def from_bytes(self, bytes_data, disable=[]):
@@ -447,20 +449,18 @@ class Language(object):
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
deserializers = {
'vocab': lambda b: self.vocab.from_bytes(b),
'tokenizer': lambda b: self.tokenizer.from_bytes(b, vocab=False),
'meta': lambda b: self.meta.update(ujson.loads(b))
}
for proc in self.pipeline:
if not hasattr(proc, 'name'):
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if proc.name in disable:
if not hasattr(proc, 'from_bytes'):
continue
if not hasattr(proc, 'to_disk'):
continue
deserializers[proc.name] = lambda b: proc.from_bytes(b, vocab=False)
util.from_bytes(deserializers, bytes_data, {})
deserializers[i] = lambda b: proc.from_bytes(b, vocab=False)
util.from_bytes(bytes_data, deserializers, {})
return self

View File

@@ -9,6 +9,7 @@ import numpy
cimport numpy as np
import cytoolz
import util
from collections import OrderedDict
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
@@ -158,18 +159,18 @@ class TokenVectorEncoder(object):
yield
def to_bytes(self, **exclude):
serialize = {
'model': lambda: util.model_to_bytes(self.model),
'vocab': lambda: self.vocab.to_bytes()
}
serialize = OrderedDict((
('model', lambda: util.model_to_bytes(self.model)),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
deserialize = {
'model': lambda b: util.model_from_bytes(self.model, b),
'vocab': lambda b: self.vocab.from_bytes(b)
}
util.from_bytes(deserialize, exclude)
deserialize = OrderedDict((
('model', lambda b: util.model_from_bytes(self.model, b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):

View File

@@ -659,9 +659,10 @@ cdef class Parser:
def to_bytes(self, **exclude):
serializers = {
'model': lambda: util.model_to_bytes(self.model),
'lower_model': lambda: util.model_to_bytes(self.model[0]),
'upper_model': lambda: util.model_to_bytes(self.model[1]),
'vocab': lambda: self.vocab.to_bytes(),
'moves': lambda: self.moves.to_bytes(vocab=False),
'moves': lambda: self.moves.to_bytes(strings=False),
'cfg': lambda: ujson.dumps(self.cfg)
}
return util.to_bytes(serializers, exclude)
@@ -669,15 +670,19 @@ cdef class Parser:
def from_bytes(self, bytes_data, **exclude):
deserializers = {
'vocab': lambda b: self.vocab.from_bytes(b),
'moves': lambda b: self.moves.from_bytes(b),
'moves': lambda b: self.moves.from_bytes(b, strings=False),
'cfg': lambda b: self.cfg.update(ujson.loads(b)),
'model': lambda b: None
'lower_model': lambda b: None,
'upper_model': lambda b: None
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves)
util.model_from_bytes(self.model, msg['model'])
else:
cfg = {}
util.model_from_bytes(self.model[0], msg['lower_model'])
util.model_from_bytes(self.model[1], msg['upper_model'])
self.cfg.update(cfg)
return self

View File

@@ -11,6 +11,7 @@ import sys
import textwrap
import random
import numpy
import io
import msgpack
import msgpack_numpy
@@ -447,27 +448,25 @@ def model_to_bytes(model):
i += 1
if hasattr(layer, '_layers'):
queue.extend(layer._layers)
data = {'metas': tuple(metas), 'weights': tuple(weights), 'dims':
tuple(dims)}
data = {'metas': metas, 'weights': weights, 'dims': dims}
return msgpack.dumps(data)
def model_from_bytes(model, bytes_data):
data = msgpack.loads(bytes_data)
metas = data['metas']
weights = data['weights']
metas = data['metas']
dims = data['dims']
queue = [model]
i = 0
for layer in queue:
if hasattr(layer, '_mem'):
params = weights[i]
flat_mem = layer._mem._mem.ravel()
flat_params = params.ravel()
flat_mem[:flat_params.size] = flat_params
layer._mem._offsets.update(metas[i])
blob = layer._mem._get_blob(params.size)
blob[:] = params
layer._mem._offsets = metas[i]
if hasattr(layer, '_dims'):
layer._dims.update(dims[i])
layer._dims[i] = dims[i]
i += 1
if hasattr(layer, '_layers'):
queue.extend(layer._layers)