From aa4c33914bb33db37cf4bac4dbaac90905b69604 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 29 May 2017 08:40:45 -0500
Subject: [PATCH] Work on serialization

---
 spacy/language.py          | 46 +++++++++++++++++++-------------------
 spacy/pipeline.pyx         | 19 ++++++++--------
 spacy/syntax/nn_parser.pyx | 15 ++++++++-----
 spacy/util.py              | 15 ++++++-------
 4 files changed, 50 insertions(+), 45 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 6c8c7cd73..8f1ae69ca 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -9,6 +9,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.optimizers import Adam, SGD
 import random
 import ujson
+from collections import OrderedDict

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -154,7 +155,7 @@ class Language(object):
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
             make_doc = factory(self, **meta.get('tokenizer', {}))
-        self.make_doc = make_doc
+        self.tokenizer = make_doc
         if pipeline is True:
             self.pipeline = self.Defaults.create_pipeline(self)
         elif pipeline:
@@ -196,6 +197,9 @@ class Language(object):
             doc = proc(doc)
         return doc

+    def make_doc(self, text):
+        return self.tokenizer(text)
+
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         """Update the models in the pipeline.

@@ -425,19 +429,17 @@ class Language(object):
             from being serialized.
         RETURNS (bytes): The serialized form of the `Language` object.
         """
-        serializers = {
-            'vocab': lambda: self.vocab.to_bytes(),
-            'tokenizer': lambda: self.tokenizer.to_bytes(vocab=False),
-            'meta': lambda: ujson.dumps(self.meta)
-        }
-        for proc in self.pipeline:
-            if not hasattr(proc, 'name'):
-                continue
-            if proc.name in disable:
+        serializers = OrderedDict((
+            ('vocab', lambda: self.vocab.to_bytes()),
+            ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
+            ('meta', lambda: ujson.dumps(self.meta))
+        ))
+        for i, proc in enumerate(self.pipeline):
+            if getattr(proc, 'name', None) in disable:
                 continue
             if not hasattr(proc, 'to_bytes'):
                 continue
-            serializers[proc.name] = lambda: proc.to_bytes(vocab=False)
+            serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
         return util.to_bytes(serializers, {})

     def from_bytes(self, bytes_data, disable=[]):
@@ -444,19 +446,17 @@ class Language(object):
         """Load state from a binary string.
         disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The `Language` object.
         """
""" - deserializers = { - 'vocab': lambda b: self.vocab.from_bytes(b), - 'tokenizer': lambda b: self.tokenizer.from_bytes(b, vocab=False), - 'meta': lambda b: self.meta.update(ujson.loads(b)) - } - for proc in self.pipeline: - if not hasattr(proc, 'name'): + deserializers = OrderedDict(( + ('vocab', lambda b: self.vocab.from_bytes(b)), + ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), + ('meta', lambda b: self.meta.update(ujson.loads(b))) + )) + for i, proc in enumerate(self.pipeline): + if getattr(proc, 'name', None) in disable: continue - if proc.name in disable: + if not hasattr(proc, 'from_bytes'): continue - if not hasattr(proc, 'to_disk'): - continue - deserializers[proc.name] = lambda b: proc.from_bytes(b, vocab=False) - util.from_bytes(deserializers, bytes_data, {}) + deserializers[i] = lambda b: proc.from_bytes(b, vocab=False) + util.from_bytes(bytes_data, deserializers, {}) return self diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index a4d936a70..3635b68c3 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -9,6 +9,7 @@ import numpy cimport numpy as np import cytoolz import util +from collections import OrderedDict from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.neural import Model, Maxout, Softmax, Affine @@ -158,18 +159,18 @@ class TokenVectorEncoder(object): yield def to_bytes(self, **exclude): - serialize = { - 'model': lambda: util.model_to_bytes(self.model), - 'vocab': lambda: self.vocab.to_bytes() - } + serialize = OrderedDict(( + ('model', lambda: util.model_to_bytes(self.model)), + ('vocab', lambda: self.vocab.to_bytes()) + )) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): - deserialize = { - 'model': lambda b: util.model_from_bytes(self.model, b), - 'vocab': lambda b: self.vocab.from_bytes(b) - } - util.from_bytes(deserialize, exclude) + deserialize = OrderedDict(( + ('model', lambda b: util.model_from_bytes(self.model, b)), + ('vocab', lambda b: self.vocab.from_bytes(b)) + )) + util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, **exclude): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0270a6890..d49e9cdef 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -659,9 +659,10 @@ cdef class Parser: def to_bytes(self, **exclude): serializers = { - 'model': lambda: util.model_to_bytes(self.model), + 'lower_model': lambda: util.model_to_bytes(self.model[0]), + 'upper_model': lambda: util.model_to_bytes(self.model[1]), 'vocab': lambda: self.vocab.to_bytes(), - 'moves': lambda: self.moves.to_bytes(vocab=False), + 'moves': lambda: self.moves.to_bytes(strings=False), 'cfg': lambda: ujson.dumps(self.cfg) } return util.to_bytes(serializers, exclude) @@ -669,15 +670,19 @@ cdef class Parser: def from_bytes(self, bytes_data, **exclude): deserializers = { 'vocab': lambda b: self.vocab.from_bytes(b), - 'moves': lambda b: self.moves.from_bytes(b), + 'moves': lambda b: self.moves.from_bytes(b, strings=False), 'cfg': lambda b: self.cfg.update(ujson.loads(b)), - 'model': lambda b: None + 'lower_model': lambda b: None, + 'upper_model': lambda b: None } msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: if self.model is True: self.model, cfg = self.Model(self.moves.n_moves) - util.model_from_bytes(self.model, msg['model']) + else: + cfg = {} + util.model_from_bytes(self.model[0], msg['lower_model']) + util.model_from_bytes(self.model[1], msg['upper_model']) 
             self.cfg.update(cfg)
         return self

diff --git a/spacy/util.py b/spacy/util.py
index fbcf3ae6b..6c8386e2a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -11,6 +11,7 @@ import sys
 import textwrap
 import random
 import numpy
+import io

 import msgpack
 import msgpack_numpy
@@ -447,27 +448,25 @@ def model_to_bytes(model):
         i += 1
         if hasattr(layer, '_layers'):
             queue.extend(layer._layers)
-    data = {'metas': tuple(metas), 'weights': tuple(weights), 'dims':
-            tuple(dims)}
+    data = {'metas': metas, 'weights': weights, 'dims': dims}
     return msgpack.dumps(data)


 def model_from_bytes(model, bytes_data):
     data = msgpack.loads(bytes_data)
-    metas = data['metas']
     weights = data['weights']
+    metas = data['metas']
     dims = data['dims']
     queue = [model]
     i = 0
     for layer in queue:
         if hasattr(layer, '_mem'):
             params = weights[i]
-            flat_mem = layer._mem._mem.ravel()
-            flat_params = params.ravel()
-            flat_mem[:flat_params.size] = flat_params
-            layer._mem._offsets.update(metas[i])
+            blob = layer._mem._get_blob(params.size)
+            blob[:] = params
+            layer._mem._offsets = metas[i]
         if hasattr(layer, '_dims'):
-            layer._dims.update(dims[i])
+            layer._dims = dims[i]
         i += 1
         if hasattr(layer, '_layers'):
             queue.extend(layer._layers)
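
Note on the helper contract: every to_bytes/from_bytes pair in this patch funnels
through util.to_bytes and util.from_bytes, whose bodies are not shown in the diff.
The following is a minimal sketch of the contract the callers assume, not the real
implementation: each getter produces one msgpack field, each setter consumes one by
key, and from_bytes returns the decoded message so deferred fields can be fetched
from it afterwards.

    from collections import OrderedDict

    import msgpack


    def to_bytes(getters, exclude):
        # Call each getter in insertion order; OrderedDict keeps the field
        # order stable across Python versions.
        serial = OrderedDict()
        for key, getter in getters.items():
            if key not in exclude:
                serial[key] = getter()
        return msgpack.dumps(serial)


    def from_bytes(bytes_data, setters, exclude):
        # Decode once, route each field to its setter, and hand the whole
        # message back so deferred fields can be read later.
        msg = msgpack.loads(bytes_data, raw=False)
        for key, setter in setters.items():
            if key not in exclude:
                setter(msg[key])
        return msg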
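
Parser.from_bytes leans on that returned message: the 'lower_model' and
'upper_model' setters are deliberate no-ops, so the raw payloads survive the first
pass and the weights are copied in only once the models exist. A round-trip demo of
the same pattern, reusing the to_bytes/from_bytes sketch above, with plain bytes
standing in for real thinc models:

    import ujson

    cfg = {'nr_class': 3}
    data = to_bytes({
        'cfg': lambda: ujson.dumps(cfg),
        'lower_model': lambda: b'lower-weights',
        'upper_model': lambda: b'upper-weights',
    }, exclude=())

    loaded_cfg = {}
    msg = from_bytes(data, {
        'cfg': lambda b: loaded_cfg.update(ujson.loads(b)),
        'lower_model': lambda b: None,   # defer: the model may not be built yet
        'upper_model': lambda b: None,
    }, exclude=())

    assert loaded_cfg == cfg
    assert msg['lower_model'] == b'lower-weights'   # still available afterwards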
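
The pipeline loops in Language.to_bytes and Language.from_bytes bind the loop
variable as a default argument (lambda proc=proc: ...). A bare lambda closes over
the variable itself, not its current value, so by the time util.to_bytes invoked
the serializers every entry would point at the last pipeline component. The
difference in isolation:

    procs = ['tagger', 'parser', 'ner']

    late = [lambda: p for p in procs]        # all three share one 'p'
    bound = [lambda p=p: p for p in procs]   # default arg freezes each value

    assert [f() for f in late] == ['ner', 'ner', 'ner']
    assert [f() for f in bound] == ['tagger', 'parser', 'ner']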
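
model_to_bytes and model_from_bytes pair weights[i] with the i-th layer visited, so
save and load must walk the layer tree in the same order; the queue.extend trick
gives a breadth-first traversal that is stable as long as the architecture matches.
A stand-alone illustration of that traversal (the Layer class here is hypothetical;
real thinc layers expose _mem, _dims and _layers):

    class Layer(object):
        # Hypothetical stand-in for a thinc model node.
        def __init__(self, dims, sublayers=()):
            self._dims = dict(dims)
            self._layers = list(sublayers)


    def walk(model):
        queue = [model]
        for layer in queue:            # the queue grows while we iterate
            yield layer
            if hasattr(layer, '_layers'):
                queue.extend(layer._layers)


    inner = Layer({'nO': 64})
    outer = Layer({'nI': 300, 'nO': 64}, sublayers=[inner])
    # Parent before children: index i lines up across serialization and
    # deserialization as long as the architecture is unchanged.
    assert [l._dims for l in walk(outer)] == [{'nI': 300, 'nO': 64}, {'nO': 64}]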