Fix serialization of tag_map in NeuralTagger

This commit is contained in:
Matthew Honnibal 2017-06-02 10:18:37 -05:00
parent acc47e2673
commit 5f4d328e2c
1 changed file with 12 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ import cytoolz
import util
from collections import OrderedDict
import ujson
import msgpack
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
@@ -301,7 +302,8 @@ class NeuralTagger(object):
    def to_bytes(self, **exclude):
        serialize = OrderedDict((
            ('model', lambda: self.model.to_bytes()),
            ('vocab', lambda: self.vocab.to_bytes()),
            ('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map))
        ))
        return util.to_bytes(serialize, exclude)
@@ -311,8 +313,15 @@ class NeuralTagger(object):
            token_vector_width = util.env_opt('token_vector_width', 128)
            self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
            self.model.from_bytes(b)

        def load_tag_map(b):
            tag_map = msgpack.loads(b)
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer)

        deserialize = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('tag_map', load_tag_map),
            ('model', lambda b: load_model(b)),
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
@@ -321,7 +330,7 @@ class NeuralTagger(object):
    def to_disk(self, path, **exclude):
        serialize = OrderedDict((
            ('vocab', lambda p: self.vocab.to_disk(p)),
            ('tag_map', lambda p: p.open('w').write(msgpack.dumps(
                self.vocab.morphology.tag_map))),
            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
        ))
@@ -336,7 +345,7 @@ class NeuralTagger(object):
        def load_tag_map(p):
            with p.open() as file_:
                tag_map = msgpack.loads(file_.read())
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer)