From ae8010b5262755c82ba82364932d6a8817e974c2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 1 Jun 2017 02:56:12 -0500
Subject: [PATCH] Move weight serialization to Thinc

---
 requirements.txt           |  4 +--
 setup.py                   |  4 +--
 spacy/pipeline.pyx         |  8 +++---
 spacy/syntax/nn_parser.pyx |  8 +++---
 spacy/util.py              | 54 --------------------------------------
 5 files changed, 12 insertions(+), 66 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1fca476d1..636dcf334 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,8 +3,8 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.6.0,<6.7.0
-murmurhash>=0.26,<0.27
+thinc>=6.7.0,<6.8.0
+murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
 ujson>=1.35
diff --git a/setup.py b/setup.py
index 093f0c199..7b40fb4e1 100755
--- a/setup.py
+++ b/setup.py
@@ -188,10 +188,10 @@ def setup_package():
         ext_modules=ext_modules,
         install_requires=[
             'numpy>=1.7',
-            'murmurhash>=0.26,<0.27',
+            'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.6.0,<6.7.0',
+            'thinc>=6.7.0,<6.8.0',
             'plac<1.0.0,>=0.9.6',
             'pip>=9.0.0,<10.0.0',
             'six',
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index ff7098439..a4d307e64 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -169,7 +169,7 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
         deserialize = OrderedDict((
-            ('model', lambda b: util.model_from_bytes(self.model, b)),
+            ('model', lambda b: self.model.from_bytes(b)),
             ('vocab', lambda b: self.vocab.from_bytes(b))
         ))
         util.from_bytes(bytes_data, deserialize, exclude)
@@ -186,7 +186,7 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
         deserialize = OrderedDict((
-            ('model', lambda p: util.model_from_bytes(self.model, p.open('rb').read())),
+            ('model', lambda p: self.model.from_bytes(p.open('rb').read())),
             ('vocab', lambda p: self.vocab.from_disk(p))
         ))
         util.from_disk(path, deserialize, exclude)
@@ -307,7 +307,7 @@ class NeuralTagger(object):
             if self.model is True:
                 token_vector_width = util.env_opt('token_vector_width', 128)
                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
-            util.model_from_bytes(self.model, b)
+            self.model.from_bytes(b)
         deserialize = OrderedDict((
             ('vocab', lambda b: self.vocab.from_bytes(b)),
             ('model', lambda b: load_model(b)),
@@ -324,7 +324,7 @@ class NeuralTagger(object):
 
     def from_disk(self, path, **exclude):
         deserialize = {
-            'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
+            'model': lambda p: self.model.from_bytes(p.open('rb').read()),
             'vocab': lambda p: self.vocab.from_disk(p)
         }
         util.from_disk(path, deserialize, exclude)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d156156d6..82f4e82f3 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -660,10 +660,10 @@ cdef class Parser:
                 cfg = {}
             with (path / 'lower_model').open('rb') as file_:
                 bytes_data = file_.read()
-            util.model_from_bytes(self.model[0], bytes_data)
+            self.model[0].from_bytes(bytes_data)
             with (path / 'upper_model').open('rb') as file_:
                 bytes_data = file_.read()
-            util.model_from_bytes(self.model[1], bytes_data)
+            self.model[1].from_bytes(bytes_data)
         self.cfg.update(cfg)
         return self
 
@@ -691,8 +691,8 @@ cdef class Parser:
                 self.model, cfg = self.Model(self.moves.n_moves)
             else:
                 cfg = {}
-            util.model_from_bytes(self.model[0], msg['lower_model'])
-            util.model_from_bytes(self.model[1], msg['upper_model'])
+            self.model[0].from_bytes(msg['lower_model'])
+            self.model[1].from_bytes(msg['upper_model'])
         self.cfg.update(cfg)
         return self
 
diff --git a/spacy/util.py b/spacy/util.py
index dabceb4a8..7120be98c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -452,60 +452,6 @@ def from_disk(path, readers, exclude):
     return path
 
 
-# This stuff really belongs in thinc -- but I expect
-# to refactor how all this works in thinc anyway.
-# What a mess!
-def model_to_bytes(model):
-    weights = []
-    queue = [model]
-    i = 0
-    for layer in queue:
-        if hasattr(layer, '_mem'):
-            weights.append({
-                'dims': normalize_string_keys(getattr(layer, '_dims', {})),
-                'params': []})
-            if hasattr(layer, 'seed'):
-                weights[-1]['seed'] = layer.seed
-
-            for (id_, name), (start, row, shape) in layer._mem._offsets.items():
-                if row == 1:
-                    continue
-                param = layer._mem.get((id_, name))
-                if not isinstance(layer._mem.weights, numpy.ndarray):
-                    param = param.get()
-                weights[-1]['params'].append(
-                    {
-                        'name': name,
-                        'offset': start,
-                        'shape': shape,
-                        'value': param,
-                    }
-                )
-            i += 1
-        if hasattr(layer, '_layers'):
-            queue.extend(layer._layers)
-    return msgpack.dumps({'weights': weights})
-
-
-def model_from_bytes(model, bytes_data):
-    data = msgpack.loads(bytes_data)
-    weights = data['weights']
-    queue = [model]
-    i = 0
-    for layer in queue:
-        if hasattr(layer, '_mem'):
-            if 'seed' in weights[i]:
-                layer.seed = weights[i]['seed']
-            for dim, value in weights[i]['dims'].items():
-                setattr(layer, dim, value)
-            for param in weights[i]['params']:
-                dest = getattr(layer, param['name'])
-                copy_array(dest, param['value'])
-            i += 1
-        if hasattr(layer, '_layers'):
-            queue.extend(layer._layers)
-
-
 def print_table(data, title=None):
     """Print data in table format.
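
Usage sketch: after this patch, weight serialization lives on the Thinc Model
itself, so the call sites above rely on Model.to_bytes()/from_bytes() rather
than the helpers removed from spacy/util.py. The round trip below is a minimal
sketch assuming thinc>=6.7.0; the Affine layer, its import path, and the sizes
are illustrative assumptions, not anything taken from this commit:

    # Serialize one Thinc model's weights and restore them into a second
    # model of the same shape, mirroring what the pipeline components and
    # parser now do with self.model.from_bytes(...).
    from thinc.neural._classes.affine import Affine

    model = Affine(64, 128)     # nO=64, nI=128; the sizes are arbitrary here
    data = model.to_bytes()     # msgpack bytes holding dims and parameters

    loaded = Affine(64, 128)    # must be built with matching dimensions
    loaded.from_bytes(data)     # copies the serialized weights in place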