From 03a215c5fd577dc5c76ad9887938e1fc64264134 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 19 Oct 2017 13:44:49 +0200 Subject: [PATCH 01/28] Make PrecomputableAffines work --- spacy/_ml.py | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index b07e179f0..ad6ef6361 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -30,6 +30,8 @@ from . import util import numpy import io +from blis.py import einsum + # TODO: Unset this once we don't want to support models previous models. import thinc.neural._classes.layernorm thinc.neural._classes.layernorm.set_compat_six_eight(False) @@ -105,9 +107,7 @@ def _preprocess_doc(docs, drop=0.): def _init_for_precomputed(W, ops): if (W**2).sum() != 0.: return - reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2])) - ops.xavier_uniform_init(reshaped) - W[:] = reshaped.reshape(W.shape) + ops.xavier_uniform_init(W, inplace=True) @describe.on_data(_set_dimensions_if_needed) @@ -116,7 +116,7 @@ def _init_for_precomputed(W, ops): nF=Dimension("Number of features"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nI), + lambda obj: (obj.nI, obj.nF * obj.nO), lambda W, ops: _init_for_precomputed(W, ops)), b=Biases("Bias vector", lambda obj: (obj.nO,)), @@ -130,31 +130,43 @@ class PrecomputableAffine(Model): self.nI = nI self.nF = nF + @property + def nIF(self): + return self.nI * self.nF + + @property + def nFO(self): + return self.nF * self.nO + def begin_update(self, X, drop=0.): + nN = X.shape[0] # X: (b, i) - # Yf: (b, f, i) + # Xf: (b, f, i) + # Yf: (b, f, o) # dY: (b, o) # dYf: (b, f, o) - #Yf = numpy.einsum('bi,foi->bfo', X, self.W) - Yf = self.ops.xp.tensordot( - X, self.W, axes=[[1], [2]]) - Yf += self.b + # W: (i, fo) + # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W) + Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO)) def backward(dY_ids, sgd=None): - tensordot = self.ops.xp.tensordot dY, ids = dY_ids + nB = ids.shape[0] Xf = X[ids] + Xf = Xf.reshape((nB, self.nIF)) - #dXf = numpy.einsum('bo,foi->bfi', dY, self.W) - dXf = tensordot(dY, self.W, axes=[[1], [1]]) - #dW = numpy.einsum('bo,bfi->ofi', dY, Xf) - dW = tensordot(dY, Xf, axes=[[0], [0]]) - # ofi -> foi - self.d_W += dW.transpose((1, 0, 2)) - self.d_b += dY.sum(axis=0) + dW_re = self.d_W.reshape((self.nIF, self.nO)) + W_re = self.d_W.reshape((self.nIF, self.nO)) + # bo,if_o->bif + dXf = einsum('ab,cb->ac', dY, W_re) + # b_if,bo->if_o + einsum('ab,ac->bc', Xf, dY, out=dW_re) + # self.d_b += dY.sum(axis=0) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf + dXf = dXf.reshape((nB, self.nI, self.nF)) + dXf = dXf.transpose((0, 2, 1)) + return self.ops.xp.ascontiguousarray(dXf) return Yf, backward From b54b4b8a974087577d3e1d22bdc90d3e64ef8bd7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 19 Oct 2017 13:45:18 +0200 Subject: [PATCH 02/28] Make parser_maxout_pieces hyper-param work --- spacy/syntax/nn_parser.pyx | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index cb26b8d37..361e61a99 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -153,7 +153,7 @@ cdef class precompute_hiddens: if bp_nonlinearity is not None: d_state_vector = bp_nonlinearity(d_state_vector, sgd) # This will usually be on GPU - if isinstance(d_state_vector, numpy.ndarray): + if not 
isinstance(d_state_vector, self.ops.xp.ndarray): d_state_vector = self.ops.xp.array(d_state_vector) d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) return d_tokens @@ -244,8 +244,8 @@ cdef class Parser: if depth != 1: raise ValueError("Currently parser depth is hard-coded to 1.") parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) - if parser_maxout_pieces != 2: - raise ValueError("Currently parser_maxout_pieces is hard-coded to 2") + #if parser_maxout_pieces != 2: + # raise ValueError("Currently parser_maxout_pieces is hard-coded to 2") token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) @@ -258,9 +258,13 @@ cdef class Parser: tok2vec = Tok2Vec(token_vector_width, embed_size, pretrained_dims=cfg.get('pretrained_dims', 0)) tok2vec = chain(tok2vec, flatten) - lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class, - nF=cls.nr_feature, nP=parser_maxout_pieces, - nI=token_vector_width) + if parser_maxout_pieces >= 2: + lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class, + nF=cls.nr_feature, nP=parser_maxout_pieces, + nI=token_vector_width) + else: + lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, + nF=cls.nr_feature, nI=token_vector_width) with Model.use_device('cpu'): upper = chain( @@ -413,7 +417,7 @@ cdef class Parser: for stcls in state_objs: if not stcls.c.is_final(): states.push_back(stcls.c) - + feat_weights = state2vec.get_feat_weights() cdef int i cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T) @@ -438,7 +442,7 @@ cdef class Parser: is_valid = calloc(nr_class, sizeof(int)) vectors = calloc(nr_hidden * nr_piece, sizeof(float)) scores = calloc(nr_class, sizeof(float)) - + while not state.is_final(): state.set_context_tokens(token_ids, nr_feat) memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) @@ -448,7 +452,12 @@ cdef class Parser: V = vectors W = hW for i in range(nr_hidden): - feature = V[0] if V[0] >= V[1] else V[1] + if nr_piece == 1: + feature = V[0] + elif nr_piece == 2: + feature = V[0] if V[0] >= V[1] else V[1] + else: + feature = Vec.max(V, nr_piece) for j in range(nr_class): scores[j] += feature * W[j] W += nr_class From b00d0a2c979ee81bc5f343d40b330445850219c2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 19 Oct 2017 18:42:11 +0200 Subject: [PATCH 03/28] Fix bias in parser --- spacy/_ml.py | 10 +++++----- spacy/syntax/nn_parser.pxd | 3 ++- spacy/syntax/nn_parser.pyx | 22 +++++++++++++++------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index ad6ef6361..2b82f3d9b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -148,6 +148,7 @@ class PrecomputableAffine(Model): # W: (i, fo) # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W) Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO)) + #Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO)) def backward(dY_ids, sgd=None): dY, ids = dY_ids nB = ids.shape[0] @@ -155,12 +156,14 @@ class PrecomputableAffine(Model): Xf = Xf.reshape((nB, self.nIF)) dW_re = self.d_W.reshape((self.nIF, self.nO)) - W_re = self.d_W.reshape((self.nIF, self.nO)) + W_re = self.W.reshape((self.nIF, self.nO)) # bo,if_o->bif dXf = einsum('ab,cb->ac', dY, W_re) + #dXf = self.ops.xp.dot(dY, W_re.T) # b_if,bo->if_o einsum('ab,ac->bc', Xf, dY, out=dW_re) - # self.d_b 
+= dY.sum(axis=0) + #self.ops.xp.dot(Xf.T, dY, out=dW_re) + self.d_b += dY.sum(axis=0) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) @@ -208,7 +211,6 @@ class PrecomputableMaxouts(Model): ascontiguous = self.ops.xp.ascontiguousarray Yfp = tensordot(X, self.W, axes=[[1], [3]]) - Yfp += self.b def backward(dYp_ids, sgd=None): dYp, ids = dYp_ids @@ -380,8 +382,6 @@ def reapply(layer, n_times): return wrap(reapply_fwd, layer) - - def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 1d389609b..56615c6f1 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -16,5 +16,6 @@ cdef class Parser: cdef public object _multitasks cdef void _parseC(self, StateC* state, - const float* feat_weights, const float* hW, const float* hb, + const float* feat_weights, const float* bias, + const float* hW, const float* hb, int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 361e61a99..755c87369 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -101,6 +101,7 @@ cdef class precompute_hiddens: cdef public object ops cdef np.ndarray _features cdef np.ndarray _cached + cdef np.ndarray bias cdef object _cuda_stream cdef object _bp_hiddens @@ -118,6 +119,7 @@ cdef class precompute_hiddens: self.nO = cached.shape[2] self.nP = getattr(lower_model, 'nP', 1) self.ops = lower_model.ops + self.bias = lower_model.b self._is_synchronized = False self._cuda_stream = cuda_stream self._cached = cached @@ -147,6 +149,7 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) + state_vector += self.bias.ravel() state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector, sgd=None): @@ -161,14 +164,15 @@ cdef class precompute_hiddens: def _nonlinearity(self, state_vector): if self.nP == 1: - return state_vector, None + mask = state_vector >= 0. 
+ return state_vector * mask, lambda dY, sgd=None: dY * mask state_vector = state_vector.reshape( (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP)) best, which = self.ops.maxout(state_vector) - def backprop(d_best, sgd=None): - return self.ops.backprop_maxout(d_best, which, self.nP) - return best, backprop + def backprop_maxout(d_best, sgd=None): + return self.ops.backprop_maxout(d_best, which, self.nP) + return best, backprop_maxout cdef void sum_state_features(float* output, @@ -425,18 +429,20 @@ cdef class Parser: hW = hidden_weights.data hb = hidden_bias.data + bias = state2vec.bias.data cdef int nr_hidden = hidden_weights.shape[0] cdef int nr_task = states.size() with nogil: for i in cython.parallel.prange(nr_task, num_threads=2, schedule='guided'): self._parseC(states[i], - feat_weights, hW, hb, + feat_weights, bias, hW, hb, nr_class, nr_hidden, nr_feat, nr_piece) return state_objs cdef void _parseC(self, StateC* state, - const float* feat_weights, const float* hW, const float* hb, + const float* feat_weights, const float* bias, + const float* hW, const float* hb, int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil: token_ids = calloc(nr_feat, sizeof(int)) is_valid = calloc(nr_class, sizeof(int)) @@ -449,11 +455,13 @@ cdef class Parser: memset(scores, 0, nr_class * sizeof(float)) sum_state_features(vectors, feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece) + for i in range(nr_hidden * nr_piece): + vectors[i] += bias[i] V = vectors W = hW for i in range(nr_hidden): if nr_piece == 1: - feature = V[0] + feature = V[0] if V[0] >= 0. else 0. elif nr_piece == 2: feature = V[0] if V[0] >= V[1] else V[1] else: From a17a1b60c7718ef5958cae6ae1bf72df51fbfd02 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 19 Oct 2017 20:26:37 +0200 Subject: [PATCH 04/28] Clean up redundant PrecomputableMaxouts class --- spacy/_ml.py | 99 +++++++++++++--------------------------------------- 1 file changed, 24 insertions(+), 75 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 2b82f3d9b..1f504ec4a 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -30,8 +30,6 @@ from . import util import numpy import io -from blis.py import einsum - # TODO: Unset this once we don't want to support models previous models. 
import thinc.neural._classes.layernorm thinc.neural._classes.layernorm.set_compat_six_eight(False) @@ -107,7 +105,9 @@ def _preprocess_doc(docs, drop=0.): def _init_for_precomputed(W, ops): if (W**2).sum() != 0.: return + W = W.reshape((W.shape[0] * W.shape[1], W.shape[2])) ops.xavier_uniform_init(W, inplace=True) + return W @describe.on_data(_set_dimensions_if_needed) @@ -116,7 +116,7 @@ def _init_for_precomputed(W, ops): nF=Dimension("Number of features"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nI, obj.nF * obj.nO), + lambda obj: (obj.nI, obj.nF, obj.nO), lambda W, ops: _init_for_precomputed(W, ops)), b=Biases("Bias vector", lambda obj: (obj.nO,)), @@ -131,7 +131,7 @@ class PrecomputableAffine(Model): self.nF = nF @property - def nIF(self): + def nFI(self): return self.nI * self.nF @property @@ -145,87 +145,34 @@ class PrecomputableAffine(Model): # Yf: (b, f, o) # dY: (b, o) # dYf: (b, f, o) - # W: (i, fo) - # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W) - Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO)) - #Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO)) + # W: (i, f, o) + W = self.W.reshape((self.nI, self.nFO)) + Yf = self.ops.xp.dot(X, W) + Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO)) + #Yf = einsum('ab,bc->ac', X, W) def backward(dY_ids, sgd=None): dY, ids = dY_ids - nB = ids.shape[0] Xf = X[ids] - Xf = Xf.reshape((nB, self.nIF)) - - dW_re = self.d_W.reshape((self.nIF, self.nO)) - W_re = self.W.reshape((self.nIF, self.nO)) - # bo,if_o->bif - dXf = einsum('ab,cb->ac', dY, W_re) - #dXf = self.ops.xp.dot(dY, W_re.T) - # b_if,bo->if_o - einsum('ab,ac->bc', Xf, dY, out=dW_re) - #self.ops.xp.dot(Xf.T, dY, out=dW_re) + # bo,fi_o->b_if -> b_fi + W_o_fi = self._transpose(self.W, shape=(self.nO, self.nFI)) + dXf = self.ops.xp.dot(dY, W_o_fi).reshape((Xf.shape[0], self.nF, self.nI)) + # bo,b_fi->o_fi + dW = Xf.reshape((Xf.shape[0], self.nFI)) + dW = self.ops.xp.dot(Xf.T, dY) + dW = dW.reshape((self.nO, self.nF, self.nI)) + self.d_W += dW.transpose((2, 1, 0)) self.d_b += dY.sum(axis=0) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) - dXf = dXf.reshape((nB, self.nI, self.nF)) - dXf = dXf.transpose((0, 2, 1)) - return self.ops.xp.ascontiguousarray(dXf) + return dXf return Yf, backward + def _transpose(self, weights, shape): + weights = weights.transpose((2, 1, 0)) + weights = self.ops.xp.ascontiguousarray(weights) + return weights.reshape(shape) -@describe.on_data(_set_dimensions_if_needed) -@describe.attributes( - nI=Dimension("Input size"), - nF=Dimension("Number of features"), - nP=Dimension("Number of pieces"), - nO=Dimension("Output size"), - W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI), - lambda W, ops: ops.xavier_uniform_init(W)), - b=Biases("Bias vector", - lambda obj: (obj.nO, obj.nP)), - d_W=Gradient("W"), - d_b=Gradient("b") -) -class PrecomputableMaxouts(Model): - def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): - Model.__init__(self, **kwargs) - self.nO = nO - self.nP = nP - self.nI = nI - self.nF = nF - - def begin_update(self, X, drop=0.): - # X: (b, i) - # Yfp: (b, f, o, p) - # Xf: (f, b, i) - # dYp: (b, o, p) - # W: (f, o, p, i) - # b: (o, p) - - # bi,opfi->bfop - # bop,fopi->bfi - # bop,fbi->opfi : fopi - - tensordot = self.ops.xp.tensordot - ascontiguous = self.ops.xp.ascontiguousarray - - Yfp = tensordot(X, self.W, axes=[[1], [3]]) - - def backward(dYp_ids, sgd=None): - dYp, ids = dYp_ids - Xf = X[ids] - - dXf = tensordot(dYp, 
self.W, axes=[[1, 2], [1,2]]) - dW = tensordot(dYp, Xf, axes=[[0], [0]]) - - self.d_W += dW.transpose((2, 0, 1, 3)) - self.d_b += dYp.sum(axis=0) - - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf - return Yfp, backward # Thinc's Embed class is a bit broken atm, so drop this here. from thinc import describe @@ -382,6 +329,8 @@ def reapply(layer, n_times): return wrap(reapply_fwd, layer) + + def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None From a8850b4282f4c16edcc7fa3fc5906599ced2278a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 19 Oct 2017 20:27:34 +0200 Subject: [PATCH 05/28] Remove redundant PrecomputableMaxouts class --- spacy/syntax/nn_parser.pyx | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 755c87369..10a79750b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -47,7 +47,7 @@ from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream -from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts +from .._ml import zero_init, PrecomputableAffine from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune from .._ml import Residual, drop_layer, flatten from .._ml import link_vectors_to_models @@ -153,8 +153,7 @@ cdef class precompute_hiddens: state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector, sgd=None): - if bp_nonlinearity is not None: - d_state_vector = bp_nonlinearity(d_state_vector, sgd) + d_state_vector = bp_nonlinearity(d_state_vector, sgd) # This will usually be on GPU if not isinstance(d_state_vector, self.ops.xp.ndarray): d_state_vector = self.ops.xp.array(d_state_vector) @@ -165,14 +164,18 @@ cdef class precompute_hiddens: def _nonlinearity(self, state_vector): if self.nP == 1: mask = state_vector >= 0. 
- return state_vector * mask, lambda dY, sgd=None: dY * mask - state_vector = state_vector.reshape( - (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP)) - best, which = self.ops.maxout(state_vector) + state_vector *= mask + else: + state_vector = state_vector.reshape( + (state_vector.shape[0], self.nO, self.nP)) + state_vector, mask = self.ops.maxout(state_vector) - def backprop_maxout(d_best, sgd=None): - return self.ops.backprop_maxout(d_best, which, self.nP) - return best, backprop_maxout + def backprop_nonlinearity(d_best, sgd=None): + if self.nP == 1: + return d_best * mask + else: + return self.ops.backprop_maxout(d_best, mask, self.nP) + return state_vector, backprop_nonlinearity cdef void sum_state_features(float* output, @@ -262,13 +265,8 @@ cdef class Parser: tok2vec = Tok2Vec(token_vector_width, embed_size, pretrained_dims=cfg.get('pretrained_dims', 0)) tok2vec = chain(tok2vec, flatten) - if parser_maxout_pieces >= 2: - lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class, - nF=cls.nr_feature, nP=parser_maxout_pieces, - nI=token_vector_width) - else: - lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, - nF=cls.nr_feature, nI=token_vector_width) + lower = PrecomputableAffine(hidden_width * parser_maxout_pieces, + nF=cls.nr_feature, nI=token_vector_width) with Model.use_device('cpu'): upper = chain( From 827cd8a883397f00ab110a6356cb579742b6a52f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 03:07:17 +0200 Subject: [PATCH 06/28] Fix support of maxout pieces in parser --- spacy/syntax/nn_parser.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 10a79750b..465e4d877 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -116,8 +116,8 @@ cdef class precompute_hiddens: else: cached = gpu_cached self.nF = cached.shape[1] - self.nO = cached.shape[2] self.nP = getattr(lower_model, 'nP', 1) + self.nO = cached.shape[2] // self.nP self.ops = lower_model.ops self.bias = lower_model.b self._is_synchronized = False @@ -174,7 +174,8 @@ cdef class precompute_hiddens: if self.nP == 1: return d_best * mask else: - return self.ops.backprop_maxout(d_best, mask, self.nP) + d_vector = self.ops.backprop_maxout(d_best, mask, self.nP) + return d_vector.reshape((d_vector.shape[0], self.nO*self.nP)) return state_vector, backprop_nonlinearity @@ -267,6 +268,7 @@ cdef class Parser: tok2vec = chain(tok2vec, flatten) lower = PrecomputableAffine(hidden_width * parser_maxout_pieces, nF=cls.nr_feature, nI=token_vector_width) + lower.nP = parser_maxout_pieces with Model.use_device('cpu'): upper = chain( From 64658e02e5d5d32a2d4f2174801ee96294b2c769 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 03:07:45 +0200 Subject: [PATCH 07/28] Implement fancier initialisation for precomputed layer --- spacy/_ml.py | 64 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 1f504ec4a..1f0bfa5b6 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -13,7 +13,8 @@ from thinc.api import uniqued, wrap, flatten_add_lengths, noop from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.util import get_array_module +from thinc.neural.util import get_array_module, copy_array +from thinc.neural._lsuv import svd_orthonormal import random import cytoolz @@ -22,6 +23,7 @@ from thinc import 
describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp +from thinc.neural._lsuv import svd_orthonormal from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER from .tokens.doc import Doc @@ -102,22 +104,14 @@ def _preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None -def _init_for_precomputed(W, ops): - if (W**2).sum() != 0.: - return - W = W.reshape((W.shape[0] * W.shape[1], W.shape[2])) - ops.xavier_uniform_init(W, inplace=True) - return W - - -@describe.on_data(_set_dimensions_if_needed) +@describe.on_data(_set_dimensions_if_needed, + lambda model, X, y: model.init_weights(model)) @describe.attributes( nI=Dimension("Input size"), nF=Dimension("Number of features"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nI, obj.nF, obj.nO), - lambda W, ops: _init_for_precomputed(W, ops)), + lambda obj: (obj.nI, obj.nF, obj.nO)), b=Biases("Bias vector", lambda obj: (obj.nO,)), d_W=Gradient("W"), @@ -173,6 +167,52 @@ class PrecomputableAffine(Model): weights = self.ops.xp.ascontiguousarray(weights) return weights.reshape(shape) + @staticmethod + def init_weights(model): + '''This is like the 'layer sequential unit variance', but instead + of taking the actual inputs, we randomly generate whitened data. + + Why's this all so complicated? We have a huge number of inputs, + and the maxout unit makes guessing the dynamics tricky. Instead + we set the maxout weights to values that empirically result in + whitened outputs given whitened inputs. + ''' + if (model.W**2).sum() != 0.: + return + model.ops.normal_init(model.W, model.nFI, inplace=True) + + ids = numpy.zeros((5000, model.nF), dtype='i') + ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i') + tokvecs = numpy.zeros((5000, model.nI), dtype='f') + tokvecs += numpy.random.normal(loc=0., scale=1., + size=tokvecs.size).reshape(tokvecs.shape) + + def predict(ids, tokvecs): + hiddens = model(tokvecs) + vector = model.ops.allocate((hiddens.shape[0], model.nO)) + model.ops.scatter_add(vector, ids, hiddens) + vector += model.b + if model.nP >= 2: + vector = vector.reshape((ids.shape[0], model.nO//model.nP, model.nP)) + return model.ops.maxout(vector)[0] + else: + return vector * (vector >= 0) + + tol_var = 0.01 + tol_mean = 0.01 + t_max = 10 + t_i = 0 + for t_i in range(t_max): + acts1 = predict(ids, tokvecs) + var = numpy.var(acts1) + mean = numpy.mean(acts1) + if abs(var - 1.0) >= tol_var: + model.W /= numpy.sqrt(var) + elif abs(mean) >= tol_mean: + model.b -= mean + else: + break + # Thinc's Embed class is a bit broken atm, so drop this here. 
from thinc import describe From b10173655589b038ba1e69e937eddf03819dc94d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 12:14:52 +0200 Subject: [PATCH 08/28] Fix precomputed layer --- spacy/_ml.py | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 1f0bfa5b6..934832a63 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -111,7 +111,7 @@ def _preprocess_doc(docs, drop=0.): nF=Dimension("Number of features"), nO=Dimension("Output size"), W=Synapses("Weights matrix", - lambda obj: (obj.nI, obj.nF, obj.nO)), + lambda obj: (obj.nF, obj.nO, obj.nI)), b=Biases("Bias vector", lambda obj: (obj.nO,)), d_W=Gradient("W"), @@ -124,37 +124,20 @@ class PrecomputableAffine(Model): self.nI = nI self.nF = nF - @property - def nFI(self): - return self.nI * self.nF - - @property - def nFO(self): - return self.nF * self.nO - def begin_update(self, X, drop=0.): - nN = X.shape[0] - # X: (b, i) - # Xf: (b, f, i) - # Yf: (b, f, o) - # dY: (b, o) - # dYf: (b, f, o) - # W: (i, f, o) - W = self.W.reshape((self.nI, self.nFO)) - Yf = self.ops.xp.dot(X, W) - Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO)) - #Yf = einsum('ab,bc->ac', X, W) + tensordot = self.ops.xp.tensordot + ascontiguous = self.ops.xp.ascontiguousarray + + Yf = tensordot(X, self.W, axes=[[1], [2]]) + def backward(dY_ids, sgd=None): dY, ids = dY_ids Xf = X[ids] - # bo,fi_o->b_if -> b_fi - W_o_fi = self._transpose(self.W, shape=(self.nO, self.nFI)) - dXf = self.ops.xp.dot(dY, W_o_fi).reshape((Xf.shape[0], self.nF, self.nI)) - # bo,b_fi->o_fi - dW = Xf.reshape((Xf.shape[0], self.nFI)) - dW = self.ops.xp.dot(Xf.T, dY) - dW = dW.reshape((self.nO, self.nF, self.nI)) - self.d_W += dW.transpose((2, 1, 0)) + + dXf = tensordot(dY, self.W, axes=[[1], [1]]) + dW = tensordot(dY, Xf, axes=[[0], [0]]) + + self.d_W += dW.transpose((1, 0, 2)) self.d_b += dY.sum(axis=0) if sgd is not None: @@ -162,11 +145,6 @@ class PrecomputableAffine(Model): return dXf return Yf, backward - def _transpose(self, weights, shape): - weights = weights.transpose((2, 1, 0)) - weights = self.ops.xp.ascontiguousarray(weights) - return weights.reshape(shape) - @staticmethod def init_weights(model): '''This is like the 'layer sequential unit variance', but instead @@ -179,7 +157,7 @@ class PrecomputableAffine(Model): ''' if (model.W**2).sum() != 0.: return - model.ops.normal_init(model.W, model.nFI, inplace=True) + model.ops.normal_init(model.W, model.nF * model.nI, inplace=True) ids = numpy.zeros((5000, model.nF), dtype='i') ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i') From 3faf9189a275c775fe04aafc56ac95a1cb4393b2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 16:23:31 +0200 Subject: [PATCH 09/28] Make parser hidden shape consistent even if maxout==1 --- spacy/_ml.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 934832a63..8d1b81048 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -110,17 +110,19 @@ def _preprocess_doc(docs, drop=0.): nI=Dimension("Input size"), nF=Dimension("Number of features"), nO=Dimension("Output size"), + nP=Dimension("Maxout pieces"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nI)), + lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), b=Biases("Bias vector", - lambda obj: (obj.nO,)), + lambda obj: (obj.nO, obj.nP)), d_W=Gradient("W"), d_b=Gradient("b") ) class PrecomputableAffine(Model): - def __init__(self, 
nO=None, nI=None, nF=None, **kwargs): + def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs): Model.__init__(self, **kwargs) self.nO = nO + self.nP = nP self.nI = nI self.nF = nF @@ -128,16 +130,16 @@ class PrecomputableAffine(Model): tensordot = self.ops.xp.tensordot ascontiguous = self.ops.xp.ascontiguousarray - Yf = tensordot(X, self.W, axes=[[1], [2]]) + Yf = tensordot(X, self.W, axes=[[1], [3]]) def backward(dY_ids, sgd=None): dY, ids = dY_ids Xf = X[ids] - dXf = tensordot(dY, self.W, axes=[[1], [1]]) + dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]]) dW = tensordot(dY, Xf, axes=[[0], [0]]) - - self.d_W += dW.transpose((1, 0, 2)) + # (o, p, f, i) --> (f, o, p, i) + self.d_W += dW.transpose((2, 0, 1, 3)) self.d_b += dY.sum(axis=0) if sgd is not None: @@ -167,11 +169,10 @@ class PrecomputableAffine(Model): def predict(ids, tokvecs): hiddens = model(tokvecs) - vector = model.ops.allocate((hiddens.shape[0], model.nO)) + vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP)) model.ops.scatter_add(vector, ids, hiddens) vector += model.b if model.nP >= 2: - vector = vector.reshape((ids.shape[0], model.nO//model.nP, model.nP)) return model.ops.maxout(vector)[0] else: return vector * (vector >= 0) From 10367981553bcf1b7361cdbb76bdb50ad9d06b6f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 16:24:16 +0200 Subject: [PATCH 10/28] Make parser consistent if maxout==1 --- spacy/syntax/nn_parser.pyx | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 465e4d877..f95d4e0cd 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -136,7 +136,8 @@ cdef class precompute_hiddens: return self.begin_update(X)[0] def begin_update(self, token_ids, drop=0.): - cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f') + cdef np.ndarray state_vector = numpy.zeros( + (token_ids.shape[0], self.nO, self.nP), dtype='f') # This is tricky, but (assuming GPU available); # - Input to forward on CPU # - Output from forward on CPU @@ -166,16 +167,13 @@ cdef class precompute_hiddens: mask = state_vector >= 0. 
state_vector *= mask else: - state_vector = state_vector.reshape( - (state_vector.shape[0], self.nO, self.nP)) state_vector, mask = self.ops.maxout(state_vector) def backprop_nonlinearity(d_best, sgd=None): if self.nP == 1: return d_best * mask else: - d_vector = self.ops.backprop_maxout(d_best, mask, self.nP) - return d_vector.reshape((d_vector.shape[0], self.nO*self.nP)) + return self.ops.backprop_maxout(d_best, mask, self.nP) return state_vector, backprop_nonlinearity @@ -266,8 +264,9 @@ cdef class Parser: tok2vec = Tok2Vec(token_vector_width, embed_size, pretrained_dims=cfg.get('pretrained_dims', 0)) tok2vec = chain(tok2vec, flatten) - lower = PrecomputableAffine(hidden_width * parser_maxout_pieces, - nF=cls.nr_feature, nI=token_vector_width) + lower = PrecomputableAffine(hidden_width, + nF=cls.nr_feature, nI=token_vector_width, + nP=parser_maxout_pieces) lower.nP = parser_maxout_pieces with Model.use_device('cpu'): From e7556ff048cb0c15b9fdae303852e1bb72925936 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 Oct 2017 18:16:23 +0200 Subject: [PATCH 11/28] Fix non-maxout parser --- spacy/syntax/nn_parser.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index f95d4e0cd..eb33d4a7b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -117,7 +117,7 @@ cdef class precompute_hiddens: cached = gpu_cached self.nF = cached.shape[1] self.nP = getattr(lower_model, 'nP', 1) - self.nO = cached.shape[2] // self.nP + self.nO = cached.shape[2] self.ops = lower_model.ops self.bias = lower_model.b self._is_synchronized = False @@ -150,7 +150,7 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector += self.bias.ravel() + state_vector += self.bias state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector, sgd=None): @@ -164,6 +164,7 @@ cdef class precompute_hiddens: def _nonlinearity(self, state_vector): if self.nP == 1: + state_vector = state_vector.reshape(state_vector.shape[:-1]) mask = state_vector >= 0. 
state_vector *= mask else: @@ -171,7 +172,9 @@ cdef class precompute_hiddens: def backprop_nonlinearity(d_best, sgd=None): if self.nP == 1: - return d_best * mask + d_best *= mask + d_best = d_best.reshape((d_best.shape + (1,))) + return d_best else: return self.ops.backprop_maxout(d_best, mask, self.nP) return state_vector, backprop_nonlinearity From bb25bdcd923534108691174850449f98711c6834 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 01:16:55 +0000 Subject: [PATCH 12/28] Adjust call to scatter_add for the new version --- spacy/syntax/nn_parser.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c9a4926fc..96fdbab6d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,5 +1,4 @@ # cython: infer_types=True -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # coding: utf-8 @@ -435,8 +434,7 @@ cdef class Parser: cdef int nr_hidden = hidden_weights.shape[0] cdef int nr_task = states.size() with nogil: - for i in cython.parallel.prange(nr_task, num_threads=2, - schedule='guided'): + for i in range(nr_task): self._parseC(states[i], feat_weights, bias, hW, hb, nr_class, nr_hidden, nr_feat, nr_piece) @@ -697,9 +695,10 @@ cdef class Parser: xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - mask = ids >= 0 - d_state_features *= mask.reshape(ids.shape + (1,)) - self.model[0].ops.scatter_add(d_tokvecs, ids * mask, + ids = ids.flatten() + d_state_features = d_state_features.reshape( + (ids.size, d_state_features.shape[2])) + self.model[0].ops.scatter_add(d_tokvecs, ids, d_state_features) bp_tokvecs(d_tokvecs, sgd=sgd) From 783c0c87958e0af281f346de8d1957b93000c74a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 01:17:54 +0000 Subject: [PATCH 13/28] Remove unnecessary bz2 import --- spacy/vocab.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bcd1f3c10..1a91c2c0e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import bz2 import ujson import re import numpy From b9616419e1395745ce59288d01e591d72f80f0c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 01:18:05 +0000 Subject: [PATCH 14/28] Add try/except around bz2 import --- spacy/cli/model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 14e75647e..bcc1626bc 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -1,8 +1,11 @@ # coding: utf8 from __future__ import unicode_literals -import bz2 -import gzip +try: + import bz2 + import gzip +except ImportError: + pass import math from ast import literal_eval from pathlib import Path From f6fef30adc217ed84dc658bc849cdee039663750 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:16:41 +0000 Subject: [PATCH 15/28] Remove dead code from spacy._ml --- spacy/_ml.py | 71 ++-------------------------------------------------- 1 file changed, 2 insertions(+), 69 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index b85f6ef9d..dd80e5b1a 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -348,58 +348,12 @@ def reapply(layer, n_times): return wrap(reapply_fwd, layer) - - def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None return layerize(forward) -def foreach(layer): - def 
forward(Xs, drop=0.): - results = [] - backprops = [] - for X in Xs: - result, bp = layer.begin_update(X, drop=drop) - results.append(result) - backprops.append(bp) - def backward(d_results, sgd=None): - dXs = [] - for d_result, backprop in zip(d_results, backprops): - dXs.append(backprop(d_result, sgd)) - return dXs - return results, backward - model = layerize(forward) - model._layers.append(layer) - return model - - -def rebatch(size, layer): - ops = layer.ops - def forward(X, drop=0.): - if X.shape[0] < size: - return layer.begin_update(X) - parts = _divide_array(X, size) - results, bp_results = zip(*[layer.begin_update(p, drop=drop) - for p in parts]) - y = ops.flatten(results) - def backward(dy, sgd=None): - d_parts = [bp(y, sgd=sgd) for bp, y in - zip(bp_results, _divide_array(dy, size))] - try: - dX = ops.flatten(d_parts) - except TypeError: - dX = None - except ValueError: - dX = None - return dX - return y, backward - model = layerize(forward) - model._layers.append(layer) - return model - - def _divide_array(X, size): parts = [] index = 0 @@ -508,11 +462,13 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None + def getitem(i): def getitem_fwd(X, drop=0.): return X[i], None return layerize(getitem_fwd) + def build_tagger_model(nr_class, **cfg): embed_size = util.env_opt('embed_size', 7000) if 'token_vector_width' in cfg: @@ -552,29 +508,6 @@ def SpacyVectors(docs, drop=0.): return batch, None -def foreach(layer, drop_factor=1.0): - '''Map a layer across elements in a list''' - def foreach_fwd(Xs, drop=0.): - drop *= drop_factor - ys = [] - backprops = [] - for X in Xs: - y, bp_y = layer.begin_update(X, drop=drop) - ys.append(y) - backprops.append(bp_y) - def foreach_bwd(d_ys, sgd=None): - d_Xs = [] - for d_y, bp_y in zip(d_ys, backprops): - if bp_y is not None and bp_y is not None: - d_Xs.append(d_y, sgd=sgd) - else: - d_Xs.append(None) - return d_Xs - return ys, foreach_bwd - model = wrap(foreach_fwd, layer) - return model - - def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 5000) pretrained_dims = cfg.get('pretrained_dims', 0) From 642eb28c168ae1251459bf0a8960cf68cdc1004b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:16:58 +0000 Subject: [PATCH 16/28] Don't compile with OpenMP by default --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2e2b816b7..a33826c23 100755 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ LINK_OPTIONS = { # I don't understand this very well yet. See Issue #267 # Fingers crossed! 
-USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None +USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1': if sys.platform == 'darwin': COMPILE_OPTIONS['other'].append('-fopenmp') From c9987cf131a5cc8d41437136dad1c765f20e5862 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:18:36 +0000 Subject: [PATCH 17/28] Avoid use of numpy.tensordot --- spacy/_ml.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index dd80e5b1a..de2bd4b86 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -127,24 +127,34 @@ class PrecomputableAffine(Model): self.nF = nF def begin_update(self, X, drop=0.): - tensordot = self.ops.xp.tensordot - ascontiguous = self.ops.xp.ascontiguousarray - - Yf = tensordot(X, self.W, axes=[[1], [3]]) + Yf = self.ops.dot(X, + self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T) + + Yf = Yf.reshape((X.shape[0], self.nF, self.nO, self.nP)) def backward(dY_ids, sgd=None): dY, ids = dY_ids Xf = X[ids] + Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI)) - dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]]) - dW = tensordot(dY, Xf, axes=[[0], [0]]) - # (o, p, f, i) --> (f, o, p, i) - self.d_W += dW.transpose((2, 0, 1, 3)) self.d_b += dY.sum(axis=0) + dY = dY.reshape((dY.shape[0], self.nO*self.nP)) + + Wopfi = self.W.transpose((1, 2, 0, 3)) + Wopfi = self.ops.xp.ascontiguousarray(Wopfi) + Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI)) + dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi) + + # Reuse the buffer + dWopfi = Wopfi; dWopfi.fill(0.) + self.ops.xp.dot(dY.T, Xf, out=dWopfi) + dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI)) + # (o, p, f, i) --> (f, o, p, i) + self.d_W += dWopfi.transpose((2, 0, 1, 3)) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) - return dXf + return dXf.reshape((dXf.shape[0], self.nF, self.nI)) return Yf, backward @staticmethod @@ -168,9 +178,9 @@ class PrecomputableAffine(Model): size=tokvecs.size).reshape(tokvecs.shape) def predict(ids, tokvecs): - hiddens = model(tokvecs) + hiddens = model(tokvecs) # (b, f, o, p) vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP)) - model.ops.scatter_add(vector, ids, hiddens) + model.ops.xp.add.at(vector, ids, hiddens) vector += model.b if model.nP >= 2: return model.ops.maxout(vector)[0] @@ -318,8 +328,7 @@ def Tok2Vec(width, embed_size, **kwargs): tok2vec = ( FeatureExtracter(cols) - >> with_flatten( - embed >> (convolution ** 4), pad=4) + >> with_flatten(embed >> (convolution ** 4), pad=4) ) # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 From 75a637fa439893d4d60e23a9aa3e2af241faf84a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:19:56 +0000 Subject: [PATCH 18/28] Remove redundant imports from _ml --- spacy/pipeline.pyx | 2 +- spacy/syntax/nn_parser.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7c1976dfa..685c8ee00 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,7 +42,7 @@ from .syntax import nonproj from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS -from ._ml import rebatch, Tok2Vec, flatten +from ._ml import Tok2Vec, flatten from ._ml import build_text_classifier, build_tagger_model from ._ml import link_vectors_to_models from .parts_of_speech import X diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 96fdbab6d..773ab4e63 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -47,7 +47,7 @@ from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine -from .._ml import Tok2Vec, doc2feats, rebatch +from .._ml import Tok2Vec, doc2feats from .._ml import Residual, drop_layer, flatten from .._ml import link_vectors_to_models from .._ml import HistoryFeatures From 4d048e94d3eaa88e038e56967c0bf7599d11f6ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 10:23:49 +0000 Subject: [PATCH 19/28] Add compat for thinc.neural.optimizers.Optimizer --- spacy/compat.py | 4 ++++ spacy/language.py | 11 ++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 81243ce1b..31b33e771 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -30,6 +30,10 @@ try: except ImportError: cupy = None +try: + from thinc.optimizers import Optimizer +except ImportError: + from thinc.optimizers import Adam as Optimizer pickle = pickle copy_reg = copy_reg diff --git a/spacy/language.py b/spacy/language.py index 933ca772d..adc2860eb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,7 +3,6 @@ from __future__ import absolute_import, unicode_literals from contextlib import contextmanager from thinc.neural import Model -from thinc.neural.optimizers import Adam import random import ujson from collections import OrderedDict @@ -21,6 +20,7 @@ from .syntax.parser import get_templates from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer +from .compat import Optimizer from .compat import json_dumps, izip, copy_reg from .scorer import Scorer from ._ml import link_vectors_to_models @@ -359,7 +359,8 @@ class Language(object): return if sgd is None: if self._optimizer is None: - self._optimizer = Adam(Model.ops, 0.001) + self._optimizer = Optimizer(Model.ops, 0.001, + beta1=0.9, beta2=0.0, nesterov=True) sgd = self._optimizer grads = {} def get_grads(W, dW, key=None): @@ -400,8 +401,8 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) 
- self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps) + self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps, nesterov=True) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer @@ -440,7 +441,7 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) - self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device From 19a2b9bf27f768a2c3f8c8033b1679e950b493a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 27 Oct 2017 12:33:42 +0000 Subject: [PATCH 20/28] Fix import of Optimizer --- spacy/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 31b33e771..8dd3d6b03 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -31,9 +31,9 @@ except ImportError: cupy = None try: - from thinc.optimizers import Optimizer + from thinc.neural.optimizers import Optimizer except ImportError: - from thinc.optimizers import Adam as Optimizer + from thinc.neural.optimizers import Adam as Optimizer pickle = pickle copy_reg = copy_reg From fb0c96f39a1c3f8a2cec8844effab950c6503088 Mon Sep 17 00:00:00 2001 From: Explosion Bot Date: Sat, 28 Oct 2017 11:58:16 +0200 Subject: [PATCH 21/28] Fix optimizer loading --- spacy/language.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 1b52829b6..959fee916 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -379,8 +379,7 @@ class Language(object): return if sgd is None: if self._optimizer is None: - self._optimizer = Optimizer(Model.ops, 0.001, - beta1=0.9, beta2=0.0, nesterov=True) + self._optimizer = Adam(Model.ops, 0.001) sgd = self._optimizer grads = {} def get_grads(W, dW, key=None): @@ -422,7 +421,7 @@ class Language(object): L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) 
self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps, nesterov=True) + beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer From df4803cc6deedbbb19eff179f46a058753b95b98 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 16:45:14 +0000 Subject: [PATCH 22/28] Add learned missing values for parser --- spacy/_ml.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index de89e04d0..c956de601 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -88,7 +88,11 @@ def _preprocess_doc(docs, drop=0.): lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)), + pad=Synapses("Pad", + lambda obj: (1, obj.nF, obj.nO, obj.nP), + lambda M, ops: ops.normal_init(M, 1.)), d_W=Gradient("W"), + d_pad=Gradient("pad"), d_b=Gradient("b")) class PrecomputableAffine(Model): def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs): @@ -99,13 +103,14 @@ class PrecomputableAffine(Model): self.nF = nF def begin_update(self, X, drop=0.): - Yf = self.ops.dot(X, - self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T) - - Yf = Yf.reshape((X.shape[0], self.nF, self.nO, self.nP)) + Yf = self.ops.xp.dot(X, + self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T) + Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP)) + Yf = self._add_padding(Yf) def backward(dY_ids, sgd=None): dY, ids = dY_ids + dY, ids = self._backprop_padding(dY, ids) Xf = X[ids] Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI)) @@ -116,7 +121,7 @@ class PrecomputableAffine(Model): Wopfi = self.ops.xp.ascontiguousarray(Wopfi) Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI)) dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi) - + # Reuse the buffer dWopfi = Wopfi; dWopfi.fill(0.) 
self.ops.xp.dot(dY.T, Xf, out=dWopfi) @@ -128,6 +133,17 @@ class PrecomputableAffine(Model): sgd(self._mem.weights, self._mem.gradient, key=self.id) return dXf.reshape((dXf.shape[0], self.nF, self.nI)) return Yf, backward + + def _add_padding(self, Yf): + Yf_padded = self.ops.xp.vstack((self.pad, Yf)) + return Yf_padded[1:] + + def _backprop_padding(self, dY, ids): + for i in range(ids.shape[0]): + for j in range(ids.shape[1]): + if ids[i, j] < 0: + self.d_pad[0, j] += dY[i, j] + return dY, ids @staticmethod def init_weights(model): From 5414e2f14b7c0dbcbcec08b1d7a101c5521491e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 16:45:54 +0000 Subject: [PATCH 23/28] Use missing features in parser --- spacy/syntax/nn_parser.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 1aa4443d0..558e88b3e 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -157,12 +157,14 @@ cdef void sum_state_features(float* output, const float* cached, const int* token_ids, int B, int F, int O) nogil: cdef int idx, b, f, i cdef const float* feature + padding = cached - (F * O) for b in range(B): for f in range(F): if token_ids[f] < 0: - continue - idx = token_ids[f] * F * O + f*O - feature = &cached[idx] + feature = &padding[f*O] + else: + idx = token_ids[f] * F * O + f*O + feature = &cached[idx] for i in range(O): output[i] += feature[i] output += O From 6ef72864fa23199a837e9197db8005f059255cce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 17:05:01 +0000 Subject: [PATCH 24/28] Improve initialization for hidden layers --- spacy/_ml.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index c956de601..018589537 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -166,14 +166,18 @@ class PrecomputableAffine(Model): size=tokvecs.size).reshape(tokvecs.shape) def predict(ids, tokvecs): - hiddens = model(tokvecs) # (b, f, o, p) - vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP)) - model.ops.xp.add.at(vector, ids, hiddens) - vector += model.b + # nS ids. nW tokvecs + hiddens = model(tokvecs) # (nW, f, o, p) + # need nS vectors + vectors = model.ops.allocate((ids.shape[0], model.nO, model.nP)) + for i, feats in enumerate(ids): + for j, id_ in enumerate(feats): + vectors[i] += hiddens[id_, j] + vectors += model.b if model.nP >= 2: - return model.ops.maxout(vector)[0] + return model.ops.maxout(vectors)[0] else: - return vector * (vector >= 0) + return vectors * (vectors >= 0) tol_var = 0.01 tol_mean = 0.01 From 3b910973213fb2d7d99be52e772561e980dd7b0c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 17:05:11 +0000 Subject: [PATCH 25/28] Whitespace --- spacy/_ml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 018589537..c99f840b7 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -241,9 +241,11 @@ def Tok2Vec(width, embed_size, **kwargs): tok2vec = ( FeatureExtracter(cols) - >> with_flatten(embed >> (convolution ** 4), pad=4) + >> with_flatten( + embed + >> convolution ** 4, pad=4 + ) ) - # Work around thinc API limitations :(. 
TODO: Revise in Thinc 7 tok2vec.nO = width tok2vec.embed = embed From 314f5b9cdbcbaa0d188bd1a21402d6cfd890b534 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 18:20:10 +0000 Subject: [PATCH 26/28] Require thinc 6.10.0 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0b46b38d5..01e41c993 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pathlib numpy>=1.7 cymem>=1.30,<1.32 preshed>=1.0.0,<2.0.0 -thinc>=6.9.0,<6.10.0 +thinc>=6.10.0,<6.11.0 murmurhash>=0.28,<0.29 plac<1.0.0,>=0.9.6 six diff --git a/setup.py b/setup.py index 37bfd0495..727df5e4e 100755 --- a/setup.py +++ b/setup.py @@ -190,7 +190,7 @@ def setup_package(): 'murmurhash>=0.28,<0.29', 'cymem>=1.30,<1.32', 'preshed>=1.0.0,<2.0.0', - 'thinc>=6.9.0,<6.10.0', + 'thinc>=6.10.0,<6.11.0', 'plac<1.0.0,>=0.9.6', 'six', 'pathlib', From b713d10d970a570d61eb553ea8e055974b36c949 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 23:01:14 +0000 Subject: [PATCH 27/28] Switch to 13 features in parser --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 558e88b3e..e480bd1dc 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -680,7 +680,7 @@ cdef class Parser: lower, stream, drop=0.0) return (tokvecs, bp_tokvecs), state2vec, upper - nr_feature = 8 + nr_feature = 13 def get_token_ids(self, states): cdef StateClass state From a0c7dabb722d0985e0f53f09561e10092125ae69 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 28 Oct 2017 23:01:35 +0000 Subject: [PATCH 28/28] Fix bug in 8-token parser features --- spacy/syntax/_state.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 803348b53..5470df470 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -110,7 +110,7 @@ cdef cppclass StateC: ids[3] = this.S(1) ids[4] = this.H(this.S(0)) ids[5] = this.L(this.B(0), 1) - ids[6] = this.L(this.S(0), 2) + ids[6] = this.L(this.S(0), 1) ids[7] = this.R(this.S(0), 1) elif n == 13: ids[0] = this.B(0)
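
Read together, the series reworks the parser's precomputed hidden layer: PrecomputableAffine caches one affine output per (token, feature-slot) pair, each state's hidden vector is built by summing the cached slices for that state's token ids, the bias is added after the summation (patch 03), a learned padding vector is summed in for missing tokens instead of skipping them (patches 22 and 23), and the result goes through maxout over nP pieces, or a plain ReLU when nP == 1. The sketch below restates that forward pass in plain NumPy as a reading aid only; the function name state_hidden and the exact array layout are illustrative assumptions, not spaCy's API — the corresponding code is sum_state_features and precompute_hiddens._nonlinearity in spacy/syntax/nn_parser.pyx and PrecomputableAffine in spacy/_ml.py.

import numpy as np

def state_hidden(cached, pad, bias, token_ids, nP):
    # cached:    (nW, nF, nO * nP)  precomputed affine outputs, one per token and feature slot
    # pad:       (nF, nO * nP)      learned vectors used when a feature's token is missing
    # bias:      (nO * nP,)
    # token_ids: (nF,)              indices into cached; a negative id marks a missing token
    nF, nOP = pad.shape
    summed = np.zeros((nOP,), dtype="f")
    for f, idx in enumerate(token_ids):
        # A missing token contributes its learned padding vector rather than being skipped.
        summed += pad[f] if idx < 0 else cached[idx, f]
    summed += bias
    if nP == 1:
        # Single piece: plain ReLU, matching the nr_piece == 1 branch in _parseC.
        return np.maximum(summed, 0.0)
    # Maxout over the nP pieces for each of the nO hidden units.
    return summed.reshape((-1, nP)).max(axis=1)

For example, with the 13-feature template from patch 27 and two maxout pieces:

nW, nF, nO, nP = 10, 13, 64, 2
cached = np.random.randn(nW, nF, nO * nP).astype("f")
pad = np.random.randn(nF, nO * nP).astype("f")
bias = np.zeros((nO * nP,), dtype="f")
ids = np.array([0, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, -1, 9], dtype="i")
vec = state_hidden(cached, pad, bias, ids, nP)   # one hidden vector of shape (nO,)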