Support optional maxout layer

This commit is contained in:
Matthew Honnibal 2017-05-23 05:58:07 -05:00
parent c55b8fa7c5
commit a8b6d11c5b
1 changed file with 26 additions and 6 deletions

View File

@ -87,7 +87,7 @@ cdef class precompute_hiddens:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
'''
cdef int nF, nO
cdef int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
cdef np.ndarray _features
@ -107,8 +107,9 @@ cdef class precompute_hiddens:
cached = gpu_cached
self.nF = cached.shape[1]
self.nO = cached.shape[2]
self.nP = getattr(lower_model, 'nP', 1)
self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO), dtype='f')
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
@ -138,9 +139,12 @@ cdef class precompute_hiddens:
cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO)
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector, sgd=None):
if bp_nonlinearity is not None:
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
# This will usually be on GPU
if isinstance(d_state_vector, numpy.ndarray):
d_state_vector = self.ops.xp.array(d_state_vector)
@ -148,6 +152,15 @@ cdef class precompute_hiddens:
return d_tokens
return state_vector, backward
def _nonlinearity(self, state_vector):
    """Apply the optional maxout step to the summed state features.

    When the layer has a single piece (``self.nP == 1``) the input is
    handed back untouched and no backward callback is needed, which is
    signalled by returning ``None`` in its place. Otherwise the vector is
    passed through ``self.ops.maxout`` and a callback is returned that
    routes gradients back through ``self.ops.backprop_maxout``.

    Returns a ``(output, backprop_or_None)`` pair.
    """
    if self.nP != 1:
        # `selected` is whatever bookkeeping ops.maxout returns as its
        # second value; it is only ever forwarded to backprop_maxout.
        pooled, selected = self.ops.maxout(state_vector, self.nP)

        def bp_maxout(d_best, sgd=None):
            # `sgd` is accepted for interface parity with other backward
            # callbacks; it is not used here.
            return self.ops.backprop_maxout(d_best, selected, self.nP)

        return pooled, bp_maxout
    # Single piece: nothing to pool over, so this is a pass-through.
    return state_vector, None
cdef void sum_state_features(float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
@ -220,9 +233,16 @@ cdef class Parser:
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nI=token_vector_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nI=token_vector_width)
else:
lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nP=parser_maxout_pieces,
nI=token_vector_width)
with Model.use_device('cpu'):
if depth == 0: