Support optional maxout layer

Matthew Honnibal 2017-05-23 05:58:07 -05:00
parent c55b8fa7c5
commit a8b6d11c5b
1 changed file with 26 additions and 6 deletions


@@ -87,7 +87,7 @@ cdef class precompute_hiddens:
     we can do all our hard maths up front, packed into large multiplications,
     and do the hard-to-program parsing on the CPU.
     '''
-    cdef int nF, nO
+    cdef int nF, nO, nP
     cdef bint _is_synchronized
     cdef public object ops
     cdef np.ndarray _features
@@ -107,8 +107,9 @@ cdef class precompute_hiddens:
         cached = gpu_cached
         self.nF = cached.shape[1]
         self.nO = cached.shape[2]
+        self.nP = getattr(lower_model, 'nP', 1)
         self.ops = lower_model.ops
-        self._features = numpy.zeros((batch_size, self.nO), dtype='f')
+        self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
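
A note on the buffer sizing above: with nP maxout pieces, the lower model emits nP candidate values per output unit, so the per-state feature buffer grows from nO to nO*nP floats. A minimal sketch of the shapes, with made-up sizes:

import numpy

# Made-up sizes for illustration; the names mirror the diff above.
batch_size, nF, nO, nP = 32, 13, 64, 2

# Each state row holds nP candidate activations per output unit; the
# maxout nonlinearity later reduces each group of nP down to one value.
features = numpy.zeros((batch_size, nO * nP), dtype='f')
assert features.shape == (32, 128)
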
@@ -138,9 +139,12 @@ cdef class precompute_hiddens:
         cdef int[:, ::1] ids = token_ids
         sum_state_features(<float*>state_vector.data,
             feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO)
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
         def backward(d_state_vector, sgd=None):
+            if bp_nonlinearity is not None:
+                d_state_vector = bp_nonlinearity(d_state_vector, sgd)
             # This will usually be on GPU
             if isinstance(d_state_vector, numpy.ndarray):
                 d_state_vector = self.ops.xp.array(d_state_vector)
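
For context, the call above fills state_vector by summing rows of the pre-computed table before the new nonlinearity is applied. A rough numpy analogue of what sum_state_features computes, assuming cached is laid out as (n_tokens, nF, nO*nP) and that negative token ids mark missing features (both inferred from the signature and call site, since the function body is not part of this diff):

import numpy

def sum_state_features_py(cached, token_ids):
    # cached: (n_tokens, nF, O), where O == nO*nP after this change;
    # token_ids: (batch, nF), with -1 meaning "feature absent".
    B, F = token_ids.shape
    output = numpy.zeros((B, cached.shape[2]), dtype='f')
    for b in range(B):
        for f in range(F):
            idx = token_ids[b, f]
            if idx >= 0:
                output[b] += cached[idx, f]
    return output

The backward closure then undoes the nonlinearity first (bp_nonlinearity) before backpropagating through the feature summation.
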
@@ -148,6 +152,15 @@ cdef class precompute_hiddens:
             return d_tokens
         return state_vector, backward
 
+    def _nonlinearity(self, state_vector):
+        if self.nP == 1:
+            return state_vector, None
+        best, which = self.ops.maxout(state_vector, self.nP)
+        def backprop(d_best, sgd=None):
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop
+
+
 cdef void sum_state_features(float* output,
         const float* cached, const int* token_ids, int B, int F, int O) nogil:
     cdef int idx, b, f, i
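
The new _nonlinearity method delegates the actual maxout computation to the ops object. A rough numpy sketch of what ops.maxout and ops.backprop_maxout compute, assuming the nP pieces for each output unit are packed contiguously (an assumption about the memory layout, not something this diff states):

import numpy

def maxout(state_vector, nP):
    # (batch, nO*nP) -> (batch, nO, nP): group the nP pieces per unit,
    # then keep the best piece and remember which one won.
    pieces = state_vector.reshape((state_vector.shape[0], -1, nP))
    which = pieces.argmax(axis=-1)
    best = pieces.max(axis=-1)
    return best, which

def backprop_maxout(d_best, which, nP):
    # Route each unit's gradient back to the piece that won the forward
    # pass; the losing pieces receive zero gradient.
    d_pieces = numpy.zeros(d_best.shape + (nP,), dtype=d_best.dtype)
    rows, cols = numpy.indices(d_best.shape)
    d_pieces[rows, cols, which] = d_best
    return d_pieces.reshape((d_best.shape[0], -1))

When nP == 1 the method returns the vector untouched with a None backprop callback, so the single-piece path pays no maxout overhead.
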
@@ -220,9 +233,16 @@ cdef class Parser:
         depth = util.env_opt('parser_hidden_depth', depth)
         token_vector_width = util.env_opt('token_vector_width', token_vector_width)
         hidden_width = util.env_opt('hidden_width', hidden_width)
-        lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
-                                    nF=cls.nr_feature,
-                                    nI=token_vector_width)
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
+        if parser_maxout_pieces == 1:
+            lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
+                                        nF=cls.nr_feature,
+                                        nI=token_vector_width)
+        else:
+            lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
+                                         nF=cls.nr_feature,
+                                         nP=parser_maxout_pieces,
+                                         nI=token_vector_width)
         with Model.use_device('cpu'):
             if depth == 0:
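
The two halves of the change dovetail here: precompute_hiddens reads the piece count with getattr(lower_model, 'nP', 1), so when parser_maxout_pieces == 1 and the plain PrecomputableAffine is selected (which presumably exposes no nP attribute), the maxout path collapses to a no-op. A hypothetical illustration of that fallback contract, with stand-in classes:

# Stand-ins only; not the real thinc layer classes.
class FakeAffine:            # plays the role of PrecomputableAffine
    pass

class FakeMaxouts:           # plays the role of PrecomputableMaxouts
    def __init__(self, nP):
        self.nP = nP

print(getattr(FakeAffine(), 'nP', 1))     # -> 1: _nonlinearity is a no-op
print(getattr(FakeMaxouts(3), 'nP', 1))   # -> 3: three pieces per unit
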