Support optional maxout layer

2017-05-23 05:58:07 -05:00 · 2017-05-23 05:58:07 -05:00 · a8b6d11c5b
parent c55b8fa7c5
commit a8b6d11c5b
1 changed files with 26 additions and 6 deletions
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -87,7 +87,7 @@ cdef class precompute_hiddens:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.
    '''
-    cdef int nF, nO
+    cdef int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
    cdef np.ndarray _features
@ -107,8 +107,9 @@ cdef class precompute_hiddens:
            cached = gpu_cached
        self.nF = cached.shape[1]
        self.nO = cached.shape[2]
+        self.nP = getattr(lower_model, 'nP', 1)
        self.ops = lower_model.ops
-        self._features = numpy.zeros((batch_size, self.nO), dtype='f')
+        self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
@ -138,9 +139,12 @@ cdef class precompute_hiddens:
        cdef int[:, ::1] ids = token_ids
        sum_state_features(<float*>state_vector.data,
            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO)
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector, sgd=None):
+            if bp_nonlinearity is not None:
+                d_state_vector = bp_nonlinearity(d_state_vector, sgd)
            # This will usually be on GPU
            if isinstance(d_state_vector, numpy.ndarray):
                d_state_vector = self.ops.xp.array(d_state_vector)
@ -148,6 +152,15 @@ cdef class precompute_hiddens:
            return d_tokens
        return state_vector, backward

+    def _nonlinearity(self, state_vector):
+        if self.nP == 1:
+            return state_vector, None
+        best, which = self.ops.maxout(state_vector, self.nP)
+        def backprop(d_best, sgd=None):
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop
+
+
 cdef void sum_state_features(float* output,
        const float* cached, const int* token_ids, int B, int F, int O) nogil:
    cdef int idx, b, f, i
@ -220,9 +233,16 @@ cdef class Parser:
        depth = util.env_opt('parser_hidden_depth', depth)
        token_vector_width = util.env_opt('token_vector_width', token_vector_width)
        hidden_width = util.env_opt('hidden_width', hidden_width)
-        lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
-                    nF=cls.nr_feature,
-                    nI=token_vector_width)
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
+        if parser_maxout_pieces == 1:
+            lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
+                        nF=cls.nr_feature,
+                        nI=token_vector_width)
+        else:
+            lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
+                        nF=cls.nr_feature,
+                        nP=parser_maxout_pieces,
+                        nI=token_vector_width)

        with Model.use_device('cpu'):
            if depth == 0: