diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 8a418aded..0ae1b19df 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -87,7 +87,7 @@ cdef class precompute_hiddens:
     we can do all our hard maths up front, packed into large multiplications,
     and do the hard-to-program parsing on the CPU.
     '''
-    cdef int nF, nO
+    cdef int nF, nO, nP
     cdef bint _is_synchronized
     cdef public object ops
     cdef np.ndarray _features
@@ -107,8 +107,9 @@ cdef class precompute_hiddens:
             cached = gpu_cached
         self.nF = cached.shape[1]
         self.nO = cached.shape[2]
+        self.nP = getattr(lower_model, 'nP', 1)
         self.ops = lower_model.ops
-        self._features = numpy.zeros((batch_size, self.nO), dtype='f')
+        self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -138,9 +139,12 @@ cdef class precompute_hiddens:
         cdef int[:, ::1] ids = token_ids
         sum_state_features(state_vector.data,
             feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO)
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
         def backward(d_state_vector, sgd=None):
+            if bp_nonlinearity is not None:
+                d_state_vector = bp_nonlinearity(d_state_vector, sgd)
             # This will usually be on GPU
             if isinstance(d_state_vector, numpy.ndarray):
                 d_state_vector = self.ops.xp.array(d_state_vector)
@@ -148,6 +152,15 @@
             return d_tokens
         return state_vector, backward
 
+    def _nonlinearity(self, state_vector):
+        if self.nP == 1:
+            return state_vector, None
+        best, which = self.ops.maxout(state_vector, self.nP)
+        def backprop(d_best, sgd=None):
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop
+
+
 cdef void sum_state_features(float* output, const float* cached,
         const int* token_ids, int B, int F, int O) nogil:
     cdef int idx, b, f, i
@@ -220,9 +233,16 @@ cdef class Parser:
         depth = util.env_opt('parser_hidden_depth', depth)
         token_vector_width = util.env_opt('token_vector_width', token_vector_width)
         hidden_width = util.env_opt('hidden_width', hidden_width)
-        lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
-                                    nF=cls.nr_feature,
-                                    nI=token_vector_width)
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
+        if parser_maxout_pieces == 1:
+            lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
+                                        nF=cls.nr_feature,
+                                        nI=token_vector_width)
+        else:
+            lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
+                                         nF=cls.nr_feature,
+                                         nP=parser_maxout_pieces,
+                                         nI=token_vector_width)
         with Model.use_device('cpu'):
             if depth == 0:
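
For reference, the new `_nonlinearity` step reduces the widened (batch, nO*nP) state vectors back to (batch, nO) by taking a maximum over the nP pieces, which is why `_features` is now allocated at `self.nO*self.nP`. Below is a minimal NumPy sketch of what the `ops.maxout`/`ops.backprop_maxout` pair computes. It assumes the nP pieces for each output unit are stored contiguously within a row; the function names mirror the calls in the diff, but this is an illustration, not thinc's actual implementation:

    import numpy

    def maxout(X, nP):
        # (B, nO*nP) -> (B, nO, nP): group the pieces belonging to each
        # output unit, then keep the best piece and record which one won.
        B = X.shape[0]
        pieces = X.reshape((B, X.shape[1] // nP, nP))
        which = pieces.argmax(axis=-1)   # (B, nO) indices of winning pieces
        best = pieces.max(axis=-1)       # (B, nO) winning values
        return best, which

    def backprop_maxout(d_best, which, nP):
        # Route each gradient back to the piece that won the forward max;
        # the losing pieces receive zero gradient.
        B, nO = d_best.shape
        d_pieces = numpy.zeros((B, nO, nP), dtype=d_best.dtype)
        rows = numpy.arange(B)[:, None]
        cols = numpy.arange(nO)[None, :]
        d_pieces[rows, cols, which] = d_best
        return d_pieces.reshape((B, nO * nP))

This matches how `_nonlinearity` uses the pair: the forward pass returns `(best, which)`, and the `backprop` closure carries `which` so the gradient flows only through the winning piece. The payoff is that the expensive precomputed part stays a single large affine transform (just nP times wider), while the parser still gets a trainable piecewise-linear activation.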