diff --git a/spacy/_ml.py b/spacy/_ml.py
index 1018a9c46..173917a36 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -134,13 +134,14 @@ def Tok2Vec(width, embed_size, preprocess=None):
         shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
         tok2vec = (
-            flatten
-            >> (lower | prefix | suffix | shape )
-            >> Maxout(width, width*4, pieces=3)
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+            with_flatten(
+                (lower | prefix | suffix | shape )
+                >> Maxout(width, width*4, pieces=3)
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
+            pad=4, ndim=5)
         )
         if preprocess not in (False, None):
             tok2vec = preprocess >> tok2vec
 
diff --git a/spacy/language.py b/spacy/language.py
index 1e4ae1474..6538b9e27 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -179,10 +179,10 @@ class Language(object):
         tok2vec = self.pipeline[0]
         feats = tok2vec.doc2feats(docs)
         for proc in self.pipeline[1:]:
-            tokvecs, bp_tokvecs = tok2vec.model.begin_update(feats, drop=drop)
             grads = {}
-            d_tokvecs = proc.update((docs, tokvecs), golds, sgd=get_grads, drop=drop)
-            bp_tokvecs(d_tokvecs, sgd=get_grads)
+            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
+            d_tokvecses = proc.update((docs, tokvecses), golds, sgd=get_grads, drop=drop)
+            bp_tokvecses(d_tokvecses, sgd=get_grads)
         if sgd is not None:
             for key, (W, dW) in grads.items():
                 # TODO: Unhack this when thinc improves
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 4cbb666c0..09e79d67d 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -10,7 +10,7 @@ cimport numpy as np
 import cytoolz
 import util
 
-from thinc.api import add, layerize, chain, clone, concatenate
+from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.util import to_categorical
@@ -52,16 +52,16 @@ class TokenVectorEncoder(object):
         self.doc2feats = doc2feats()
         self.model = model
 
-    def __call__(self, docs, state=None):
+    def __call__(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
-        tokvecs = self.predict(docs)
-        self.set_annotations(docs, tokvecs)
+        tokvecses = self.predict(docs)
+        self.set_annotations(docs, tokvecses)
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
-            tokvecs = self.predict(docs)
-            self.set_annotations(docs, tokvecs)
+            tokvecses = self.predict(docs)
+            self.set_annotations(docs, tokvecses)
             yield from docs
 
     def predict(self, docs):
@@ -69,11 +69,9 @@ class TokenVectorEncoder(object):
         feats = self.doc2feats(docs)
         tokvecs = self.model(feats)
         return tokvecs
-    def set_annotations(self, docs, tokvecs):
-        start = 0
-        for doc in docs:
-            doc.tensor = tokvecs[start : start + len(doc)]
-            start += len(doc)
+    def set_annotations(self, docs, tokvecses):
+        for doc, tokvecs in zip(docs, tokvecses):
+            doc.tensor = tokvecs
 
     def begin_update(self, docs, drop=0.):
         if isinstance(docs, Doc):
@@ -136,7 +134,7 @@ class NeuralTagger(object):
         docs, tokvecs = docs_tokvecs
 
         if self.model.nI is None:
-            self.model.nI = tokvecs.shape[1]
+            self.model.nI = tokvecs[0].shape[1]
         tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
 
@@ -146,6 +144,7 @@ class NeuralTagger(object):
         return d_tokvecs
 
     def get_loss(self, docs, golds, scores):
+        scores = self.model.ops.flatten(scores)
         tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
 
         cdef int idx = 0
@@ -161,7 +160,7 @@ class NeuralTagger(object):
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         loss = (d_scores**2).sum()
-        d_scores = self.model.ops.asarray(d_scores, dtype='f')
+        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
     def begin_training(self, gold_tuples, pipeline=None):
@@ -179,9 +178,8 @@ class NeuralTagger(object):
         vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                       vocab.morphology.lemmatizer)
         token_vector_width = pipeline[0].model.nO
-        self.model = rebatch(1024, Softmax(self.vocab.morphology.n_tags,
-                                           token_vector_width))
-        #self.model = Softmax(self.vocab.morphology.n_tags)
+        self.model = with_flatten(
+            Softmax(self.vocab.morphology.n_tags, token_vector_width))
 
     def use_params(self, params):
         with self.model.use_params(params):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 97685bf4d..32c761be6 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -311,7 +311,8 @@ cdef class Parser:
         return states
 
     def update(self, docs_tokvecs, golds, drop=0., sgd=None):
-        docs, tokvecs = docs_tokvecs
+        docs, tokvec_lists = docs_tokvecs
+        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
@@ -324,7 +325,8 @@ cdef class Parser:
         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs,
                                                      cuda_stream, drop)
 
-        todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
+        todo = [(s, g) for (s, g) in zip(states, golds)
+                if not s.is_final()]
 
         backprops = []
         cdef float loss = 0.
@@ -365,7 +367,7 @@ cdef class Parser:
         else:
             xp.add.at(d_tokvecs,
                       token_ids, d_state_features * active_feats)
-        return d_tokvecs
+        return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
 
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
         lower, upper = self.model
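
Reviewer note on the pattern above: after this change, pipeline components exchange a list of per-Doc arrays ("tokvecses", one (n_tokens, width) array per Doc) instead of one concatenated matrix. Each component flattens the list where it needs a single matrix (NeuralTagger.get_loss, Parser.update) and unflattens outputs and gradients on the way back out, which is why set_annotations no longer tracks offsets and why the [len(d) for d in docs] unflatten calls appear. The sketch below is a minimal numpy-only illustration of that list-in/list-out contract, inferred from the diff itself; it is not thinc's with_flatten implementation, it ignores the pad/ndim arguments, and the names flatten, unflatten and with_flatten_sketch are illustrative only.

import numpy as np

def flatten(arrays):
    # Join per-doc (n_tokens_i, width) arrays into one (sum(n_tokens), width) matrix.
    return np.concatenate(arrays, axis=0)

def unflatten(flat, lengths):
    # Inverse of flatten(): split the matrix back into per-doc arrays.
    out = []
    start = 0
    for length in lengths:
        out.append(flat[start : start + length])
        start += length
    return out

def with_flatten_sketch(begin_update):
    # Wrap a layer that works on one flat (N, nI) matrix so it takes and
    # returns lists of per-doc arrays, mirroring how the wrapped tok2vec
    # model now yields "tokvecses" and hands gradients back per doc.
    def wrapped(seqs, drop=0.):
        lengths = [len(seq) for seq in seqs]
        flat_out, bp_flat = begin_update(flatten(seqs), drop=drop)
        def backprop(d_seqs, sgd=None):
            # Gradients arrive per doc; flatten, backprop, unflatten again.
            d_flat = bp_flat(flatten(d_seqs), sgd=sgd)
            return None if d_flat is None else unflatten(d_flat, lengths)
        return unflatten(flat_out, lengths), backprop
    return wrapped

# Toy inner layer: a linear map on the flat token matrix.
W = np.eye(4, dtype='f')

def linear_begin_update(X, drop=0.):
    def backprop(dY, sgd=None):
        return dY @ W  # gradient of X @ W.T with respect to X
    return X @ W.T, backprop

model = with_flatten_sketch(linear_begin_update)
docs = [np.ones((3, 4), dtype='f'), np.ones((5, 4), dtype='f')]  # two fake docs
tokvecses, bp_tokvecses = model(docs)
assert [len(t) for t in tokvecses] == [3, 5]    # one array per doc, as set_annotations() expects
d_tokvecses = bp_tokvecses([np.ones_like(t) for t in tokvecses])
assert [len(d) for d in d_tokvecses] == [3, 5]  # gradients come back per doc, as update() returns

The benefit of this bookkeeping, as the set_annotations() change shows, is that consumers of the tok2vec output no longer need to know how documents were concatenated; the flatten/unflatten pair stays local to whichever component needs a single matrix.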