diff --git a/spacy/_ml.py b/spacy/_ml.py
index 91b530fad..c49bad6d4 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -9,7 +9,7 @@ import cytoolz
 
 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm
+from thinc.neural._classes.batchnorm import BatchNorm as BN
 from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
 from thinc.neural import ReLu
@@ -22,6 +22,7 @@ from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
 from thinc.neural._classes.attention import ParametricAttention
 from thinc.linear.linear import LinearModel
 from thinc.api import uniqued, wrap, flatten_add_lengths
+from thinc.neural._classes.rnn import BiLSTM
 
 
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
@@ -229,14 +230,14 @@ def Tok2Vec(width, embed_size, preprocess=None):
         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
         shape = get_col(cols.index(SHAPE))   >> HashEmbed(width, embed_size//2, name='embed_shape')
 
-        embed = (norm | prefix | suffix | shape ) >> Maxout(width, width*4, pieces=3)
+        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
                 >> uniqued(embed, column=5)
                 >> drop_layer(
                     Residual(
-                        (ExtractWindow(nW=1) >> ReLu(width, width*3))
+                        (ExtractWindow(nW=1) >> BN(Maxout(width, width*3)))
                     )
                 ) ** 4, pad=4
             )