diff --git a/requirements.txt b/requirements.txt
index e2a1860ea..e095d04fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ cython>=0.25
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc==7.0.0.dev0
+thinc==7.0.0.dev1
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0
diff --git a/setup.py b/setup.py
index 0bf48f709..3c7bd51d8 100755
--- a/setup.py
+++ b/setup.py
@@ -200,7 +200,7 @@ def setup_package():
         "murmurhash>=0.28.0,<1.1.0",
         "cymem>=2.0.2,<2.1.0",
         "preshed>=2.0.1,<2.1.0",
-        "thinc==7.0.0.dev0",
+        "thinc==7.0.0.dev1",
         "blis>=0.2.2,<0.3.0",
         "plac<1.0.0,>=0.9.6",
         "ujson>=1.35",
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 679b1aef6..0cfdec7e9 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -48,11 +48,11 @@ def cosine(vec1, vec2):
 
 def create_default_optimizer(ops, **cfg):
     learn_rate = util.env_opt('learn_rate', 0.001)
-    beta1 = util.env_opt('optimizer_B1', 0.9)
-    beta2 = util.env_opt('optimizer_B2', 0.9)
-    eps = util.env_opt('optimizer_eps', 1e-12)
+    beta1 = util.env_opt('optimizer_B1', 0.8)
+    beta2 = util.env_opt('optimizer_B2', 0.8)
+    eps = util.env_opt('optimizer_eps', 0.00001)
     L2 = util.env_opt('L2_penalty', 1e-6)
-    max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+    max_grad_norm = util.env_opt('grad_norm_clip', 5.)
     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
                      beta2=beta2, eps=eps)
     optimizer.max_grad_norm = max_grad_norm
@@ -445,11 +445,11 @@ def getitem(i):
 
 
 def build_tagger_model(nr_class, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
+    embed_size = util.env_opt('embed_size', 2000)
     if 'token_vector_width' in cfg:
         token_vector_width = cfg['token_vector_width']
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
     pretrained_vectors = cfg.get('pretrained_vectors')
     subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index f46d41452..44f3f3174 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -24,10 +24,12 @@ import sys
 from collections import Counter
 
 import spacy
-from spacy.attrs import ID
+from spacy.tokens import Doc
+from spacy.attrs import ID, HEAD
 from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
 from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
 from thinc.v2v import Affine
+from thinc.api import wrap
 
 
 def prefer_gpu():
@@ -47,13 +49,14 @@ def load_texts(path):
     '''
     path = ensure_path(path)
     with path.open('r', encoding='utf8') as file_:
-        texts = [json.loads(line)['text'] for line in file_]
+        texts = [json.loads(line) for line in file_]
     random.shuffle(texts)
     return texts
 
+
 def stream_texts():
     for line in sys.stdin:
-        yield json.loads(line)['text']
+        yield json.loads(line)
 
 
 def make_update(model, docs, optimizer, drop=0.):
@@ -65,11 +68,33 @@ def make_update(model, docs, optimizer, drop=0.):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    loss, gradients = get_vectors_loss(model.ops, docs, predictions)
+    gradients = get_vectors_loss(model.ops, docs, predictions)
     backprop(gradients, sgd=optimizer)
+    # Don't want to return a cupy object here
+    # The gradients are modified in-place by the BERT MLM,
+    # so we get an accurate loss
+    loss = float((gradients**2).mean())
     return loss
 
 
+def make_docs(nlp, batch):
+    docs = []
+    for record in batch:
+        text = record["text"]
+        if "tokens" in record:
+            doc = Doc(nlp.vocab, words=record["tokens"])
+        else:
+            doc = nlp.make_doc(text)
+        if "heads" in record:
+            heads = record["heads"]
+            heads = numpy.asarray(heads, dtype="uint64")
+            heads = heads.reshape((len(doc), 1))
+            doc = doc.from_array([HEAD], heads)
+        if len(doc) >= 1 and len(doc) < 200:
+            docs.append(doc)
+    return docs
+
+
 def get_vectors_loss(ops, docs, prediction):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.
@@ -84,10 +109,8 @@
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
-    d_scores = (prediction - target) / prediction.shape[0]
-    # Don't want to return a cupy object here
-    loss = float((d_scores**2).sum())
-    return loss, d_scores
+    d_scores = prediction - target
+    return d_scores
 
 
 def create_pretraining_model(nlp, tok2vec):
@@ -107,15 +130,77 @@ def create_pretraining_model(nlp, tok2vec):
         tok2vec,
         output_layer
     )
+    model = masked_language_model(nlp.vocab, model)
     model.tok2vec = tok2vec
     model.output_layer = output_layer
     model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
     return model
 
 
+def masked_language_model(vocab, model, mask_prob=0.15):
+    '''Convert a model into a BERT-style masked language model'''
+    vocab_words = [lex.text for lex in vocab if lex.prob != 0.0]
+    vocab_probs = [lex.prob for lex in vocab if lex.prob != 0.0]
+    vocab_words = vocab_words[:10000]
+    vocab_probs = vocab_probs[:10000]
+    vocab_probs = numpy.exp(numpy.array(vocab_probs, dtype='f'))
+    vocab_probs /= vocab_probs.sum()
+
+    def mlm_forward(docs, drop=0.):
+        mask, docs = apply_mask(docs, vocab_words, vocab_probs,
+                                mask_prob=mask_prob)
+        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
+        output, backprop = model.begin_update(docs, drop=drop)
+
+        def mlm_backward(d_output, sgd=None):
+            d_output *= 1-mask
+            return backprop(d_output, sgd=sgd)
+
+        return output, mlm_backward
+
+    return wrap(mlm_forward, model)
+
+
+def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
+    N = sum(len(doc) for doc in docs)
+    mask = numpy.random.uniform(0., 1.0, (N,))
+    mask = mask >= mask_prob
+    i = 0
+    masked_docs = []
+    for doc in docs:
+        words = []
+        for token in doc:
+            if not mask[i]:
+                word = replace_word(token.text, vocab_texts, vocab_probs)
+            else:
+                word = token.text
+            words.append(word)
+            i += 1
+        spaces = [bool(w.whitespace_) for w in doc]
+        # NB: If you change this implementation to instead modify
+        # the docs in place, take care that the IDs reflect the original
+        # words. Currently we use the original docs to make the vectors
+        # for the target, so we don't lose the original tokens. But if
+        # you modified the docs in place here, you would.
+        masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
+    return mask, masked_docs
+
+
+def replace_word(word, vocab_texts, vocab_probs, mask='[MASK]'):
+    roll = random.random()
+    if roll < 0.8:
+        return mask
+    elif roll < 0.9:
+        index = numpy.random.choice(len(vocab_texts), p=vocab_probs)
+        return vocab_texts[index]
+    else:
+        return word
+
+
 class ProgressTracker(object):
     def __init__(self, frequency=100000):
-        self.loss = 0.
+        self.loss = 0.0
+        self.prev_loss = 0.0
         self.nr_word = 0
         self.words_per_epoch = Counter()
         self.frequency = frequency
@@ -132,7 +217,15 @@ class ProgressTracker(object):
             wps = words_since_update / (time.time() - self.last_time)
             self.last_update = self.nr_word
             self.last_time = time.time()
-            status = (epoch, self.nr_word, '%.5f' % self.loss, int(wps))
+            loss_per_word = self.loss - self.prev_loss
+            status = (
+                epoch,
+                self.nr_word,
+                "%.5f" % self.loss,
+                "%.4f" % loss_per_word,
+                int(wps),
+            )
+            self.prev_loss = float(self.loss)
             return status
         else:
             return None
@@ -145,12 +238,13 @@ class ProgressTracker(object):
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Embedding rows", "option", "er", int),
+    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
    dropout=("Dropout", "option", "d", float),
    seed=("Seed for random number generators", "option", "s", float),
    nr_iter=("Number of iterations to pretrain", "option", "i", int),
 )
 def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
-             embed_rows=1000, dropout=0.2, nr_iter=10, seed=0):
+             embed_rows=5000, use_vectors=False, dropout=0.2, nr_iter=100, seed=0):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load
@@ -175,11 +269,13 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
     with (output_dir / 'config.json').open('w') as file_:
         file_.write(json.dumps(config))
     has_gpu = prefer_gpu()
+    print("Use GPU?", has_gpu)
     nlp = spacy.load(vectors_model)
+    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
     model = create_pretraining_model(nlp,
                 Tok2Vec(width, embed_rows,
                         conv_depth=depth,
-                        pretrained_vectors=nlp.vocab.vectors.name,
+                        pretrained_vectors=pretrained_vectors,
                         bilstm_depth=0, # Requires PyTorch. Experimental.
                         cnn_maxout_pieces=2, # You can try setting this higher
                         subword_features=True)) # Set to False for character models, e.g. Chinese
@@ -188,19 +284,19 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
     print('Epoch', '#Words', 'Loss', 'w/s')
     texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
     for epoch in range(nr_iter):
-        for batch in minibatch(texts, size=64):
-            docs = [nlp.make_doc(text) for text in batch]
+        for batch in minibatch(texts, size=256):
+            docs = make_docs(nlp, batch)
             loss = make_update(model, docs, optimizer, drop=dropout)
             progress = tracker.update(epoch, loss, docs)
             if progress:
                 print(*progress)
-                if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**6:
+                if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
                     break
         with model.use_params(optimizer.averages):
            with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
                file_.write(model.tok2vec.to_bytes())
            with (output_dir / 'log.jsonl').open('a') as file_:
                file_.write(json.dumps({'nr_word': tracker.nr_word,
-                                       'loss': tracker.loss, 'epoch': epoch}))
+                                       'loss': tracker.loss, 'epoch': epoch}) + '\n')
         if texts_loc != '-':
             texts = load_texts(texts_loc)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 01aebfae8..01c8cb199 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -90,11 +90,11 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
-                                  util.env_opt('dropout_to', 0.2),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
+                                  util.env_opt('dropout_to', 0.1),
                                   util.env_opt('dropout_decay', 0.0))
-    batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
-                                   util.env_opt('batch_to', 1000),
+    batch_sizes = util.compounding(util.env_opt('batch_from', 750),
+                                   util.env_opt('batch_to', 750),
                                    util.env_opt('batch_compound', 1.001))
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 20a319f5d..63d8e0733 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -25,6 +25,7 @@ from .compat import json_dumps
 
 from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
 
+
 def tags_to_entities(tags):
     entities = []
     start = None
@@ -110,19 +111,23 @@ class GoldCorpus(object):
         # Write temp directory with one doc per file, so we can shuffle
         # and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
-        self.write_msgpack(self.tmp_dir / 'train', train)
-        self.write_msgpack(self.tmp_dir / 'dev', dev)
+        self.write_msgpack(self.tmp_dir / 'train', train, limit=self.limit)
+        self.write_msgpack(self.tmp_dir / 'dev', dev, limit=self.limit)
 
     def __del__(self):
         shutil.rmtree(self.tmp_dir)
 
     @staticmethod
-    def write_msgpack(directory, doc_tuples):
+    def write_msgpack(directory, doc_tuples, limit=0):
         if not directory.exists():
             directory.mkdir()
+        n = 0
         for i, doc_tuple in enumerate(doc_tuples):
             with open(directory / '{}.msg'.format(i), 'wb') as file_:
-                msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
+                msgpack.dump([doc_tuple], file_, use_bin_type=True)
+            n += len(doc_tuple[1])
+            if limit and n >= limit:
+                break
 
     @staticmethod
     def walk_corpus(path):
@@ -153,7 +158,7 @@ class GoldCorpus(object):
                 gold_tuples = read_json_file(loc)
             elif loc.parts[-1].endswith('msg'):
                 with loc.open('rb') as file_:
-                    gold_tuples = msgpack.load(file_, encoding='utf8')
+                    gold_tuples = msgpack.load(file_, raw=False)
             else:
                 msg = "Cannot read from file: %s. Supported formats: .json, .msg"
                 raise ValueError(msg % loc)
@@ -350,7 +355,7 @@ def _json_iterate(loc):
                 py_str = py_raw[start : i+1].decode('utf8')
                 try:
                     yield json.loads(py_str)
-                except:
+                except Exception:
                     print(py_str)
                     raise
                 start = -1
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index f7c4ec4e0..e2a244080 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -759,7 +759,7 @@ class Tagger(Pipe):
             if self.model is True:
                 token_vector_width = util.env_opt(
                     'token_vector_width',
-                    self.cfg.get('token_vector_width', 128))
+                    self.cfg.get('token_vector_width', 96))
                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
             self.model.from_bytes(b)
 
@@ -878,7 +878,7 @@ class MultitaskObjective(Tagger):
 
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
         softmax = Softmax(n_tags, token_vector_width)
         model = chain(
             tok2vec,
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 82e87ae61..0cecdb93b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -63,9 +63,9 @@ cdef class Parser:
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
-                                          cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 5000))
+                                          cfg.get('token_vector_width', 96))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,
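Note (not part of the patch): the new `apply_mask`/`replace_word` helpers in `spacy/cli/pretrain.py` implement BERT-style corruption, and `masked_language_model` zeroes the gradient at uncorrupted positions (`d_output *= 1-mask`), so only corrupted tokens drive the update. Below is a minimal, self-contained sketch of just the corruption step on a plain word list; the function name `corrupt_tokens` and the toy vocabulary are illustrative only, not part of the patch.

import random
import numpy

def corrupt_tokens(words, vocab_words, vocab_probs, mask_prob=0.15, mask_token='[MASK]'):
    # Mirrors apply_mask/replace_word: each token is kept with probability
    # (1 - mask_prob); a corrupted token becomes the mask token 80% of the
    # time, a frequency-sampled vocabulary word 10% of the time, and stays
    # unchanged the remaining 10%.
    keep = numpy.random.uniform(0.0, 1.0, (len(words),)) >= mask_prob
    corrupted = []
    for word, keep_word in zip(words, keep):
        if keep_word:
            corrupted.append(word)
            continue
        roll = random.random()
        if roll < 0.8:
            corrupted.append(mask_token)
        elif roll < 0.9:
            index = numpy.random.choice(len(vocab_words), p=vocab_probs)
            corrupted.append(vocab_words[index])
        else:
            corrupted.append(word)
    return keep, corrupted

# Toy vocabulary and unigram probabilities (illustrative only; must sum to 1).
vocab_words = ['the', 'a', 'dog', 'cat', 'sat']
vocab_probs = numpy.array([0.4, 0.3, 0.1, 0.1, 0.1])
keep, corrupted = corrupt_tokens('the cat sat on the mat'.split(), vocab_words, vocab_probs)
print(list(keep), corrupted)

In the patch itself, the returned keep-mask plays the role of `mask` in `mlm_backward`: gradients at kept positions are multiplied by zero, so the loss that `make_update` now computes from the in-place-modified gradients reflects only the predictions at corrupted tokens.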