mirror of https://github.com/explosion/spaCy.git

Merge pull request #5543 from svlandeg/feature/pretrain-config
pretrain from config

Commit: 8411d4f4e6
@@ -25,6 +25,7 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
vectors = null
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -32,7 +33,7 @@ start = 1000
stop = 1000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
@@ -113,3 +114,4 @@ window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
@@ -0,0 +1,137 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
vectors = null
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"

[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[pretraining.loss_func]
@losses = "CosineDistance.v1"
normalize = true

[nlp]
lang = "en"
vectors = ${training:vectors}

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.senter]
factory = "senter"

[nlp.pipeline.ner]
factory = "ner"

[nlp.pipeline.tagger]
factory = "tagger"

[nlp.pipeline.parser]
factory = "parser"

[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
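For context, the [pretraining] block above is what the reworked pretrain command reads through spaCy's config system. A minimal sketch of that flow, assuming the file is saved as pretrain.cfg (the name is a placeholder) and using util.load_config the same way the command itself does in this diff:

from pathlib import Path
from spacy import util

config_path = Path("pretrain.cfg")

# Read the raw settings first (no object creation) to fix the seed early,
# mirroring what the CLI does before it builds anything.
raw_config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(raw_config["pretraining"]["seed"])

# Then load again with create_objects=True so registered functions such as
# "Adam.v1" and "CosineDistance.v1" are resolved into real objects.
config = util.load_config(config_path, create_objects=True)
pretrain_config = config["pretraining"]
optimizer = pretrain_config["optimizer"]
loss_func = pretrain_config["loss_func"]

# tok2vec_model is a dotted path ("nlp.pipeline.tok2vec.model") into the
# created config; walk it to find the layer that will be pre-trained.
tok2vec = config
for subpath in pretrain_config["tok2vec_model"].split("."):
    tok2vec = tok2vec.get(subpath)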
@@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -21,7 +22,7 @@ start = 100
stop = 1000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -65,3 +66,4 @@ depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null
@@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -21,7 +22,7 @@ start = 100
stop = 1000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -66,3 +67,4 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null
@@ -12,8 +12,9 @@ max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -36,6 +37,7 @@ nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null

[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"
@@ -11,6 +11,7 @@ gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -19,7 +20,7 @@ stop = 3000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -44,3 +45,4 @@ maxout_pieces = 3
window_size = 1
subword_features = true
pretrained_vectors = null
dropout = null
@ -1,212 +0,0 @@
|
|||
"""This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pretrained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
"""
|
||||
import plac
|
||||
import tqdm
|
||||
import random
|
||||
|
||||
import ml_datasets
|
||||
|
||||
import spacy
|
||||
from spacy.util import minibatch
|
||||
from spacy.pipeline import TextCategorizer
|
||||
from spacy.ml.models.tok2vec import build_Tok2Vec_model
|
||||
import numpy
|
||||
|
||||
|
||||
def load_texts(limit=0):
|
||||
train, dev = ml_datasets.imdb()
|
||||
train_texts, train_labels = zip(*train)
|
||||
dev_texts, dev_labels = zip(*train)
|
||||
train_texts = list(train_texts)
|
||||
dev_texts = list(dev_texts)
|
||||
random.shuffle(train_texts)
|
||||
random.shuffle(dev_texts)
|
||||
if limit >= 1:
|
||||
return train_texts[:limit]
|
||||
else:
|
||||
return list(train_texts) + list(dev_texts)
|
||||
|
||||
|
||||
def load_textcat_data(limit=0):
|
||||
"""Load data from the IMDB dataset."""
|
||||
# Partition off part of the train data for evaluation
|
||||
train_data, eval_data = ml_datasets.imdb()
|
||||
random.shuffle(train_data)
|
||||
train_data = train_data[-limit:]
|
||||
texts, labels = zip(*train_data)
|
||||
eval_texts, eval_labels = zip(*eval_data)
|
||||
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
|
||||
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
|
||||
return (texts, cats), (eval_texts, eval_cats)
|
||||
|
||||
|
||||
def prefer_gpu():
|
||||
used = spacy.util.use_gpu(0)
|
||||
if used is None:
|
||||
return False
|
||||
else:
|
||||
import cupy.random
|
||||
|
||||
cupy.random.seed(0)
|
||||
return True
|
||||
|
||||
|
||||
def build_textcat_model(tok2vec, nr_class, width):
|
||||
from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
|
||||
|
||||
with Model.define_operators({">>": chain}):
|
||||
model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> reduce_mean()
|
||||
>> Softmax(nr_class, width)
|
||||
)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
return model
|
||||
|
||||
|
||||
def block_gradients(model):
|
||||
from thinc.api import wrap # TODO FIX
|
||||
|
||||
def forward(X, drop=0.0):
|
||||
Y, _ = model.begin_update(X, drop=drop)
|
||||
return Y, None
|
||||
|
||||
return wrap(forward, model)
|
||||
|
||||
|
||||
def create_pipeline(width, embed_size, vectors_model):
|
||||
print("Load vectors")
|
||||
nlp = spacy.load(vectors_model)
|
||||
print("Start training")
|
||||
textcat = TextCategorizer(
|
||||
nlp.vocab,
|
||||
labels=["POSITIVE", "NEGATIVE"],
|
||||
# TODO: replace with config version
|
||||
model=build_textcat_model(
|
||||
build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
|
||||
),
|
||||
)
|
||||
|
||||
nlp.add_pipe(textcat)
|
||||
return nlp
|
||||
|
||||
|
||||
def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||
tensorizer = nlp.create_pipe("tensorizer")
|
||||
nlp.add_pipe(tensorizer)
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
|
||||
docs = [nlp.make_doc(text) for text in batch]
|
||||
tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
|
||||
print(losses)
|
||||
return optimizer
|
||||
|
||||
|
||||
def train_textcat(nlp, n_texts, n_iter=10):
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||
print(
|
||||
"Using {} examples ({} training, {} evaluation)".format(
|
||||
n_texts, len(train_texts), len(dev_texts)
|
||||
)
|
||||
)
|
||||
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
|
||||
|
||||
with nlp.select_pipes(enable="textcat"): # only train textcat
|
||||
optimizer = nlp.begin_training()
|
||||
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
|
||||
print("Training the model...")
|
||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||
for i in range(n_iter):
|
||||
losses = {"textcat": 0.0}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(tqdm.tqdm(train_data), size=2)
|
||||
for batch in batches:
|
||||
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
# evaluate on the dev data split off in load_data()
|
||||
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
||||
print(
|
||||
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
|
||||
losses["textcat"],
|
||||
scores["textcat_p"],
|
||||
scores["textcat_r"],
|
||||
scores["textcat_f"],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||
docs = (tokenizer(text) for text in texts)
|
||||
tp = 1e-8
|
||||
fp = 1e-8
|
||||
tn = 1e-8
|
||||
fn = 1e-8
|
||||
for i, doc in enumerate(textcat.pipe(docs)):
|
||||
gold = cats[i]
|
||||
for label, score in doc.cats.items():
|
||||
if label not in gold:
|
||||
continue
|
||||
if score >= 0.5 and gold[label] >= 0.5:
|
||||
tp += 1.0
|
||||
elif score >= 0.5 and gold[label] < 0.5:
|
||||
fp += 1.0
|
||||
elif score < 0.5 and gold[label] < 0.5:
|
||||
tn += 1
|
||||
elif score < 0.5 and gold[label] >= 0.5:
|
||||
fn += 1
|
||||
precision = tp / (tp + fp)
|
||||
recall = tp / (tp + fn)
|
||||
f_score = 2 * (precision * recall) / (precision + recall)
|
||||
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
width=("Width of CNN layers", "positional", None, int),
|
||||
embed_size=("Embedding rows", "positional", None, int),
|
||||
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
||||
train_iters=("Number of iterations to pretrain", "option", "tn", int),
|
||||
train_examples=("Number of labelled examples", "option", "eg", int),
|
||||
vectors_model=("Name or path to vectors model to learn from"),
|
||||
)
|
||||
def main(
|
||||
width,
|
||||
embed_size,
|
||||
vectors_model,
|
||||
pretrain_iters=30,
|
||||
train_iters=30,
|
||||
train_examples=1000,
|
||||
):
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
use_gpu = prefer_gpu()
|
||||
print("Using GPU?", use_gpu)
|
||||
|
||||
nlp = create_pipeline(width, embed_size, vectors_model)
|
||||
print("Load data")
|
||||
texts = load_texts(limit=0)
|
||||
print("Train tensorizer")
|
||||
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
|
||||
print("Train textcat")
|
||||
train_textcat(nlp, train_examples, n_iter=train_iters)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@@ -2,16 +2,15 @@ if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import download, link, info, package, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_from_config_cli
from spacy.cli import train_cli

commands = {
"download": download,
"link": link,
"info": info,
"train": train,
"train-from-config": train_from_config_cli,
"train": train_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,
@@ -4,8 +4,7 @@ from .download import download  # noqa: F401
from .info import info  # noqa: F401
from .package import package  # noqa: F401
from .profile import profile  # noqa: F401
from .train import train  # noqa: F401
from .train_from_config import train_from_config_cli  # noqa: F401
from .train_from_config import train_cli  # noqa: F401
from .pretrain import pretrain  # noqa: F401
from .debug_data import debug_data  # noqa: F401
from .evaluate import evaluate  # noqa: F401
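With the command table above, the config-driven training entry point now lives under the plain "train" name: train_cli replaces train_from_config_cli, and the separate "train-from-config" alias is gone. A minimal, purely illustrative sketch of the resulting imports as a downstream script might use them:

# Illustrative only: these names follow the renames shown in the diff above.
from spacy.cli import train_cli, pretrain

commands = {"train": train_cli, "pretrain": pretrain}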
@ -3,48 +3,39 @@ import numpy
|
|||
import time
|
||||
import re
|
||||
from collections import Counter
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..gold import Example
|
||||
from ..errors import Errors
|
||||
from ..ml.models.multi_task import build_masked_language_model
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from ..ml.models.tok2vec import build_Tok2Vec_model
|
||||
from .. import util
|
||||
from ..util import create_default_optimizer
|
||||
from .train import _load_pretrained_tok2vec
|
||||
from ..gold import Example
|
||||
|
||||
|
||||
def pretrain(
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
||||
vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
||||
output_dir: ("Directory to write models to on each epoch", "positional", None, str),
|
||||
width: ("Width of CNN layers", "option", "cw", int) = 96,
|
||||
conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4,
|
||||
bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
|
||||
cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
|
||||
sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
|
||||
use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
|
||||
cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
|
||||
embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
|
||||
loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
|
||||
use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
|
||||
dropout: ("Dropout rate", "option", "d", float) = 0.2,
|
||||
n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
|
||||
batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
|
||||
max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
|
||||
min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
|
||||
seed: ("Seed for random number generators", "option", "s", int) = 0,
|
||||
n_save_every: ("Save model every X batches.", "option", "se", int) = None,
|
||||
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
|
||||
epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
|
||||
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
||||
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path),
|
||||
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int),
|
||||
# fmt: on
|
||||
)
|
||||
def pretrain(
|
||||
texts_loc,
|
||||
vectors_model,
|
||||
config_path,
|
||||
output_dir,
|
||||
use_gpu=-1,
|
||||
resume_path=None,
|
||||
epoch_resume=None,
|
||||
):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
|
@ -58,34 +49,46 @@ def pretrain(
|
|||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
all settings are the same between pretraining and training. Ideally,
|
||||
this is done by using the same config file for both commands.
|
||||
"""
|
||||
config = dict(locals())
|
||||
for key in config:
|
||||
if isinstance(config[key], Path):
|
||||
config[key] = str(config[key])
|
||||
util.fix_random_seed(seed)
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
if has_gpu:
|
||||
import torch
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
util.use_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["pretraining"]["seed"])
|
||||
if config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
||||
use_pytorch_for_gpu_memory()
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
msg.warn(
|
||||
"Output directory is not empty",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if resume_path:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"If you're resuming a run from a previous model in this directory, "
|
||||
"the old models for the consecutive epochs will be overwritten "
|
||||
"with the new ones.",
|
||||
)
|
||||
else:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
pretrain_config = config["pretraining"]
|
||||
|
||||
# Load texts from file or stdin
|
||||
if texts_loc != "-": # reading from a file
|
||||
|
@ -99,57 +102,50 @@ def pretrain(
|
|||
msg.good("Loaded input texts")
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.text("Reading input text from stdin...")
|
||||
msg.info("Reading input text from stdin...")
|
||||
texts = srsly.read_jsonl("-")
|
||||
|
||||
with msg.loading(f"Loading model '{vectors_model}'..."):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good(f"Loaded model '{vectors_model}'")
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
# TODO: replace with config
|
||||
build_Tok2Vec_model(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=conv_depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||
subword_features=not use_chars, # Set to False for Chinese etc
|
||||
maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
||||
window_size=1,
|
||||
char_embed=False,
|
||||
nM=64,
|
||||
nC=8,
|
||||
),
|
||||
)
|
||||
# Load in pretrained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text(f"Loaded pretrained tok2vec for: {components}")
|
||||
tok2vec_path = pretrain_config["tok2vec_model"]
|
||||
tok2vec = config
|
||||
for subpath in tok2vec_path.split("."):
|
||||
tok2vec = tok2vec.get(subpath)
|
||||
model = create_pretraining_model(nlp, tok2vec)
|
||||
optimizer = pretrain_config["optimizer"]
|
||||
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_start = int(model_name.group(0)[5:][:-4]) + 1
|
||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
if not epoch_start:
|
||||
if not epoch_resume:
|
||||
msg.fail(
|
||||
"You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec",
|
||||
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
|
||||
exits=True,
|
||||
)
|
||||
elif epoch_start < 0:
|
||||
elif epoch_resume < 0:
|
||||
msg.fail(
|
||||
f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid",
|
||||
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
|
||||
exits=True,
|
||||
)
|
||||
else:
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
|
||||
epoch_start = 0
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
|
||||
optimizer = create_default_optimizer()
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
|
||||
|
@ -168,28 +164,27 @@ def pretrain(
|
|||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
skip_counter = 0
|
||||
for epoch in range(epoch_start, n_iter + epoch_start):
|
||||
for batch_id, batch in enumerate(
|
||||
util.minibatch_by_words(
|
||||
(Example(doc=text) for text in texts), size=batch_size
|
||||
)
|
||||
):
|
||||
loss_func = pretrain_config["loss_func"]
|
||||
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
|
||||
examples = [Example(doc=text) for text in texts]
|
||||
batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"])
|
||||
for batch_id, batch in enumerate(batches):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
[text for (text, _) in batch],
|
||||
max_length=max_length,
|
||||
min_length=min_length,
|
||||
[ex.doc for ex in batch],
|
||||
max_length=pretrain_config["max_length"],
|
||||
min_length=pretrain_config["min_length"],
|
||||
)
|
||||
skip_counter += count
|
||||
loss = make_update(
|
||||
model, docs, optimizer, objective=loss_func, drop=dropout
|
||||
)
|
||||
loss = make_update(model, docs, optimizer, distance=loss_func)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
if n_save_every and (batch_id % n_save_every == 0):
|
||||
if pretrain_config["n_save_every"] and (
|
||||
batch_id % pretrain_config["n_save_every"] == 0
|
||||
):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
|
@ -201,17 +196,17 @@ def pretrain(
|
|||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
|
||||
def make_update(model, docs, optimizer, distance):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
drop (float): The dropout rate.
|
||||
optimizer (callable): An optimizer.
|
||||
RETURNS loss: A float for the loss.
|
||||
"""
|
||||
predictions, backprop = model.begin_update(docs, drop=drop)
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
|
||||
backprop(gradients, sgd=optimizer)
|
||||
predictions, backprop = model.begin_update(docs)
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance)
|
||||
backprop(gradients)
|
||||
model.finish_update(optimizer)
|
||||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
|
@ -243,12 +238,12 @@ def make_docs(nlp, batch, min_length, max_length):
|
|||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if len(doc) >= min_length and len(doc) < max_length:
|
||||
if min_length <= len(doc) < max_length:
|
||||
docs.append(doc)
|
||||
return docs, skip_count
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
|
||||
"""Compute a mean-squared error loss between the documents' vectors and
|
||||
the prediction.
|
||||
|
||||
|
@ -262,13 +257,6 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
|||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
# TODO: this code originally didn't normalize, but shouldn't normalize=True ?
|
||||
if objective == "L2":
|
||||
distance = L2Distance(normalize=False)
|
||||
elif objective == "cosine":
|
||||
distance = CosineDistance(normalize=False)
|
||||
else:
|
||||
raise ValueError(Errors.E142.format(loss_func=objective))
|
||||
d_target, loss = distance(prediction, target)
|
||||
return loss, d_target
|
||||
|
||||
|
@ -281,7 +269,7 @@ def create_pretraining_model(nlp, tok2vec):
|
|||
"""
|
||||
output_size = nlp.vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size)
|
||||
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
|
||||
)
|
||||
# This is annoying, but the parser etc have the flatten step after
|
||||
# the tok2vec. To load the weights in cleanly, we need to match
|
||||
|
@ -289,11 +277,12 @@ def create_pretraining_model(nlp, tok2vec):
|
|||
# "tok2vec" has to be the same set of processes as what the components do.
|
||||
tok2vec = chain(tok2vec, list2array())
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = build_masked_language_model(nlp.vocab, model)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
model.set_ref("output_layer", output_layer)
|
||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return model
|
||||
mlm_model = build_masked_language_model(nlp.vocab, model)
|
||||
mlm_model.set_ref("tok2vec", tok2vec)
|
||||
mlm_model.set_ref("output_layer", output_layer)
|
||||
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return mlm_model
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
|
|
|
@ -13,6 +13,7 @@ import random
|
|||
from ..gold import GoldCorpus
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..ml import models # don't remove - required to load the built-in architectures
|
||||
|
||||
registry = util.registry
|
||||
|
||||
|
@ -123,7 +124,7 @@ class ConfigSchema(BaseModel):
|
|||
use_gpu=("Use GPU", "option", "g", int),
|
||||
# fmt: on
|
||||
)
|
||||
def train_from_config_cli(
|
||||
def train_cli(
|
||||
train_path,
|
||||
dev_path,
|
||||
config_path,
|
||||
|
@ -132,7 +133,7 @@ def train_from_config_cli(
|
|||
raw_text=None,
|
||||
debug=False,
|
||||
verbose=False,
|
||||
use_gpu=-1
|
||||
use_gpu=-1,
|
||||
):
|
||||
"""
|
||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||
|
@ -156,7 +157,7 @@ def train_from_config_cli(
|
|||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
train_from_config(
|
||||
train(
|
||||
config_path,
|
||||
{"train": train_path, "dev": dev_path},
|
||||
output_path=output_path,
|
||||
|
@ -165,10 +166,11 @@ def train_from_config_cli(
|
|||
)
|
||||
|
||||
|
||||
def train_from_config(
|
||||
def train(
|
||||
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
||||
):
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
# Read the config first without creating objects, to get to the original nlp_config
|
||||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["training"]["seed"])
|
||||
if config["training"]["use_pytorch_for_gpu_memory"]:
|
||||
|
@ -177,8 +179,8 @@ def train_from_config(
|
|||
config = util.load_config(config_path, create_objects=True)
|
||||
msg.info("Creating nlp from config")
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
optimizer = config["optimizer"]
|
||||
training = config["training"]
|
||||
optimizer = training["optimizer"]
|
||||
limit = training["limit"]
|
||||
msg.info("Loading training corpus")
|
||||
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||
|
@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
if len(train_examples) == 0:
|
||||
raise ValueError(Errors.E988)
|
||||
random.shuffle(train_examples)
|
||||
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
|
||||
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"])
|
||||
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
||||
try:
|
||||
first = next(batches)
|
||||
yield first
|
||||
except StopIteration:
|
||||
raise ValueError(Errors.E986)
|
||||
for batch in batches:
|
||||
yield batch
|
||||
epochs_todo -= 1
|
||||
# We intentionally compare exactly to 0 here, so that max_epochs < 1
|
||||
# will not break.
|
||||
if epochs_todo == 0:
|
||||
if epochs_todo == 0:
|
||||
break
|
||||
|
||||
|
||||
|
|
|
@@ -453,8 +453,6 @@ class Errors(object):
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
"provided {found}.")
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
"'cosine'.")
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
"call add_label()?")
E144 = ("Could not find parameter `{param}` when building the entity "
@@ -577,6 +575,8 @@ class Errors(object):

# TODO: fix numbering after merging develop into master

E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or "
"a string, but found {type} instead.")
E988 = ("Could not parse any training examples. Ensure the data is "
@@ -231,10 +231,6 @@ class Language(object):

# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
def tensorizer(self):
return self.get_pipe("tensorizer")

@property
def tagger(self):
return self.get_pipe("tagger")
@@ -0,0 +1 @@
from .models import *
@@ -2,6 +2,5 @@ from .entity_linker import *  # noqa
from .parser import *  # noqa
from .simple_ner import *
from .tagger import *  # noqa
from .tensorizer import *  # noqa
from .textcat import *  # noqa
from .tok2vec import *  # noqa
@@ -1,4 +1,6 @@
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init
import numpy

from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model

def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96):
@@ -24,6 +26,80 @@ def build_cloze_multi_task_model(vocab, tok2vec):
return model

def build_masked_language_model(*args, **kwargs):
# TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828
raise NotImplementedError
def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
"""Convert a model into a BERT-style masked language model"""

random_words = _RandomWords(vocab)

def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs)  # drop=drop

def mlm_backward(d_output):
d_output *= 1 - mask
return backprop(d_output)

return output, mlm_backward

mlm_model = Model("masked-language-model", mlm_forward, layers=[wrapped_model])
mlm_model.set_ref("wrapped-model", wrapped_model)

return mlm_model

class _RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum()
self._cache = []

def next(self):
if not self._cache:
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop()
return self.words[index]

def _apply_mask(docs, random_words, mask_prob=0.15):
# This needs to be here to avoid circular imports
from ...tokens import Doc

N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob
i = 0
masked_docs = []
for doc in docs:
words = []
for token in doc:
if not mask[i]:
word = _replace_word(token.text, random_words)
else:
word = token.text
words.append(word)
i += 1
spaces = [bool(w.whitespace_) for w in doc]
# NB: If you change this implementation to instead modify
# the docs in place, take care that the IDs reflect the original
# words. Currently we use the original docs to make the vectors
# for the target, so we don't lose the original tokens. But if
# you modified the docs in place here, you would.
masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
return mask, masked_docs

def _replace_word(word, random_words, mask="[MASK]"):
roll = numpy.random.random()
if roll < 0.8:
return mask
elif roll < 0.9:
return random_words.next()
else:
return word
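The new build_masked_language_model above is the piece the pretrain command wraps around its tok2vec layer. A small sketch of the intended usage, mirroring create_pretraining_model elsewhere in this diff; it assumes nlp is a loaded pipeline with vectors and tok2vec is a Thinc model mapping a batch of Doc objects to per-token vectors:

from thinc.api import Linear, Maxout, chain, list2array
from spacy.ml.models.multi_task import build_masked_language_model

output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
    Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
)
# Flatten the per-doc token vectors so the output layer sees a single array.
tok2vec = chain(tok2vec, list2array())
model = chain(tok2vec, output_layer)

# Roughly 15% of input tokens are masked or swapped for random words, and
# the backward pass only propagates gradients for those positions.
mlm_model = build_masked_language_model(nlp.vocab, model, mask_prob=0.15)
mlm_model.set_ref("tok2vec", tok2vec)
mlm_model.set_ref("output_layer", output_layer)
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])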
@@ -1,10 +0,0 @@
from thinc.api import Linear, zero_init

from ... import util
from ...util import registry

@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
input_size = util.env_opt("token_vector_width", input_size)
return Linear(output_size, input_size, init_W=zero_init)
@@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO

@registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
window_size, conv_depth, nO=None):
window_size, conv_depth, dropout, nO=None):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER))
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX))
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX))
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE))
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)

width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
@@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class

@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
nlp = util.load_model(pretrained_vectors)
vectors = nlp.vocab.vectors
vector_dim = vectors.data.shape[1]
@@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
>> reduce_sum()
>> residual(Relu(width, width)) ** 2
>> Linear(nO, width)
>> Dropout(0.0)
>> Logistic()
)
if dropout:
model = model >> Dropout(dropout)
model = model >> Logistic()
return model
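The text-classifier changes above (and the tok2vec architecture changes that follow) thread a dropout setting from the config down to Thinc's HashEmbed tables instead of leaving it hard-coded. A tiny sketch with illustrative sizes; "dropout = null" in the configs corresponds to passing None, which disables it:

from thinc.api import HashEmbed

embed = HashEmbed(nO=96, nV=2000, column=1, dropout=0.1)     # with dropout
no_drop = HashEmbed(nO=96, nV=2000, column=1, dropout=None)  # dropout disabled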
@ -49,6 +49,7 @@ def hash_embed_cnn(
|
|||
maxout_pieces,
|
||||
window_size,
|
||||
subword_features,
|
||||
dropout,
|
||||
):
|
||||
# Does not use character embeddings: set to False by default
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -63,6 +64,7 @@ def hash_embed_cnn(
|
|||
char_embed=False,
|
||||
nM=0,
|
||||
nC=0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
|
@ -76,6 +78,7 @@ def hash_charembed_cnn(
|
|||
window_size,
|
||||
nM,
|
||||
nC,
|
||||
dropout,
|
||||
):
|
||||
# Allows using character embeddings by setting nC, nM and char_embed=True
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -90,12 +93,13 @@ def hash_charembed_cnn(
|
|||
char_embed=True,
|
||||
nM=nM,
|
||||
nC=nC,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
|
||||
def hash_embed_bilstm_v1(
|
||||
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces
|
||||
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
|
||||
):
|
||||
# Does not use character embeddings: set to False by default
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -110,12 +114,13 @@ def hash_embed_bilstm_v1(
|
|||
char_embed=False,
|
||||
nM=0,
|
||||
nC=0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
|
||||
def hash_char_embed_bilstm_v1(
|
||||
pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC
|
||||
pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout
|
||||
):
|
||||
# Allows using character embeddings by setting nC, nM and char_embed=True
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1(
|
|||
char_embed=True,
|
||||
nM=nM,
|
||||
nC=nC,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
|
@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces):
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.MultiHashEmbed.v1")
|
||||
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
|
||||
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
||||
if use_subwords:
|
||||
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"))
|
||||
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"))
|
||||
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"))
|
||||
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
|
||||
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
|
||||
|
||||
if pretrained_vectors:
|
||||
glove = StaticVectors(
|
||||
vectors=pretrained_vectors.data,
|
||||
nO=width,
|
||||
column=columns.index(ID),
|
||||
dropout=0.0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
|
@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
|
|||
embed_layer = norm
|
||||
else:
|
||||
if use_subwords and pretrained_vectors:
|
||||
nr_columns = 5
|
||||
concat_columns = glove | norm | prefix | suffix | shape
|
||||
elif use_subwords:
|
||||
nr_columns = 4
|
||||
concat_columns = norm | prefix | suffix | shape
|
||||
else:
|
||||
nr_columns = 2
|
||||
concat_columns = glove | norm
|
||||
|
||||
embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH"))
|
||||
|
@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(columns, width, rows, nM, nC, features):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
|
||||
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
||||
chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
embed_layer = chr_embed | features >> with_array(norm)
|
||||
|
@ -238,16 +241,17 @@ def build_Tok2Vec_model(
|
|||
nC,
|
||||
conv_depth,
|
||||
bilstm_depth,
|
||||
dropout,
|
||||
) -> Model:
|
||||
if char_embed:
|
||||
subword_features = False
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM))
|
||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
|
||||
if subword_features:
|
||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX))
|
||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX))
|
||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE))
|
||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
|
||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
|
||||
else:
|
||||
prefix, suffix, shape = (None, None, None)
|
||||
if pretrained_vectors is not None:
|
||||
|
@ -255,7 +259,7 @@ def build_Tok2Vec_model(
|
|||
vectors=pretrained_vectors.data,
|
||||
nO=width,
|
||||
column=cols.index(ID),
|
||||
dropout=0.0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
if subword_features:
|
||||
|
|
|
@@ -1,5 +1,5 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
@@ -14,7 +14,6 @@ __all__ = [
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
"Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",
@@ -63,16 +63,6 @@ def default_tagger():
return util.load_config(loc, create_objects=True)["model"]

def default_tensorizer_config():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=False)

def default_tensorizer():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]

def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)
@@ -10,3 +10,4 @@ embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -11,3 +11,4 @@ window_size = 1
maxout_pieces = 3
nM = 64
nC = 8
dropout = null

@@ -13,3 +13,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -13,3 +13,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -10,3 +10,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null

@@ -10,3 +10,4 @@ embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -10,3 +10,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -1,4 +0,0 @@
[model]
@architectures = "spacy.Tensorizer.v1"
input_size=96
output_size=300

@@ -11,3 +11,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -7,3 +7,4 @@ conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = null

@@ -7,3 +7,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
@@ -44,8 +44,8 @@ class SentenceSegmenter(object):
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
similarity into `Doc` objects.
The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
@@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2)

def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
"""Allocate model, using nO from the first model in the pipeline.

gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
@ -16,7 +16,7 @@ from ..morphology cimport Morphology
|
|||
from ..vocab cimport Vocab
|
||||
|
||||
from .defaults import default_tagger, default_parser, default_ner, default_textcat
|
||||
from .defaults import default_nel, default_senter, default_tensorizer
|
||||
from .defaults import default_nel, default_senter
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
|
@ -238,138 +238,6 @@ class Pipe(object):
|
|||
return self
|
||||
|
||||
|
||||
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
|
||||
class Tensorizer(Pipe):
|
||||
"""Pre-train position-sensitive vectors for tokens."""
|
||||
|
||||
def __init__(self, vocab, model, **cfg):
|
||||
"""Construct a new statistical model. Weights are not allocated on
|
||||
initialisation.
|
||||
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same
|
||||
`Vocab` instance with the `Doc` objects it will process.
|
||||
**cfg: Config parameters.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.input_models = []
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
def __call__(self, example):
|
||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||
model. Vectors are set to the `Doc.tensor` attribute.
|
||||
|
||||
docs (Doc or iterable): One or more documents to add vectors to.
|
||||
RETURNS (dict or None): Intermediate computations.
|
||||
"""
|
||||
doc = self._get_doc(example)
|
||||
tokvecses = self.predict([doc])
|
||||
self.set_annotations([doc], tokvecses)
|
||||
if isinstance(example, Example):
|
||||
example.doc = doc
|
||||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
"""Process `Doc` objects as a stream.
|
||||
|
||||
stream (iterator): A sequence of `Doc` or `Example` objects to process.
|
||||
batch_size (int): Number of `Doc` or `Example` objects to group.
|
||||
YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
|
||||
"""
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
tensors = self.predict(docs)
|
||||
self.set_annotations(docs, tensors)
|
||||
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
"""Return a single tensor for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the docs.
|
||||
"""
|
||||
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
|
||||
outputs = self.model(inputs)
|
||||
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
|
||||
|
||||
def set_annotations(self, docs, tensors):
|
||||
"""Set the tensor attribute for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
tensors (object): Vector representation for each token in the docs.
|
||||
"""
|
||||
for doc, tensor in zip(docs, tensors):
|
||||
if tensor.shape[0] != len(doc):
|
||||
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
|
||||
doc.tensor = tensor
|
||||
|
||||
def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
|
||||
"""Update the model.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The dropout rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
"""
|
||||
examples = Example.to_example_objects(examples)
|
||||
inputs = []
|
||||
bp_inputs = []
|
||||
set_dropout_rate(self.model, drop)
|
||||
for tok2vec in self.input_models:
|
||||
set_dropout_rate(tok2vec, drop)
|
||||
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
|
||||
inputs.append(tensor)
|
||||
bp_inputs.append(bp_tensor)
|
||||
inputs = self.model.ops.xp.hstack(inputs)
|
||||
scores, bp_scores = self.model.begin_update(inputs)
|
||||
loss, d_scores = self.get_loss(examples, scores)
|
||||
d_inputs = bp_scores(d_scores, sgd=sgd)
|
||||
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
|
||||
for d_input, bp_input in zip(d_inputs, bp_inputs):
|
||||
bp_input(d_input)
|
||||
if sgd is not None:
|
||||
for tok2vec in self.input_models:
|
||||
tok2vec.finish_update(sgd)
|
||||
self.model.finish_update(sgd)
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.0)
|
||||
losses[self.name] += loss
|
||||
return loss
|
||||
|
    def get_loss(self, examples, prediction):
        examples = Example.to_example_objects(examples)
        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
        target = self.vocab.vectors.data[ids]
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores ** 2).sum()
        return loss, d_scores

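A hedged numeric sketch (not from the diff) of `get_loss()`: the target for each token is its row in the static vector table, the gradient is the scaled difference, and the loss is the summed squared gradient.

import numpy

prediction = numpy.array([[1.0, 0.0], [0.0, 1.0]])   # model output, one row per token
target = numpy.array([[0.5, 0.0], [0.0, 0.5]])       # stand-in for vocab.vectors rows
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()
assert abs(loss - 0.125) < 1e-8
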
    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        get_examples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if pipeline is not None:
            for name, model in pipeline:
                if model.has_ref("tok2vec"):
                    self.input_models.append(model.get_ref("tok2vec"))
        self.model.initialize()
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd


@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
|
||||
class Tagger(Pipe):
|
||||
"""Pipeline component for part-of-speech tagging.
|
||||
|
@ -1708,4 +1576,4 @@ def ner_factory(nlp, model, **cfg):
|
|||
warnings.warn(Warnings.W098.format(name="ner"))
|
||||
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
|
|
|
@@ -123,9 +123,9 @@ def test_overfitting_IO():
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
        {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True},
        {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False},
    ],
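A hedged sketch (not part of the diff) of how one of the config dicts above maps to a model: the "@architectures" string names a registered factory and the remaining keys become its arguments. The `spacy.util.registry` import path is an assumption for this development version of spaCy.

from spacy.util import registry

make_textcat = registry.architectures.get("spacy.TextCatBOW.v1")
model = make_textcat(exclusive_classes=True, ngram_size=4, no_output_layer=False)
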
@@ -24,6 +24,7 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null

[nlp.pipeline.tagger]
factory = "tagger"

@@ -53,6 +54,7 @@ embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
dropout = null
"""

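A hedged sketch (not from the diff): a config string like the one in this test can be parsed with Thinc's `Config` class before a pipeline is built from it; the section names and values below are illustrative only.

from thinc.api import Config

cfg_string = """
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
width = 96
dropout = null
"""
config = Config().from_str(cfg_string)
assert config["nlp"]["pipeline"]["tok2vec"]["model"]["dropout"] is None
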
@@ -70,6 +72,7 @@ def my_parser():
        nC=8,
        conv_depth=2,
        bilstm_depth=0,
        dropout=None,
    )
    parser = build_tb_parser_model(
        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

@@ -1,7 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
from spacy.pipeline import TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tagger
from spacy.pipeline.defaults import default_textcat, default_senter

from ..util import make_tempdir

@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    assert tagger1_d.to_bytes() == tagger2_d.to_bytes()


def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
    tensorizer = Tensorizer(en_vocab, default_tensorizer())
    tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
    new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
    assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b


def test_serialize_tensorizer_roundtrip_disk(en_vocab):
    tensorizer = Tensorizer(en_vocab, default_tensorizer())
    with make_tempdir() as d:
        file_path = d / "tensorizer"
        tensorizer.to_disk(file_path)
        tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
        assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
            exclude=["vocab"]
        )


def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    textcat = TextCategorizer(

@@ -15,7 +15,7 @@ def test_empty_doc():
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    # TODO: fix tok2vec arguments
    tok2vec = build_Tok2Vec_model(width, embed_size)
    tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)

@@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
        char_embed=False,
        nM=64,
        nC=8,
        dropout=None,
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(batch)

@@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
@pytest.mark.parametrize(
    "tok2vec_config",
    [
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
    ],
)
# fmt: on

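A hedged sketch (not from the diff) of how the parametrized test above is expected to consume one of these dicts; the import path for `build_Tok2Vec_model` is an assumption for this development version of spaCy.

from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.tokens import Doc
from spacy.vocab import Vocab

tok2vec_config = {
    "width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8,
    "pretrained_vectors": None, "window_size": 1, "conv_depth": 2,
    "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True,
    "dropout": None,
}
tok2vec = build_Tok2Vec_model(**tok2vec_config)
tok2vec.initialize()
docs = [Doc(Vocab(), words=["hello", "world"])]
vectors, backprop = tok2vec.begin_update(docs)
assert vectors[0].shape == (2, tok2vec_config["width"])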