diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index fbac4ea7d..6c3a21f4b 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -25,6 +25,7 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} # These settings are invalid for the transformer models. init_tok2vec = null vectors = null +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -32,7 +33,7 @@ start = 1000 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" beta1 = 0.9 beta2 = 0.999 @@ -113,3 +114,4 @@ window_size = 1 embed_size = 10000 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg new file mode 100644 index 000000000..4f1898d69 --- /dev/null +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -0,0 +1,137 @@ +# Training hyper-parameters and additional features. +[training] +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length or number of examples. +max_length = 0 +limit = 0 +# Data augmentation +orth_variant_level = 0.0 +dropout = 0.1 +# Controls early-stopping. 0 or -1 mean unlimited. +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 400 +# Other settings +seed = 0 +accumulate_gradient = 1 +use_pytorch_for_gpu_memory = false +# Control how scores are printed and checkpoints are evaluated. +scores = ["speed", "tags_acc", "uas", "las", "ents_f"] +score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} +# These settings are invalid for the transformer models. 
+init_tok2vec = null +vectors = null +discard_oversize = false + +[training.batch_size] +@schedules = "compounding.v1" +start = 1000 +stop = 1000 +compound = 1.001 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[pretraining] +max_epochs = 1000 +min_length = 5 +max_length = 500 +dropout = 0.2 +n_save_every = null +batch_size = 3000 +seed = ${training:seed} +use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} +tok2vec_model = "nlp.pipeline.tok2vec.model" + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[pretraining.loss_func] +@losses = "CosineDistance.v1" +normalize = true + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.senter] +factory = "senter" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.senter.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.senter.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +subword_features = true +dropout = null diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index e152fa5e0..acbcc8d41 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -21,7 +22,7 @@ start = 100 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 @@ -65,3 +66,4 @@ depth = 4 embed_size = 2000 subword_features = true maxout_pieces = 3 +dropout = null diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 9a10c45f0..c305c015c 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -21,7 +22,7 @@ 
start = 100 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 @@ -66,3 +67,4 @@ window_size = 1 embed_size = 2000 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg index 796c8670f..eca6a22fa 100644 --- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -12,8 +12,9 @@ max_length = 0 batch_size = 25 seed = 0 accumulate_gradient = 2 +discard_oversize = false -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 @@ -36,6 +37,7 @@ nM = 64 nC = 8 rows = 2000 columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] +dropout = null [nlp.pipeline.tok2vec.model.extract.features] @architectures = "spacy.Doc2Feats.v1" diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index 3ac70675b..a5fa32b18 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -11,6 +11,7 @@ gold_preproc = true max_length = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -19,7 +20,7 @@ stop = 3000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 @@ -44,3 +45,4 @@ maxout_pieces = 3 window_size = 1 subword_features = true pretrained_vectors = null +dropout = null diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py deleted file mode 100644 index 5c41c0e92..000000000 --- a/examples/training/pretrain_textcat.py +++ /dev/null @@ -1,212 +0,0 @@ -"""This script is experimental. - -Try pre-training the CNN component of the text categorizer using a cheap -language modelling-like objective. Specifically, we load pretrained vectors -(from something like word2vec, GloVe, FastText etc), and use the CNN to -predict the tokens' pretrained vectors. This isn't as easy as it sounds: -we're not merely doing compression here, because heavy dropout is applied, -including over the input words. This means the model must often (50% of the time) -use the context in order to predict the word. - -To evaluate the technique, we're pre-training with the 50k texts from the IMDB -corpus, and then training with only 100 labels. Note that it's a bit dirty to -pre-train with the development data, but also not *so* terrible: we're not using -the development labels, after all --- only the unlabelled text. 
-""" -import plac -import tqdm -import random - -import ml_datasets - -import spacy -from spacy.util import minibatch -from spacy.pipeline import TextCategorizer -from spacy.ml.models.tok2vec import build_Tok2Vec_model -import numpy - - -def load_texts(limit=0): - train, dev = ml_datasets.imdb() - train_texts, train_labels = zip(*train) - dev_texts, dev_labels = zip(*train) - train_texts = list(train_texts) - dev_texts = list(dev_texts) - random.shuffle(train_texts) - random.shuffle(dev_texts) - if limit >= 1: - return train_texts[:limit] - else: - return list(train_texts) + list(dev_texts) - - -def load_textcat_data(limit=0): - """Load data from the IMDB dataset.""" - # Partition off part of the train data for evaluation - train_data, eval_data = ml_datasets.imdb() - random.shuffle(train_data) - train_data = train_data[-limit:] - texts, labels = zip(*train_data) - eval_texts, eval_labels = zip(*eval_data) - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] - eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels] - return (texts, cats), (eval_texts, eval_cats) - - -def prefer_gpu(): - used = spacy.util.use_gpu(0) - if used is None: - return False - else: - import cupy.random - - cupy.random.seed(0) - return True - - -def build_textcat_model(tok2vec, nr_class, width): - from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged - - with Model.define_operators({">>": chain}): - model = ( - tok2vec - >> list2ragged() - >> reduce_mean() - >> Softmax(nr_class, width) - ) - model.set_ref("tok2vec", tok2vec) - return model - - -def block_gradients(model): - from thinc.api import wrap # TODO FIX - - def forward(X, drop=0.0): - Y, _ = model.begin_update(X, drop=drop) - return Y, None - - return wrap(forward, model) - - -def create_pipeline(width, embed_size, vectors_model): - print("Load vectors") - nlp = spacy.load(vectors_model) - print("Start training") - textcat = TextCategorizer( - nlp.vocab, - labels=["POSITIVE", "NEGATIVE"], - # TODO: replace with config version - model=build_textcat_model( - build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width - ), - ) - - nlp.add_pipe(textcat) - return nlp - - -def train_tensorizer(nlp, texts, dropout, n_iter): - tensorizer = nlp.create_pipe("tensorizer") - nlp.add_pipe(tensorizer) - optimizer = nlp.begin_training() - for i in range(n_iter): - losses = {} - for i, batch in enumerate(minibatch(tqdm.tqdm(texts))): - docs = [nlp.make_doc(text) for text in batch] - tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout) - print(losses) - return optimizer - - -def train_textcat(nlp, n_texts, n_iter=10): - textcat = nlp.get_pipe("textcat") - tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes() - (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) - print( - "Using {} examples ({} training, {} evaluation)".format( - n_texts, len(train_texts), len(dev_texts) - ) - ) - train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) - - with nlp.select_pipes(enable="textcat"): # only train textcat - optimizer = nlp.begin_training() - textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) - print("Training the model...") - print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) - for i in range(n_iter): - losses = {"textcat": 0.0} - # batch up the examples using spaCy's minibatch - batches = minibatch(tqdm.tqdm(train_data), size=2) - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.2, 
losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) - print( - "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table - losses["textcat"], - scores["textcat_p"], - scores["textcat_r"], - scores["textcat_f"], - ) - ) - - -def evaluate_textcat(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 1e-8 - fp = 1e-8 - tn = 1e-8 - fn = 1e-8 - for i, doc in enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1.0 - elif score >= 0.5 and gold[label] < 0.5: - fp += 1.0 - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 and gold[label] >= 0.5: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - f_score = 2 * (precision * recall) / (precision + recall) - return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} - - -@plac.annotations( - width=("Width of CNN layers", "positional", None, int), - embed_size=("Embedding rows", "positional", None, int), - pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), - train_examples=("Number of labelled examples", "option", "eg", int), - vectors_model=("Name or path to vectors model to learn from"), -) -def main( - width, - embed_size, - vectors_model, - pretrain_iters=30, - train_iters=30, - train_examples=1000, -): - random.seed(0) - numpy.random.seed(0) - use_gpu = prefer_gpu() - print("Using GPU?", use_gpu) - - nlp = create_pipeline(width, embed_size, vectors_model) - print("Load data") - texts = load_texts(limit=0) - print("Train tensorizer") - optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters) - print("Train textcat") - train_textcat(nlp, train_examples, n_iter=train_iters) - - -if __name__ == "__main__": - plac.call(main) diff --git a/spacy/__main__.py b/spacy/__main__.py index 71ab1a91a..beed3170d 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -2,16 +2,15 @@ if __name__ == "__main__": import plac import sys from wasabi import msg - from spacy.cli import download, link, info, package, train, pretrain, convert + from spacy.cli import download, link, info, package, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data - from spacy.cli import train_from_config_cli + from spacy.cli import train_cli commands = { "download": download, "link": link, "info": info, - "train": train, - "train-from-config": train_from_config_cli, + "train": train_cli, "pretrain": pretrain, "debug-data": debug_data, "evaluate": evaluate, diff --git a/spacy/_ml.py b/spacy/_ml.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5f83b26c1..2ffbe2d0c 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,8 +4,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train import train # noqa: F401 -from .train_from_config import train_from_config_cli # noqa: F401 +from .train_from_config import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git 
a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index b2e3229ee..d37426b5a 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -3,48 +3,39 @@ import numpy import time import re from collections import Counter +import plac from pathlib import Path -from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu -from thinc.api import CosineDistance, L2Distance +from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory from wasabi import msg import srsly -from ..gold import Example from ..errors import Errors from ..ml.models.multi_task import build_masked_language_model from ..tokens import Doc from ..attrs import ID, HEAD -from ..ml.models.tok2vec import build_Tok2Vec_model from .. import util -from ..util import create_default_optimizer -from .train import _load_pretrained_tok2vec +from ..gold import Example -def pretrain( +@plac.annotations( # fmt: off - texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), - vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str), - output_dir: ("Directory to write models to on each epoch", "positional", None, str), - width: ("Width of CNN layers", "option", "cw", int) = 96, - conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4, - bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0, - cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3, - sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0, - use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False, - cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1, - embed_rows: ("Number of embedding rows", "option", "er", int) = 2000, - loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine", - use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False, - dropout: ("Dropout rate", "option", "d", float) = 0.2, - n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000, - batch_size: ("Number of words per training batch", "option", "bs", int) = 3000, - max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500, - min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5, - seed: ("Seed for random number generators", "option", "s", int) = 0, - n_save_every: ("Save model every X batches.", "option", "se", int) = None, - init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. 
Prevents unintended overwriting of existing weight files.", "option", "es", int) = None, + texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), + vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), + output_dir=("Directory to write models to on each epoch", "positional", None, Path), + config_path=("Path to config file", "positional", None, Path), + use_gpu=("Use GPU", "option", "g", int), + resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), + epoch_resume=("The epoch to resume counting from when using '--resume-path'. Prevents unintended overwriting of existing weight files.", "option", "er", int), # fmt: on +) +def pretrain( + texts_loc, + vectors_model, + config_path, + output_dir, + use_gpu=-1, + resume_path=None, + epoch_resume=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -58,34 +49,46 @@ def pretrain( However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure - all settings are the same between pretraining and training. The API and - errors around this need some improvement. + all settings are the same between pretraining and training. Ideally, + this is done by using the same config file for both commands. """ - config = dict(locals()) - for key in config: - if isinstance(config[key], Path): - config[key] = str(config[key]) - util.fix_random_seed(seed) + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) - has_gpu = prefer_gpu() - if has_gpu: - import torch + if use_gpu >= 0: + msg.info("Using GPU") + util.use_gpu(use_gpu) + else: + msg.info("Using CPU") - torch.set_default_tensor_type("torch.cuda.FloatTensor") - msg.info("Using GPU" if has_gpu else "Not using GPU") + msg.info(f"Loading config from: {config_path}") + config = util.load_config(config_path, create_objects=False) + util.fix_random_seed(config["pretraining"]["seed"]) + if config["pretraining"]["use_pytorch_for_gpu_memory"]: + use_pytorch_for_gpu_memory() - output_dir = Path(output_dir) if output_dir.exists() and [p for p in output_dir.iterdir()]: - msg.warn( - "Output directory is not empty", - "It is better to use an empty directory or refer to a new output path, " - "then the new directory will be created for you.", - ) + if resume_path: + msg.warn( + "Output directory is not empty", + "If you're resuming a run from a previous model in this directory, " + "the old models for those epochs will be overwritten " + "with the new ones.", + ) + else: + msg.warn( + "Output directory is not empty
", + "It is better to use an empty directory or refer to a new output path, " + "then the new directory will be created for you.", + ) if not output_dir.exists(): output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") srsly.write_json(output_dir / "config.json", config) - msg.good("Saved settings to config.json") + msg.good("Saved config file in the output directory") + + config = util.load_config(config_path, create_objects=True) + pretrain_config = config["pretraining"] # Load texts from file or stdin if texts_loc != "-": # reading from a file @@ -99,57 +102,50 @@ def pretrain( msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin - msg.text("Reading input text from stdin...") + msg.info("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) msg.good(f"Loaded model '{vectors_model}'") - pretrained_vectors = None if not use_vectors else nlp.vocab.vectors - model = create_pretraining_model( - nlp, - # TODO: replace with config - build_Tok2Vec_model( - width, - embed_rows, - conv_depth=conv_depth, - pretrained_vectors=pretrained_vectors, - bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. - subword_features=not use_chars, # Set to False for Chinese etc - maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. - window_size=1, - char_embed=False, - nM=64, - nC=8, - ), - ) - # Load in pretrained weights - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") + tok2vec_path = pretrain_config["tok2vec_model"] + tok2vec = config + for subpath in tok2vec_path.split("."): + tok2vec = tok2vec.get(subpath) + model = create_pretraining_model(nlp, tok2vec) + optimizer = pretrain_config["optimizer"] + + # Load in pretrained weights to resume from + if resume_path is not None: + msg.info(f"Resume training tok2vec from: {resume_path}") + with resume_path.open("rb") as file_: + weights_data = file_.read() + model.get_ref("tok2vec").from_bytes(weights_data) # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + model_name = re.search(r"model\d+\.bin", str(resume_path)) if model_name: # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + msg.info(f"Resuming from epoch: {epoch_resume}") else: - if not epoch_start: + if not epoch_resume: msg.fail( - "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec", + "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path", exits=True, ) - elif epoch_start < 0: + elif epoch_resume < 0: msg.fail( - f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid", + f"The argument --epoch-resume has to be greater than or equal to 0. 
{epoch_resume} is invalid", exits=True, ) + else: + msg.info(f"Resuming from epoch: {epoch_resume}") else: - # Without '--init-tok2vec' the '--epoch-start' argument is ignored - epoch_start = 0 + # Without '--resume-path' the '--epoch-resume' argument is ignored + epoch_resume = 0 - optimizer = create_default_optimizer() tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -168,28 +164,27 @@ def pretrain( file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 - for epoch in range(epoch_start, n_iter + epoch_start): - for batch_id, batch in enumerate( - util.minibatch_by_words( - (Example(doc=text) for text in texts), size=batch_size - ) - ): + loss_func = pretrain_config["loss_func"] + for epoch in range(epoch_resume, pretrain_config["max_epochs"]): + examples = [Example(doc=text) for text in texts] + batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"]) + for batch_id, batch in enumerate(batches): docs, count = make_docs( nlp, - [text for (text, _) in batch], - max_length=max_length, - min_length=min_length, + [ex.doc for ex in batch], + max_length=pretrain_config["max_length"], + min_length=pretrain_config["min_length"], ) skip_counter += count - loss = make_update( - model, docs, optimizer, objective=loss_func, drop=dropout - ) + loss = make_update(model, docs, optimizer, distance=loss_func) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break - if n_save_every and (batch_id % n_save_every == 0): + if pretrain_config["n_save_every"] and ( + batch_id % pretrain_config["n_save_every"] == 0 + ): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 @@ -201,17 +196,17 @@ def pretrain( msg.good("Successfully finished pretrain") -def make_update(model, docs, optimizer, drop=0.0, objective="L2"): +def make_update(model, docs, optimizer, distance): """Perform an update over a single batch of documents. docs (iterable): A batch of `Doc` objects. - drop (float): The dropout rate. optimizer (callable): An optimizer. RETURNS loss: A float for the loss. """ - predictions, backprop = model.begin_update(docs, drop=drop) - loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) - backprop(gradients, sgd=optimizer) + predictions, backprop = model.begin_update(docs) + loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance) + backprop(gradients) + model.finish_update(optimizer) # Don't want to return a cupy object here # The gradients are modified in-place by the BERT MLM, # so we get an accurate loss @@ -243,12 +238,12 @@ def make_docs(nlp, batch, min_length, max_length): heads = numpy.asarray(heads, dtype="uint64") heads = heads.reshape((len(doc), 1)) doc = doc.from_array([HEAD], heads) - if len(doc) >= min_length and len(doc) < max_length: + if min_length <= len(doc) < max_length: docs.append(doc) return docs, skip_count -def get_vectors_loss(ops, docs, prediction, objective="L2"): +def get_vectors_loss(ops, docs, prediction, distance): """Compute a mean-squared error loss between the documents' vectors and the prediction. 
@@ -262,13 +257,6 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): # and look them up all at once. This prevents data copying. ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] - # TODO: this code originally didn't normalize, but shouldn't normalize=True ? - if objective == "L2": - distance = L2Distance(normalize=False) - elif objective == "cosine": - distance = CosineDistance(normalize=False) - else: - raise ValueError(Errors.E142.format(loss_func=objective)) d_target, loss = distance(prediction, target) return loss, d_target @@ -281,7 +269,7 @@ def create_pretraining_model(nlp, tok2vec): """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size) + Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match @@ -289,11 +277,12 @@ def create_pretraining_model(nlp, tok2vec): # "tok2vec" has to be the same set of processes as what the components do. tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) - model = build_masked_language_model(nlp.vocab, model) - model.set_ref("tok2vec", tok2vec) - model.set_ref("output_layer", output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) - return model + mlm_model = build_masked_language_model(nlp.vocab, model) + mlm_model.set_ref("tok2vec", tok2vec) + mlm_model.set_ref("output_layer", output_layer) + mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + return mlm_model class ProgressTracker(object): diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c0e3bd169..a6d0a0abc 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -13,6 +13,7 @@ import random from ..gold import GoldCorpus from .. import util from ..errors import Errors +from ..ml import models # don't remove - required to load the built-in architectures registry = util.registry @@ -123,7 +124,7 @@ class ConfigSchema(BaseModel): use_gpu=("Use GPU", "option", "g", int), # fmt: on ) -def train_from_config_cli( +def train_cli( train_path, dev_path, config_path, @@ -132,7 +133,7 @@ def train_from_config_cli( raw_text=None, debug=False, verbose=False, - use_gpu=-1 + use_gpu=-1, ): """ Train or update a spaCy model. 
Requires data to be formatted in spaCy's @@ -156,7 +157,7 @@ def train_from_config_cli( else: msg.info("Using CPU") - train_from_config( + train( config_path, {"train": train_path, "dev": dev_path}, output_path=output_path, @@ -165,10 +166,11 @@ -def train_from_config( +def train( config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): msg.info(f"Loading config from: {config_path}") + # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"]["use_pytorch_for_gpu_memory"]: @@ -177,8 +179,8 @@ config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) - optimizer = config["optimizer"] training = config["training"] + optimizer = training["optimizer"] limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) @@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg): if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) - batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) + batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) + # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop + try: + first = next(batches) + yield first + except StopIteration: + raise ValueError(Errors.E986) for batch in batches: yield batch epochs_todo -= 1 # We intentionally compare exactly to 0 here, so that max_epochs < 1 # will not break. - if epochs_todo == 0: + if epochs_todo == 0: break diff --git a/spacy/errors.py b/spacy/errors.py index 6184c078c..ce931f0a1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -453,8 +453,6 @@ class Errors(object): "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") - E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or " - "'cosine'.") E143 = ("Labels for component '{name}' not initialized. Did you forget to " "call add_label()?") E144 = ("Could not find parameter `{param}` when building the entity " @@ -577,6 +575,8 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E986 = ("Could not create any training batches: check your input. " + "Perhaps 'discard_oversize' should be set to False?") E987 = ("The text of an example training instance is either a Doc or " "a string, but found {type} instead.") E988 = ("Could not parse any training examples. Ensure the data is " diff --git a/spacy/language.py b/spacy/language.py index f281fa1ba..6341dc858 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -231,10 +231,6 @@ class Language(object): # Conveniences to access pipeline components # Shouldn't be used anymore! 
- @property - def tensorizer(self): - return self.get_pipe("tensorizer") - @property def tagger(self): return self.get_pipe("tagger") diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index e69de29bb..aed4fa323 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -0,0 +1 @@ +from .models import * diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index ef1e8efca..40cde2437 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -2,6 +2,5 @@ from .entity_linker import * # noqa from .parser import * # noqa from .simple_ner import * from .tagger import * # noqa -from .tensorizer import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 1c193df82..8000d1aff 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,4 +1,6 @@ -from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init +import numpy + +from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): @@ -24,6 +26,80 @@ def build_cloze_multi_task_model(vocab, tok2vec): return model -def build_masked_language_model(*args, **kwargs): - # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828 - raise NotImplementedError +def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): + """Convert a model into a BERT-style masked language model""" + + random_words = _RandomWords(vocab) + + def mlm_forward(model, docs, is_train): + mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) + mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) + output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + + def mlm_backward(d_output): + d_output *= 1 - mask + return backprop(d_output) + + return output, mlm_backward + + mlm_model = Model("masked-language-model", mlm_forward, layers=[wrapped_model]) + mlm_model.set_ref("wrapped-model", wrapped_model) + + return mlm_model + + +class _RandomWords(object): + def __init__(self, vocab): + self.words = [lex.text for lex in vocab if lex.prob != 0.0] + self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] + self.words = self.words[:10000] + self.probs = self.probs[:10000] + self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) + self.probs /= self.probs.sum() + self._cache = [] + + def next(self): + if not self._cache: + self._cache.extend( + numpy.random.choice(len(self.words), 10000, p=self.probs) + ) + index = self._cache.pop() + return self.words[index] + + +def _apply_mask(docs, random_words, mask_prob=0.15): + # This needs to be here to avoid circular imports + from ...tokens import Doc + + N = sum(len(doc) for doc in docs) + mask = numpy.random.uniform(0.0, 1.0, (N,)) + mask = mask >= mask_prob + i = 0 + masked_docs = [] + for doc in docs: + words = [] + for token in doc: + if not mask[i]: + word = _replace_word(token.text, random_words) + else: + word = token.text + words.append(word) + i += 1 + spaces = [bool(w.whitespace_) for w in doc] + # NB: If you change this implementation to instead modify + # the docs in place, take care that the IDs reflect the original + # words. Currently we use the original docs to make the vectors + # for the target, so we don't lose the original tokens. But if + # you modified the docs in place here, you would. 
+ masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces)) + return mask, masked_docs + + +def _replace_word(word, random_words, mask="[MASK]"): + roll = numpy.random.random() + if roll < 0.8: + return mask + elif roll < 0.9: + return random_words.next() + else: + return word diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py deleted file mode 100644 index f66610b64..000000000 --- a/spacy/ml/models/tensorizer.py +++ /dev/null @@ -1,10 +0,0 @@ -from thinc.api import Linear, zero_init - -from ... import util -from ...util import registry - - -@registry.architectures.register("spacy.Tensorizer.v1") -def build_tensorizer(input_size, output_size): - input_size = util.env_opt("token_vector_width", input_size) - return Linear(output_size, input_size, init_W=zero_init) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ce31d058c..141c66f79 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, nO=None): + window_size, conv_depth, dropout, nO=None): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER)) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE)) + lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) + prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class @registry.architectures.register("spacy.TextCatLowData.v1") -def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): +def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): nlp = util.load_model(pretrained_vectors) vectors = nlp.vocab.vectors vector_dim = vectors.data.shape[1] @@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): >> reduce_sum() >> residual(Relu(width, width)) ** 2 >> Linear(nO, width) - >> Dropout(0.0) - >> Logistic() ) + if dropout: + model = model >> Dropout(dropout) + model = model >> Logistic() return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index a2e8f589a..53798e57c 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -49,6 +49,7 @@ def hash_embed_cnn( maxout_pieces, window_size, subword_features, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -63,6 +64,7 @@ def hash_embed_cnn( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @@ -76,6 +78,7 @@ def hash_charembed_cnn( window_size, nM, nC, + dropout, ): # Allows using character embeddings by setting nC, nM and char_embed=True return 
build_Tok2Vec_model( @@ -90,12 +93,13 @@ def hash_charembed_cnn( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces + pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -110,12 +114,13 @@ def hash_embed_bilstm_v1( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC + pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces): @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX")) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX")) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE")) + prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) + suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) + shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) if pretrained_vectors: glove = StaticVectors( vectors=pretrained_vectors.data, nO=width, column=columns.index(ID), - dropout=0.0, + dropout=dropout, ) with Model.define_operators({">>": chain, "|": concatenate}): @@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): embed_layer = norm else: if use_subwords and pretrained_vectors: - nr_columns = 5 concat_columns = glove | norm | prefix | suffix | shape elif use_subwords: - nr_columns = 4 concat_columns = norm | prefix | suffix | shape else: - nr_columns = 2 concat_columns = glove | norm embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) @@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(columns, width, rows, nM, nC, features): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) with Model.define_operators({">>": chain, "|": concatenate}): embed_layer = chr_embed | features >> with_array(norm) @@ -238,16 +241,17 @@ def build_Tok2Vec_model( nC, conv_depth, bilstm_depth, + dropout, ) -> Model: if char_embed: subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm 
= HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM)) + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE)) + prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: @@ -255,7 +259,7 @@ def build_Tok2Vec_model( vectors=pretrained_vectors.data, nO=width, column=cols.index(ID), - dropout=0.0, + dropout=dropout, ) if subword_features: diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index b2866bad2..116a08e92 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,5 +1,5 @@ from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker -from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer +from .pipes import TextCategorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer from .simple_ner import SimpleNER from .morphologizer import Morphologizer @@ -14,7 +14,6 @@ __all__ = [ "EntityRecognizer", "EntityLinker", "TextCategorizer", - "Tensorizer", "Tok2Vec", "Pipe", "Morphologizer", diff --git a/spacy/pipeline/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py index e17e2d3b4..483c6bbd6 100644 --- a/spacy/pipeline/defaults/__init__.py +++ b/spacy/pipeline/defaults/__init__.py @@ -63,16 +63,6 @@ def default_tagger(): return util.load_config(loc, create_objects=True)["model"] -def default_tensorizer_config(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=False) - - -def default_tensorizer(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=True)["model"] - - def default_textcat_config(): loc = Path(__file__).parent / "textcat_defaults.cfg" return util.load_config(loc, create_objects=False) diff --git a/spacy/pipeline/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg index 6a591ec3e..8dddf9e7b 100644 --- a/spacy/pipeline/defaults/entity_linker_defaults.cfg +++ b/spacy/pipeline/defaults/entity_linker_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 300 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg index 150eca507..6ee053a08 100644 --- a/spacy/pipeline/defaults/morphologizer_defaults.cfg +++ b/spacy/pipeline/defaults/morphologizer_defaults.cfg @@ -11,3 +11,4 @@ window_size = 1 maxout_pieces = 3 nM = 64 nC = 8 +dropout = null diff --git a/spacy/pipeline/defaults/ner_defaults.cfg b/spacy/pipeline/defaults/ner_defaults.cfg index db2c131f5..eb926c43b 100644 --- a/spacy/pipeline/defaults/ner_defaults.cfg +++ b/spacy/pipeline/defaults/ner_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/parser_defaults.cfg b/spacy/pipeline/defaults/parser_defaults.cfg index 9cbb6eadb..6fe0fd7cb 100644 --- a/spacy/pipeline/defaults/parser_defaults.cfg +++ 
b/spacy/pipeline/defaults/parser_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/senter_defaults.cfg b/spacy/pipeline/defaults/senter_defaults.cfg index ffa2c6ce2..304e42b01 100644 --- a/spacy/pipeline/defaults/senter_defaults.cfg +++ b/spacy/pipeline/defaults/senter_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 2 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/simple_ner_defaults.cfg b/spacy/pipeline/defaults/simple_ner_defaults.cfg index 4e3b640df..7f206a636 100644 --- a/spacy/pipeline/defaults/simple_ner_defaults.cfg +++ b/spacy/pipeline/defaults/simple_ner_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 7000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/tagger_defaults.cfg b/spacy/pipeline/defaults/tagger_defaults.cfg index 5aea80a32..f26c5f099 100644 --- a/spacy/pipeline/defaults/tagger_defaults.cfg +++ b/spacy/pipeline/defaults/tagger_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg deleted file mode 100644 index 81880a109..000000000 --- a/spacy/pipeline/defaults/tensorizer_defaults.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[model] -@architectures = "spacy.Tensorizer.v1" -input_size=96 -output_size=300 diff --git a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg index cea1bfe54..91f3a1742 100644 --- a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg @@ -11,3 +11,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg index 9477b2995..0981cf77c 100644 --- a/spacy/pipeline/defaults/textcat_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_defaults.cfg @@ -7,3 +7,4 @@ conv_depth = 2 embed_size = 2000 window_size = 1 ngram_size = 1 +dropout = null diff --git a/spacy/pipeline/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg index 9475d4aab..d2718eed1 100644 --- a/spacy/pipeline/defaults/tok2vec_defaults.cfg +++ b/spacy/pipeline/defaults/tok2vec_defaults.cfg @@ -7,3 +7,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 351323ae9..a97e7be68 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -44,8 +44,8 @@ class SentenceSegmenter(object): class SimilarityHook(Pipe): """ Experimental: A pipeline component to install a hook for supervised - similarity into `Doc` objects. Requires a `Tensorizer` to pre-process - documents. The similarity model can be any object obeying the Thinc `Model` + similarity into `Doc` objects. + The similarity model can be any object obeying the Thinc `Model` interface. 
By default, the model concatenates the elementwise mean and elementwise max of the two tensors, and compares them using the Cauchy-like similarity function from Chen (2013): @@ -82,7 +82,7 @@ class SimilarityHook(Pipe): sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): - """Allocate model, using width from tensorizer in pipeline. + """Allocate model, using nO from the first model in the pipeline. gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 42110efb0..a6edf00d9 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -16,7 +16,7 @@ from ..morphology cimport Morphology from ..vocab cimport Vocab from .defaults import default_tagger, default_parser, default_ner, default_textcat -from .defaults import default_nel, default_senter, default_tensorizer +from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj @@ -238,138 +238,6 @@ class Pipe(object): return self -@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer) -class Tensorizer(Pipe): - """Pre-train position-sensitive vectors for tokens.""" - - def __init__(self, vocab, model, **cfg): - """Construct a new statistical model. Weights are not allocated on - initialisation. - - vocab (Vocab): A `Vocab` instance. The model must share the same - `Vocab` instance with the `Doc` objects it will process. - **cfg: Config parameters. - """ - self.vocab = vocab - self.model = model - self.input_models = [] - self.cfg = dict(cfg) - - def __call__(self, example): - """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM - model. Vectors are set to the `Doc.tensor` attribute. - - docs (Doc or iterable): One or more documents to add vectors to. - RETURNS (dict or None): Intermediate computations. - """ - doc = self._get_doc(example) - tokvecses = self.predict([doc]) - self.set_annotations([doc], tokvecses) - if isinstance(example, Example): - example.doc = doc - return example - return doc - - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - """Process `Doc` objects as a stream. - - stream (iterator): A sequence of `Doc` or `Example` objects to process. - batch_size (int): Number of `Doc` or `Example` objects to group. - YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input. - """ - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] - tensors = self.predict(docs) - self.set_annotations(docs, tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs - - def predict(self, docs): - """Return a single tensor for a batch of documents. - - docs (iterable): A sequence of `Doc` objects. - RETURNS (object): Vector representations for each token in the docs. - """ - inputs = self.model.ops.flatten([doc.tensor for doc in docs]) - outputs = self.model(inputs) - return self.model.ops.unflatten(outputs, [len(d) for d in docs]) - - def set_annotations(self, docs, tensors): - """Set the tensor attribute for a batch of documents. - - docs (iterable): A sequence of `Doc` objects. - tensors (object): Vector representation for each token in the docs. 
- """ - for doc, tensor in zip(docs, tensors): - if tensor.shape[0] != len(doc): - raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) - doc.tensor = tensor - - def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None): - """Update the model. - - docs (iterable): A batch of `Doc` objects. - golds (iterable): A batch of `GoldParse` objects. - drop (float): The dropout rate. - sgd (callable): An optimizer. - RETURNS (dict): Results from the update. - """ - examples = Example.to_example_objects(examples) - inputs = [] - bp_inputs = [] - set_dropout_rate(self.model, drop) - for tok2vec in self.input_models: - set_dropout_rate(tok2vec, drop) - tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples]) - inputs.append(tensor) - bp_inputs.append(bp_tensor) - inputs = self.model.ops.xp.hstack(inputs) - scores, bp_scores = self.model.begin_update(inputs) - loss, d_scores = self.get_loss(examples, scores) - d_inputs = bp_scores(d_scores, sgd=sgd) - d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) - for d_input, bp_input in zip(d_inputs, bp_inputs): - bp_input(d_input) - if sgd is not None: - for tok2vec in self.input_models: - tok2vec.finish_update(sgd) - self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - return loss - - def get_loss(self, examples, prediction): - examples = Example.to_example_objects(examples) - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) - target = self.vocab.vectors.data[ids] - d_scores = (prediction - target) / prediction.shape[0] - loss = (d_scores ** 2).sum() - return loss, d_scores - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - """Allocate models, pre-process training data and acquire an - optimizer. - - get_examples (iterable): Gold-standard training data. - pipeline (list): The pipeline the model is part of. - """ - if pipeline is not None: - for name, model in pipeline: - if model.has_ref("tok2vec"): - self.input_models.append(model.get_ref("tok2vec")) - self.model.initialize() - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - @component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
@@ -1708,4 +1576,4 @@ def ner_factory(nlp, model, **cfg):
     warnings.warn(Warnings.W098.format(name="ner"))
     return EntityRecognizer.from_nlp(nlp, model, **cfg)
 
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 725a4fd69..179659597 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -123,9 +123,9 @@ def test_overfitting_IO():
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
-        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2},
-        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1},
-        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3},
+        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
+        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
+        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False},
     ],
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index ba63adfa4..870a980f2 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -24,6 +24,7 @@ window_size = 1
 embed_size = 2000
 maxout_pieces = 3
 subword_features = true
+dropout = null
 
 [nlp.pipeline.tagger]
 factory = "tagger"
@@ -53,6 +54,7 @@ embed_size = 5555
 window_size = 1
 maxout_pieces = 7
 subword_features = false
+dropout = null
 """
@@ -70,6 +72,7 @@ def my_parser():
         nC=8,
         conv_depth=2,
         bilstm_depth=0,
+        dropout=None,
     )
     parser = build_tb_parser_model(
         tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 4fc277c4f..595a35a9f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,7 +1,7 @@
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.pipeline import TextCategorizer, SentenceRecognizer
+from spacy.pipeline.defaults import default_parser, default_tagger
 from spacy.pipeline.defaults import default_textcat, default_senter
 from ..util import make_tempdir
 
@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
     assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
 
 
-def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
-    tensorizer = Tensorizer(en_vocab, default_tensorizer())
-    tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
-    new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
-    assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
-
-
-def test_serialize_tensorizer_roundtrip_disk(en_vocab):
-    tensorizer = Tensorizer(en_vocab, default_tensorizer())
-    with make_tempdir() as d:
-        file_path = d / "tensorizer"
-        tensorizer.to_disk(file_path)
-        tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
-        assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
-            exclude=["vocab"]
-        )
-
-
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     textcat = TextCategorizer(
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index 9c2e9004b..ee1f9dead 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -15,7 +15,7 @@ def test_empty_doc():
     vocab = Vocab()
     doc = Doc(vocab, words=[])
     # TODO: fix tok2vec arguments
-    tok2vec = build_Tok2Vec_model(width, embed_size)
+    tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
     vectors, backprop = tok2vec.begin_update([doc])
     assert len(vectors) == 1
     assert vectors[0].shape == (0, width)
@@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
         char_embed=False,
         nM=64,
         nC=8,
+        dropout=None,
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update(batch)
@@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
 @pytest.mark.parametrize(
     "tok2vec_config",
     [
-        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
-        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
-        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
-        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
-        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
+        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
     ],
 )
 # fmt: on