Merge pull request #5543 from svlandeg/feature/pretrain-config

pretrain from config
Matthew Honnibal 2020-06-04 19:07:12 +02:00 committed by GitHub
commit 8411d4f4e6
39 changed files with 421 additions and 575 deletions

View File

@ -25,6 +25,7 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
vectors = null
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
@ -32,7 +33,7 @@ start = 1000
stop = 1000
compound = 1.001
[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
@ -113,3 +114,4 @@ window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -0,0 +1,137 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
vectors = null
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[pretraining.loss_func]
@losses = "CosineDistance.v1"
normalize = true
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.senter]
factory = "senter"
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
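
A minimal sketch (not part of this commit; the file name "defaults.cfg" is a placeholder) of how a config like the one above is consumed, based on the util.load_config calls that appear later in this diff: a first pass with create_objects=False returns a plain nested dict for inspection, and a second pass with create_objects=True instantiates the registered functions (@optimizers, @architectures, ...), so both training and pretraining read their settings from one file.

# Illustrative only -- mirrors the API calls visible in this commit.
from spacy import util

# Plain nested dict: inspect settings without constructing any objects.
config = util.load_config("defaults.cfg", create_objects=False)
seed = config["training"]["seed"]                  # 0
max_epochs = config["pretraining"]["max_epochs"]   # 1000

# Registry references are resolved, so the optimizer is a usable object.
config = util.load_config("defaults.cfg", create_objects=True)
optimizer = config["pretraining"]["optimizer"]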

View File

@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
@ -21,7 +22,7 @@ start = 100
stop = 1000
compound = 1.001
[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@ -65,3 +66,4 @@ depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null

View File

@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
@ -21,7 +22,7 @@ start = 100
stop = 1000
compound = 1.001
[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@ -66,3 +67,4 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -12,8 +12,9 @@ max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false
[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@ -36,6 +37,7 @@ nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null
[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"

View File

@ -11,6 +11,7 @@ gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
@ -19,7 +20,7 @@ stop = 3000
compound = 1.001
[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@ -44,3 +45,4 @@ maxout_pieces = 3
window_size = 1
subword_features = true
pretrained_vectors = null
dropout = null

View File

@ -1,212 +0,0 @@
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
"""
import plac
import tqdm
import random
import ml_datasets
import spacy
from spacy.util import minibatch
from spacy.pipeline import TextCategorizer
from spacy.ml.models.tok2vec import build_Tok2Vec_model
import numpy
def load_texts(limit=0):
train, dev = ml_datasets.imdb()
train_texts, train_labels = zip(*train)
dev_texts, dev_labels = zip(*train)
train_texts = list(train_texts)
dev_texts = list(dev_texts)
random.shuffle(train_texts)
random.shuffle(dev_texts)
if limit >= 1:
return train_texts[:limit]
else:
return list(train_texts) + list(dev_texts)
def load_textcat_data(limit=0):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, eval_data = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
eval_texts, eval_labels = zip(*eval_data)
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
return (texts, cats), (eval_texts, eval_cats)
def prefer_gpu():
used = spacy.util.use_gpu(0)
if used is None:
return False
else:
import cupy.random
cupy.random.seed(0)
return True
def build_textcat_model(tok2vec, nr_class, width):
from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
with Model.define_operators({">>": chain}):
model = (
tok2vec
>> list2ragged()
>> reduce_mean()
>> Softmax(nr_class, width)
)
model.set_ref("tok2vec", tok2vec)
return model
def block_gradients(model):
from thinc.api import wrap # TODO FIX
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)
return Y, None
return wrap(forward, model)
def create_pipeline(width, embed_size, vectors_model):
print("Load vectors")
nlp = spacy.load(vectors_model)
print("Start training")
textcat = TextCategorizer(
nlp.vocab,
labels=["POSITIVE", "NEGATIVE"],
# TODO: replace with config version
model=build_textcat_model(
build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
),
)
nlp.add_pipe(textcat)
return nlp
def train_tensorizer(nlp, texts, dropout, n_iter):
tensorizer = nlp.create_pipe("tensorizer")
nlp.add_pipe(tensorizer)
optimizer = nlp.begin_training()
for i in range(n_iter):
losses = {}
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
docs = [nlp.make_doc(text) for text in batch]
tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
print(losses)
return optimizer
def train_textcat(nlp, n_texts, n_iter=10):
textcat = nlp.get_pipe("textcat")
tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
)
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
with nlp.select_pipes(enable="textcat"): # only train textcat
optimizer = nlp.begin_training()
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {"textcat": 0.0}
# batch up the examples using spaCy's minibatch
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
print(
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
losses["textcat"],
scores["textcat_p"],
scores["textcat_r"],
scores["textcat_f"],
)
)
def evaluate_textcat(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8
fp = 1e-8
tn = 1e-8
fn = 1e-8
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.0
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.0
elif score < 0.5 and gold[label] < 0.5:
tn += 1
elif score < 0.5 and gold[label] >= 0.5:
fn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
@plac.annotations(
width=("Width of CNN layers", "positional", None, int),
embed_size=("Embedding rows", "positional", None, int),
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
train_iters=("Number of iterations to pretrain", "option", "tn", int),
train_examples=("Number of labelled examples", "option", "eg", int),
vectors_model=("Name or path to vectors model to learn from"),
)
def main(
width,
embed_size,
vectors_model,
pretrain_iters=30,
train_iters=30,
train_examples=1000,
):
random.seed(0)
numpy.random.seed(0)
use_gpu = prefer_gpu()
print("Using GPU?", use_gpu)
nlp = create_pipeline(width, embed_size, vectors_model)
print("Load data")
texts = load_texts(limit=0)
print("Train tensorizer")
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
print("Train textcat")
train_textcat(nlp, train_examples, n_iter=train_iters)
if __name__ == "__main__":
plac.call(main)
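
A small worked example (illustrative numbers, not from the commit) of the evaluation formulas used in evaluate_textcat in the script removed above; the 1e-8 initial counts only guard against division by zero.

# Worked precision / recall / F-score example for the formulas above.
tp, fp, fn = 40.0, 10.0, 20.0
precision = tp / (tp + fp)                                 # 0.8
recall = tp / (tp + fn)                                    # ~0.667
f_score = 2 * (precision * recall) / (precision + recall)  # ~0.727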

View File

@ -2,16 +2,15 @@ if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import download, link, info, package, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_from_config_cli
from spacy.cli import train_cli
commands = {
"download": download,
"link": link,
"info": info,
"train": train,
"train-from-config": train_from_config_cli,
"train": train_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,

View File

View File

@ -4,8 +4,7 @@ from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .train_from_config import train_from_config_cli # noqa: F401
from .train_from_config import train_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401

View File

@ -3,48 +3,39 @@ import numpy
import time
import re
from collections import Counter
import plac
from pathlib import Path
from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
from thinc.api import CosineDistance, L2Distance
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
from wasabi import msg
import srsly
from ..gold import Example
from ..errors import Errors
from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc
from ..attrs import ID, HEAD
from ..ml.models.tok2vec import build_Tok2Vec_model
from .. import util
from ..util import create_default_optimizer
from .train import _load_pretrained_tok2vec
from ..gold import Example
def pretrain(
@plac.annotations(
# fmt: off
texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
output_dir: ("Directory to write models to on each epoch", "positional", None, str),
width: ("Width of CNN layers", "option", "cw", int) = 96,
conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4,
bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
dropout: ("Dropout rate", "option", "d", float) = 0.2,
n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
seed: ("Seed for random number generators", "option", "s", int) = 0,
n_save_every: ("Save model every X batches.", "option", "se", int) = None,
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
use_gpu=("Use GPU", "option", "g", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int),
# fmt: on
)
def pretrain(
texts_loc,
vectors_model,
config_path,
output_dir,
use_gpu=-1,
resume_path=None,
epoch_resume=None,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@ -58,34 +49,46 @@ def pretrain(
However, it's still quite experimental, so your mileage may vary.
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. The API and
errors around this need some improvement.
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
"""
config = dict(locals())
for key in config:
if isinstance(config[key], Path):
config[key] = str(config[key])
util.fix_random_seed(seed)
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
has_gpu = prefer_gpu()
if has_gpu:
import torch
if use_gpu >= 0:
msg.info("Using GPU")
util.use_gpu(use_gpu)
else:
msg.info("Using CPU")
torch.set_default_tensor_type("torch.cuda.FloatTensor")
msg.info("Using GPU" if has_gpu else "Not using GPU")
msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["pretraining"]["seed"])
if config["pretraining"]["use_pytorch_for_gpu_memory"]:
use_pytorch_for_gpu_memory()
output_dir = Path(output_dir)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
msg.warn(
"Output directory is not empty",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if resume_path:
msg.warn(
"Output directory is not empty. ",
"If you're resuming a run from a previous model in this directory, "
"the old models for the consecutive epochs will be overwritten "
"with the new ones.",
)
else:
msg.warn(
"Output directory is not empty. ",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
msg.good("Saved config file in the output directory")
config = util.load_config(config_path, create_objects=True)
pretrain_config = config["pretraining"]
# Load texts from file or stdin
if texts_loc != "-": # reading from a file
@ -99,57 +102,50 @@ def pretrain(
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
msg.info("Reading input text from stdin...")
texts = srsly.read_jsonl("-")
with msg.loading(f"Loading model '{vectors_model}'..."):
nlp = util.load_model(vectors_model)
msg.good(f"Loaded model '{vectors_model}'")
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
model = create_pretraining_model(
nlp,
# TODO: replace with config
build_Tok2Vec_model(
width,
embed_rows,
conv_depth=conv_depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
subword_features=not use_chars, # Set to False for Chinese etc
maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
window_size=1,
char_embed=False,
nM=64,
nC=8,
),
)
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text(f"Loaded pretrained tok2vec for: {components}")
tok2vec_path = pretrain_config["tok2vec_model"]
tok2vec = config
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
model = create_pretraining_model(nlp, tok2vec)
optimizer = pretrain_config["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
# Parse the epoch number from the given weight file
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
model_name = re.search(r"model\d+\.bin", str(resume_path))
if model_name:
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
epoch_start = int(model_name.group(0)[5:][:-4]) + 1
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
if not epoch_start:
if not epoch_resume:
msg.fail(
"You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec",
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
exits=True,
)
elif epoch_start < 0:
elif epoch_resume < 0:
msg.fail(
f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid",
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
exits=True,
)
else:
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
epoch_start = 0
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
optimizer = create_default_optimizer()
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
@ -168,28 +164,27 @@ def pretrain(
file_.write(srsly.json_dumps(log) + "\n")
skip_counter = 0
for epoch in range(epoch_start, n_iter + epoch_start):
for batch_id, batch in enumerate(
util.minibatch_by_words(
(Example(doc=text) for text in texts), size=batch_size
)
):
loss_func = pretrain_config["loss_func"]
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
examples = [Example(doc=text) for text in texts]
batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"])
for batch_id, batch in enumerate(batches):
docs, count = make_docs(
nlp,
[text for (text, _) in batch],
max_length=max_length,
min_length=min_length,
[ex.doc for ex in batch],
max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"],
)
skip_counter += count
loss = make_update(
model, docs, optimizer, objective=loss_func, drop=dropout
)
loss = make_update(model, docs, optimizer, distance=loss_func)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
if n_save_every and (batch_id % n_save_every == 0):
if pretrain_config["n_save_every"] and (
batch_id % pretrain_config["n_save_every"] == 0
):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
@ -201,17 +196,17 @@ def pretrain(
msg.good("Successfully finished pretrain")
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
def make_update(model, docs, optimizer, distance):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
drop (float): The dropout rate.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs, drop=drop)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
backprop(gradients, sgd=optimizer)
predictions, backprop = model.begin_update(docs)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance)
backprop(gradients)
model.finish_update(optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
@ -243,12 +238,12 @@ def make_docs(nlp, batch, min_length, max_length):
heads = numpy.asarray(heads, dtype="uint64")
heads = heads.reshape((len(doc), 1))
doc = doc.from_array([HEAD], heads)
if len(doc) >= min_length and len(doc) < max_length:
if min_length <= len(doc) < max_length:
docs.append(doc)
return docs, skip_count
def get_vectors_loss(ops, docs, prediction, objective="L2"):
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@ -262,13 +257,6 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
# TODO: this code originally didn't normalize, but shouldn't normalize=True ?
if objective == "L2":
distance = L2Distance(normalize=False)
elif objective == "cosine":
distance = CosineDistance(normalize=False)
else:
raise ValueError(Errors.E142.format(loss_func=objective))
d_target, loss = distance(prediction, target)
return loss, d_target
@ -281,7 +269,7 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size)
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
@ -289,11 +277,12 @@ def create_pretraining_model(nlp, tok2vec):
# "tok2vec" has to be the same set of processes as what the components do.
tok2vec = chain(tok2vec, list2array())
model = chain(tok2vec, output_layer)
model = build_masked_language_model(nlp.vocab, model)
model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", output_layer)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
return model
mlm_model = build_masked_language_model(nlp.vocab, model)
mlm_model.set_ref("tok2vec", tok2vec)
mlm_model.set_ref("output_layer", output_layer)
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
return mlm_model
class ProgressTracker(object):
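
A worked sketch (illustrative, not code from this commit) of the resume logic shown above: when --resume-path points at a weight file that still has its default name, the epoch to resume from is parsed out of the filename; for a renamed file, --epoch-resume must be passed explicitly.

# Illustrative helper mirroring the filename parsing used for --resume-path.
import re

def parse_epoch_resume(resume_path: str):
    # Default weight files are written as model0.bin, model1.bin, ...
    match = re.search(r"model\d+\.bin", resume_path)
    if match:
        # Cut off "model" and ".bin", then resume at the following epoch.
        return int(match.group(0)[5:][:-4]) + 1
    return None  # renamed file: the caller must supply --epoch-resume

parse_epoch_resume("output/model7.bin")        # -> 8
parse_epoch_resume("output/best_weights.bin")  # -> None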

View File

@ -13,6 +13,7 @@ import random
from ..gold import GoldCorpus
from .. import util
from ..errors import Errors
from ..ml import models # don't remove - required to load the built-in architectures
registry = util.registry
@ -123,7 +124,7 @@ class ConfigSchema(BaseModel):
use_gpu=("Use GPU", "option", "g", int),
# fmt: on
)
def train_from_config_cli(
def train_cli(
train_path,
dev_path,
config_path,
@ -132,7 +133,7 @@ def train_from_config_cli(
raw_text=None,
debug=False,
verbose=False,
use_gpu=-1
use_gpu=-1,
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
@ -156,7 +157,7 @@ def train_from_config_cli(
else:
msg.info("Using CPU")
train_from_config(
train(
config_path,
{"train": train_path, "dev": dev_path},
output_path=output_path,
@ -165,10 +166,11 @@ def train_from_config_cli(
)
def train_from_config(
def train(
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
):
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"])
if config["training"]["use_pytorch_for_gpu_memory"]:
@ -177,8 +179,8 @@ def train_from_config(
config = util.load_config(config_path, create_objects=True)
msg.info("Creating nlp from config")
nlp = util.load_model_from_config(nlp_config)
optimizer = config["optimizer"]
training = config["training"]
optimizer = training["optimizer"]
limit = training["limit"]
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg):
if len(train_examples) == 0:
raise ValueError(Errors.E988)
random.shuffle(train_examples)
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"])
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try:
first = next(batches)
yield first
except StopIteration:
raise ValueError(Errors.E986)
for batch in batches:
yield batch
epochs_todo -= 1
# We intentionally compare exactly to 0 here, so that max_epochs < 1
# will not break.
if epochs_todo == 0:
if epochs_todo == 0:
break
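
A hedged sketch (not from the commit) of the guard pattern added in create_train_batches above: the first batch is pulled eagerly so that an exhausted generator, for example when discard_oversize drops every example, fails fast instead of producing an empty, infinite training loop.

# Illustrative only -- mirrors the try/next/yield pattern in the diff above.
def checked_batches(batches):
    try:
        first = next(batches)
    except StopIteration:
        raise ValueError(
            "Could not create any training batches: check your input. "
            "Perhaps discard_oversize should be set to False?"
        )
    yield first
    yield from batches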

View File

@ -453,8 +453,6 @@ class Errors(object):
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
"provided {found}.")
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
"'cosine'.")
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
"call add_label()?")
E144 = ("Could not find parameter `{param}` when building the entity "
@ -577,6 +575,8 @@ class Errors(object):
# TODO: fix numbering after merging develop into master
E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or "
"a string, but found {type} instead.")
E988 = ("Could not parse any training examples. Ensure the data is "

View File

@ -231,10 +231,6 @@ class Language(object):
# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
def tensorizer(self):
return self.get_pipe("tensorizer")
@property
def tagger(self):
return self.get_pipe("tagger")

View File

@ -0,0 +1 @@
from .models import *

View File

@ -2,6 +2,5 @@ from .entity_linker import * # noqa
from .parser import * # noqa
from .simple_ner import *
from .tagger import * # noqa
from .tensorizer import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa

View File

@ -1,4 +1,6 @@
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init
import numpy
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96):
@ -24,6 +26,80 @@ def build_cloze_multi_task_model(vocab, tok2vec):
return model
def build_masked_language_model(*args, **kwargs):
# TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828
raise NotImplementedError
def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
"""Convert a model into a BERT-style masked language model"""
random_words = _RandomWords(vocab)
def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop
def mlm_backward(d_output):
d_output *= 1 - mask
return backprop(d_output)
return output, mlm_backward
mlm_model = Model("masked-language-model", mlm_forward, layers=[wrapped_model])
mlm_model.set_ref("wrapped-model", wrapped_model)
return mlm_model
class _RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum()
self._cache = []
def next(self):
if not self._cache:
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop()
return self.words[index]
def _apply_mask(docs, random_words, mask_prob=0.15):
# This needs to be here to avoid circular imports
from ...tokens import Doc
N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob
i = 0
masked_docs = []
for doc in docs:
words = []
for token in doc:
if not mask[i]:
word = _replace_word(token.text, random_words)
else:
word = token.text
words.append(word)
i += 1
spaces = [bool(w.whitespace_) for w in doc]
# NB: If you change this implementation to instead modify
# the docs in place, take care that the IDs reflect the original
# words. Currently we use the original docs to make the vectors
# for the target, so we don't lose the original tokens. But if
# you modified the docs in place here, you would.
masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
return mask, masked_docs
def _replace_word(word, random_words, mask="[MASK]"):
roll = numpy.random.random()
if roll < 0.8:
return mask
elif roll < 0.9:
return random_words.next()
else:
return word
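
The numbers below (illustrative, not part of the commit) spell out the BERT-style corruption rates implied by _apply_mask and _replace_word above, with the default mask_prob of 0.15.

# Worked example of the corruption schedule defined above (mask_prob = 0.15).
mask_prob = 0.15
p_mask_token   = mask_prob * 0.8  # ~12%  of tokens become "[MASK]"
p_random_word  = mask_prob * 0.1  # ~1.5% are swapped for a random vocab word
p_kept_corrupt = mask_prob * 0.1  # ~1.5% keep their text but still get a gradient
# The remaining ~85% pass through unchanged; `d_output *= 1 - mask` in
# mlm_backward zeroes their gradients, so the loss only covers selected positions.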

View File

@ -1,10 +0,0 @@
from thinc.api import Linear, zero_init
from ... import util
from ...util import registry
@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
input_size = util.env_opt("token_vector_width", input_size)
return Linear(output_size, input_size, init_W=zero_init)

View File

@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
window_size, conv_depth, nO=None):
window_size, conv_depth, dropout, nO=None):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER))
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX))
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX))
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE))
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class
@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
nlp = util.load_model(pretrained_vectors)
vectors = nlp.vocab.vectors
vector_dim = vectors.data.shape[1]
@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
>> reduce_sum()
>> residual(Relu(width, width)) ** 2
>> Linear(nO, width)
>> Dropout(0.0)
>> Logistic()
)
if dropout:
model = model >> Dropout(dropout)
model = model >> Logistic()
return model
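
A minimal sketch (assumed Thinc 8 API, not code from this commit) of the pattern introduced above, where `dropout = null` in the config arrives as None and skips the Dropout layer entirely instead of appending Dropout(0.0).

# Illustrative: an optional dropout layer driven by a nullable config value.
from thinc.api import Dropout, Logistic, chain

def with_optional_dropout(model, dropout):
    # dropout comes straight from the config; null -> None -> no extra layer
    if dropout:
        model = chain(model, Dropout(dropout))
    return chain(model, Logistic())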

View File

@ -49,6 +49,7 @@ def hash_embed_cnn(
maxout_pieces,
window_size,
subword_features,
dropout,
):
# Does not use character embeddings: set to False by default
return build_Tok2Vec_model(
@ -63,6 +64,7 @@ def hash_embed_cnn(
char_embed=False,
nM=0,
nC=0,
dropout=dropout,
)
@ -76,6 +78,7 @@ def hash_charembed_cnn(
window_size,
nM,
nC,
dropout,
):
# Allows using character embeddings by setting nC, nM and char_embed=True
return build_Tok2Vec_model(
@ -90,12 +93,13 @@ def hash_charembed_cnn(
char_embed=True,
nM=nM,
nC=nC,
dropout=dropout,
)
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
):
# Does not use character embeddings: set to False by default
return build_Tok2Vec_model(
@ -110,12 +114,13 @@ def hash_embed_bilstm_v1(
char_embed=False,
nM=0,
nC=0,
dropout=dropout,
)
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
def hash_char_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC
pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout
):
# Allows using character embeddings by setting nC, nM and char_embed=True
return build_Tok2Vec_model(
@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1(
char_embed=True,
nM=nM,
nC=nC,
dropout=dropout,
)
@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces):
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
if use_subwords:
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"))
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"))
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"))
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
if pretrained_vectors:
glove = StaticVectors(
vectors=pretrained_vectors.data,
nO=width,
column=columns.index(ID),
dropout=0.0,
dropout=dropout,
)
with Model.define_operators({">>": chain, "|": concatenate}):
@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
embed_layer = norm
else:
if use_subwords and pretrained_vectors:
nr_columns = 5
concat_columns = glove | norm | prefix | suffix | shape
elif use_subwords:
nr_columns = 4
concat_columns = norm | prefix | suffix | shape
else:
nr_columns = 2
concat_columns = glove | norm
embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH"))
@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(columns, width, rows, nM, nC, features):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
with Model.define_operators({">>": chain, "|": concatenate}):
embed_layer = chr_embed | features >> with_array(norm)
@ -238,16 +241,17 @@ def build_Tok2Vec_model(
nC,
conv_depth,
bilstm_depth,
dropout,
) -> Model:
if char_embed:
subword_features = False
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM))
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX))
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX))
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE))
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:
@ -255,7 +259,7 @@ def build_Tok2Vec_model(
vectors=pretrained_vectors.data,
nO=width,
column=cols.index(ID),
dropout=0.0,
dropout=dropout,
)
if subword_features:

View File

@ -1,5 +1,5 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
@ -14,7 +14,6 @@ __all__ = [
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
"Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",

View File

@ -63,16 +63,6 @@ def default_tagger():
return util.load_config(loc, create_objects=True)["model"]
def default_tensorizer_config():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_tensorizer():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)

View File

@ -10,3 +10,4 @@ embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -11,3 +11,4 @@ window_size = 1
maxout_pieces = 3
nM = 64
nC = 8
dropout = null

View File

@ -13,3 +13,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -13,3 +13,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -10,3 +10,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null

View File

@ -10,3 +10,4 @@ embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -10,3 +10,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -1,4 +0,0 @@
[model]
@architectures = "spacy.Tensorizer.v1"
input_size=96
output_size=300

View File

@ -11,3 +11,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -7,3 +7,4 @@ conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = null

View File

@ -7,3 +7,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -44,8 +44,8 @@ class SentenceSegmenter(object):
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
similarity into `Doc` objects.
The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
"""Allocate model, using nO from the first model in the pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.

View File

@ -16,7 +16,7 @@ from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .defaults import default_tagger, default_parser, default_ner, default_textcat
from .defaults import default_nel, default_senter, default_tensorizer
from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
@ -238,138 +238,6 @@ class Pipe(object):
return self
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
def __init__(self, vocab, model, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same
`Vocab` instance with the `Doc` objects it will process.
**cfg: Config parameters.
"""
self.vocab = vocab
self.model = model
self.input_models = []
self.cfg = dict(cfg)
def __call__(self, example):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
docs (Doc or iterable): One or more documents to add vectors to.
RETURNS (dict or None): Intermediate computations.
"""
doc = self._get_doc(example)
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` or `Example` objects to process.
batch_size (int): Number of `Doc` or `Example` objects to group.
YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
"""
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
def set_annotations(self, docs, tensors):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tensors (object): Vector representation for each token in the docs.
"""
for doc, tensor in zip(docs, tensors):
if tensor.shape[0] != len(doc):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
"""
examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
set_dropout_rate(self.model, drop)
for tok2vec in self.input_models:
set_dropout_rate(tok2vec, drop)
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs)
loss, d_scores = self.get_loss(examples, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input)
if sgd is not None:
for tok2vec in self.input_models:
tok2vec.finish_update(sgd)
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return loss
def get_loss(self, examples, prediction):
examples = Example.to_example_objects(examples)
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = self.vocab.vectors.data[ids]
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()
return loss, d_scores
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
"""Allocate models, pre-process training data and acquire an
optimizer.
get_examples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
if pipeline is not None:
for name, model in pipeline:
if model.has_ref("tok2vec"):
self.input_models.append(model.get_ref("tok2vec"))
self.model.initialize()
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
@ -1708,4 +1576,4 @@ def ner_factory(nlp, model, **cfg):
warnings.warn(Warnings.W098.format(name="ner"))
return EntityRecognizer.from_nlp(nlp, model, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]

View File

@ -123,9 +123,9 @@ def test_overfitting_IO():
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2},
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1},
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3},
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
{"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True},
{"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False},
],

View File

@ -24,6 +24,7 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null
[nlp.pipeline.tagger]
factory = "tagger"
@ -53,6 +54,7 @@ embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
dropout = null
"""
@ -70,6 +72,7 @@ def my_parser():
nC=8,
conv_depth=2,
bilstm_depth=0,
dropout=None,
)
parser = build_tb_parser_model(
tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

View File

@ -1,7 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
from spacy.pipeline import TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tagger
from spacy.pipeline.defaults import default_textcat, default_senter
from ..util import make_tempdir
@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
tensorizer = Tensorizer(en_vocab, default_tensorizer())
tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
tensorizer = Tensorizer(en_vocab, default_tensorizer())
with make_tempdir() as d:
file_path = d / "tensorizer"
tensorizer.to_disk(file_path)
tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
exclude=["vocab"]
)
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
textcat = TextCategorizer(

View File

@ -15,7 +15,7 @@ def test_empty_doc():
vocab = Vocab()
doc = Doc(vocab, words=[])
# TODO: fix tok2vec arguments
tok2vec = build_Tok2Vec_model(width, embed_size)
tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
vectors, backprop = tok2vec.begin_update([doc])
assert len(vectors) == 1
assert vectors[0].shape == (0, width)
@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
char_embed=False,
nM=64,
nC=8,
dropout=None,
)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(batch)
@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
@pytest.mark.parametrize(
"tok2vec_config",
[
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
{"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
],
)
# fmt: on