mirror of https://github.com/explosion/spaCy.git

Merge pull request #5543 from svlandeg/feature/pretrain-config
pretrain from config

Commit: 8411d4f4e6
@@ -25,6 +25,7 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
vectors = null
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -32,7 +33,7 @@ start = 1000
stop = 1000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
@@ -113,3 +114,4 @@ window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
@@ -0,0 +1,137 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
vectors = null
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"

[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001

[pretraining.loss_func]
@losses = "CosineDistance.v1"
normalize = true

[nlp]
lang = "en"
vectors = ${training:vectors}

[nlp.pipeline.tok2vec]
factory = "tok2vec"

[nlp.pipeline.senter]
factory = "senter"

[nlp.pipeline.ner]
factory = "ner"

[nlp.pipeline.tagger]
factory = "tagger"

[nlp.pipeline.parser]
factory = "parser"

[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"

[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false

[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}

[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
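For context, the [pretraining] block above is what the reworked pretrain command reads through spaCy's config system. A minimal sketch of that flow, assuming the file is saved as pretrain.cfg (the name is a placeholder) and using util.load_config the same way the command itself does in this diff:

from pathlib import Path
from spacy import util

config_path = Path("pretrain.cfg")

# Read the raw settings first (no object creation) to fix the seed early,
# mirroring what the CLI does before it builds anything.
raw_config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(raw_config["pretraining"]["seed"])

# Then load again with create_objects=True so registered functions such as
# "Adam.v1" and "CosineDistance.v1" are resolved into real objects.
config = util.load_config(config_path, create_objects=True)
pretrain_config = config["pretraining"]
optimizer = pretrain_config["optimizer"]
loss_func = pretrain_config["loss_func"]

# tok2vec_model is a dotted path ("nlp.pipeline.tok2vec.model") into the
# created config; walk it to find the layer that will be pre-trained.
tok2vec = config
for subpath in pretrain_config["tok2vec_model"].split("."):
    tok2vec = tok2vec.get(subpath)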
@@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -21,7 +22,7 @@ start = 100
stop = 1000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -65,3 +66,4 @@ depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null
@@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -21,7 +22,7 @@ start = 100
stop = 1000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -66,3 +67,4 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null
@@ -12,8 +12,9 @@ max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -36,6 +37,7 @@ nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null

[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"
@@ -11,6 +11,7 @@ gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false

[training.batch_size]
@schedules = "compounding.v1"
@@ -19,7 +20,7 @@ stop = 3000
compound = 1.001

[optimizer]
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
@@ -44,3 +45,4 @@ maxout_pieces = 3
window_size = 1
subword_features = true
pretrained_vectors = null
dropout = null
@ -1,212 +0,0 @@
|
|||
"""This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pretrained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
"""
|
||||
import plac
|
||||
import tqdm
|
||||
import random
|
||||
|
||||
import ml_datasets
|
||||
|
||||
import spacy
|
||||
from spacy.util import minibatch
|
||||
from spacy.pipeline import TextCategorizer
|
||||
from spacy.ml.models.tok2vec import build_Tok2Vec_model
|
||||
import numpy
|
||||
|
||||
|
||||
def load_texts(limit=0):
|
||||
train, dev = ml_datasets.imdb()
|
||||
train_texts, train_labels = zip(*train)
|
||||
dev_texts, dev_labels = zip(*train)
|
||||
train_texts = list(train_texts)
|
||||
dev_texts = list(dev_texts)
|
||||
random.shuffle(train_texts)
|
||||
random.shuffle(dev_texts)
|
||||
if limit >= 1:
|
||||
return train_texts[:limit]
|
||||
else:
|
||||
return list(train_texts) + list(dev_texts)
|
||||
|
||||
|
||||
def load_textcat_data(limit=0):
|
||||
"""Load data from the IMDB dataset."""
|
||||
# Partition off part of the train data for evaluation
|
||||
train_data, eval_data = ml_datasets.imdb()
|
||||
random.shuffle(train_data)
|
||||
train_data = train_data[-limit:]
|
||||
texts, labels = zip(*train_data)
|
||||
eval_texts, eval_labels = zip(*eval_data)
|
||||
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
|
||||
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
|
||||
return (texts, cats), (eval_texts, eval_cats)
|
||||
|
||||
|
||||
def prefer_gpu():
|
||||
used = spacy.util.use_gpu(0)
|
||||
if used is None:
|
||||
return False
|
||||
else:
|
||||
import cupy.random
|
||||
|
||||
cupy.random.seed(0)
|
||||
return True
|
||||
|
||||
|
||||
def build_textcat_model(tok2vec, nr_class, width):
|
||||
from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
|
||||
|
||||
with Model.define_operators({">>": chain}):
|
||||
model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> reduce_mean()
|
||||
>> Softmax(nr_class, width)
|
||||
)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
return model
|
||||
|
||||
|
||||
def block_gradients(model):
|
||||
from thinc.api import wrap # TODO FIX
|
||||
|
||||
def forward(X, drop=0.0):
|
||||
Y, _ = model.begin_update(X, drop=drop)
|
||||
return Y, None
|
||||
|
||||
return wrap(forward, model)
|
||||
|
||||
|
||||
def create_pipeline(width, embed_size, vectors_model):
|
||||
print("Load vectors")
|
||||
nlp = spacy.load(vectors_model)
|
||||
print("Start training")
|
||||
textcat = TextCategorizer(
|
||||
nlp.vocab,
|
||||
labels=["POSITIVE", "NEGATIVE"],
|
||||
# TODO: replace with config version
|
||||
model=build_textcat_model(
|
||||
build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
|
||||
),
|
||||
)
|
||||
|
||||
nlp.add_pipe(textcat)
|
||||
return nlp
|
||||
|
||||
|
||||
def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||
tensorizer = nlp.create_pipe("tensorizer")
|
||||
nlp.add_pipe(tensorizer)
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
|
||||
docs = [nlp.make_doc(text) for text in batch]
|
||||
tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
|
||||
print(losses)
|
||||
return optimizer
|
||||
|
||||
|
||||
def train_textcat(nlp, n_texts, n_iter=10):
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||
print(
|
||||
"Using {} examples ({} training, {} evaluation)".format(
|
||||
n_texts, len(train_texts), len(dev_texts)
|
||||
)
|
||||
)
|
||||
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
|
||||
|
||||
with nlp.select_pipes(enable="textcat"): # only train textcat
|
||||
optimizer = nlp.begin_training()
|
||||
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
|
||||
print("Training the model...")
|
||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||
for i in range(n_iter):
|
||||
losses = {"textcat": 0.0}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(tqdm.tqdm(train_data), size=2)
|
||||
for batch in batches:
|
||||
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
# evaluate on the dev data split off in load_data()
|
||||
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
||||
print(
|
||||
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
|
||||
losses["textcat"],
|
||||
scores["textcat_p"],
|
||||
scores["textcat_r"],
|
||||
scores["textcat_f"],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||
docs = (tokenizer(text) for text in texts)
|
||||
tp = 1e-8
|
||||
fp = 1e-8
|
||||
tn = 1e-8
|
||||
fn = 1e-8
|
||||
for i, doc in enumerate(textcat.pipe(docs)):
|
||||
gold = cats[i]
|
||||
for label, score in doc.cats.items():
|
||||
if label not in gold:
|
||||
continue
|
||||
if score >= 0.5 and gold[label] >= 0.5:
|
||||
tp += 1.0
|
||||
elif score >= 0.5 and gold[label] < 0.5:
|
||||
fp += 1.0
|
||||
elif score < 0.5 and gold[label] < 0.5:
|
||||
tn += 1
|
||||
elif score < 0.5 and gold[label] >= 0.5:
|
||||
fn += 1
|
||||
precision = tp / (tp + fp)
|
||||
recall = tp / (tp + fn)
|
||||
f_score = 2 * (precision * recall) / (precision + recall)
|
||||
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
width=("Width of CNN layers", "positional", None, int),
|
||||
embed_size=("Embedding rows", "positional", None, int),
|
||||
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
||||
train_iters=("Number of iterations to pretrain", "option", "tn", int),
|
||||
train_examples=("Number of labelled examples", "option", "eg", int),
|
||||
vectors_model=("Name or path to vectors model to learn from"),
|
||||
)
|
||||
def main(
|
||||
width,
|
||||
embed_size,
|
||||
vectors_model,
|
||||
pretrain_iters=30,
|
||||
train_iters=30,
|
||||
train_examples=1000,
|
||||
):
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
use_gpu = prefer_gpu()
|
||||
print("Using GPU?", use_gpu)
|
||||
|
||||
nlp = create_pipeline(width, embed_size, vectors_model)
|
||||
print("Load data")
|
||||
texts = load_texts(limit=0)
|
||||
print("Train tensorizer")
|
||||
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
|
||||
print("Train textcat")
|
||||
train_textcat(nlp, train_examples, n_iter=train_iters)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@@ -2,16 +2,15 @@ if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import download, link, info, package, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_from_config_cli
from spacy.cli import train_cli

commands = {
"download": download,
"link": link,
"info": info,
"train": train,
"train-from-config": train_from_config_cli,
"train": train_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,
@@ -4,8 +4,7 @@ from .download import download  # noqa: F401
from .info import info  # noqa: F401
from .package import package  # noqa: F401
from .profile import profile  # noqa: F401
from .train import train  # noqa: F401
from .train_from_config import train_from_config_cli  # noqa: F401
from .train_from_config import train_cli  # noqa: F401
from .pretrain import pretrain  # noqa: F401
from .debug_data import debug_data  # noqa: F401
from .evaluate import evaluate  # noqa: F401
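With the command table above, the config-driven training entry point now lives under the plain "train" name: train_cli replaces train_from_config_cli, and the separate "train-from-config" alias is gone. A minimal, purely illustrative sketch of the resulting imports as a downstream script might use them:

# Illustrative only: these names follow the renames shown in the diff above.
from spacy.cli import train_cli, pretrain

commands = {"train": train_cli, "pretrain": pretrain}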
@ -3,48 +3,39 @@ import numpy
|
|||
import time
|
||||
import re
|
||||
from collections import Counter
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..gold import Example
|
||||
from ..errors import Errors
|
||||
from ..ml.models.multi_task import build_masked_language_model
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from ..ml.models.tok2vec import build_Tok2Vec_model
|
||||
from .. import util
|
||||
from ..util import create_default_optimizer
|
||||
from .train import _load_pretrained_tok2vec
|
||||
from ..gold import Example
|
||||
|
||||
|
||||
def pretrain(
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
||||
vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
||||
output_dir: ("Directory to write models to on each epoch", "positional", None, str),
|
||||
width: ("Width of CNN layers", "option", "cw", int) = 96,
|
||||
conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4,
|
||||
bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
|
||||
cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
|
||||
sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
|
||||
use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
|
||||
cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
|
||||
embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
|
||||
loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
|
||||
use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
|
||||
dropout: ("Dropout rate", "option", "d", float) = 0.2,
|
||||
n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
|
||||
batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
|
||||
max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
|
||||
min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
|
||||
seed: ("Seed for random number generators", "option", "s", int) = 0,
|
||||
n_save_every: ("Save model every X batches.", "option", "se", int) = None,
|
||||
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
|
||||
epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
|
||||
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
||||
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path),
|
||||
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int),
|
||||
# fmt: on
|
||||
)
|
||||
def pretrain(
|
||||
texts_loc,
|
||||
vectors_model,
|
||||
config_path,
|
||||
output_dir,
|
||||
use_gpu=-1,
|
||||
resume_path=None,
|
||||
epoch_resume=None,
|
||||
):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
|
@ -58,34 +49,46 @@ def pretrain(
|
|||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
all settings are the same between pretraining and training. Ideally,
|
||||
this is done by using the same config file for both commands.
|
||||
"""
|
||||
config = dict(locals())
|
||||
for key in config:
|
||||
if isinstance(config[key], Path):
|
||||
config[key] = str(config[key])
|
||||
util.fix_random_seed(seed)
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
if has_gpu:
|
||||
import torch
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
util.use_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["pretraining"]["seed"])
|
||||
if config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
||||
use_pytorch_for_gpu_memory()
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
msg.warn(
|
||||
"Output directory is not empty",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if resume_path:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"If you're resuming a run from a previous model in this directory, "
|
||||
"the old models for the consecutive epochs will be overwritten "
|
||||
"with the new ones.",
|
||||
)
|
||||
else:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
pretrain_config = config["pretraining"]
|
||||
|
||||
# Load texts from file or stdin
|
||||
if texts_loc != "-": # reading from a file
|
||||
|
@ -99,57 +102,50 @@ def pretrain(
|
|||
msg.good("Loaded input texts")
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.text("Reading input text from stdin...")
|
||||
msg.info("Reading input text from stdin...")
|
||||
texts = srsly.read_jsonl("-")
|
||||
|
||||
with msg.loading(f"Loading model '{vectors_model}'..."):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good(f"Loaded model '{vectors_model}'")
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
# TODO: replace with config
|
||||
build_Tok2Vec_model(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=conv_depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||
subword_features=not use_chars, # Set to False for Chinese etc
|
||||
maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
||||
window_size=1,
|
||||
char_embed=False,
|
||||
nM=64,
|
||||
nC=8,
|
||||
),
|
||||
)
|
||||
# Load in pretrained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text(f"Loaded pretrained tok2vec for: {components}")
|
||||
tok2vec_path = pretrain_config["tok2vec_model"]
|
||||
tok2vec = config
|
||||
for subpath in tok2vec_path.split("."):
|
||||
tok2vec = tok2vec.get(subpath)
|
||||
model = create_pretraining_model(nlp, tok2vec)
|
||||
optimizer = pretrain_config["optimizer"]
|
||||
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_start = int(model_name.group(0)[5:][:-4]) + 1
|
||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
if not epoch_start:
|
||||
if not epoch_resume:
|
||||
msg.fail(
|
||||
"You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec",
|
||||
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
|
||||
exits=True,
|
||||
)
|
||||
elif epoch_start < 0:
|
||||
elif epoch_resume < 0:
|
||||
msg.fail(
|
||||
f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid",
|
||||
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
|
||||
exits=True,
|
||||
)
|
||||
else:
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
|
||||
epoch_start = 0
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
|
||||
optimizer = create_default_optimizer()
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
|
||||
|
@ -168,28 +164,27 @@ def pretrain(
|
|||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
skip_counter = 0
|
||||
for epoch in range(epoch_start, n_iter + epoch_start):
|
||||
for batch_id, batch in enumerate(
|
||||
util.minibatch_by_words(
|
||||
(Example(doc=text) for text in texts), size=batch_size
|
||||
)
|
||||
):
|
||||
loss_func = pretrain_config["loss_func"]
|
||||
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
|
||||
examples = [Example(doc=text) for text in texts]
|
||||
batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"])
|
||||
for batch_id, batch in enumerate(batches):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
[text for (text, _) in batch],
|
||||
max_length=max_length,
|
||||
min_length=min_length,
|
||||
[ex.doc for ex in batch],
|
||||
max_length=pretrain_config["max_length"],
|
||||
min_length=pretrain_config["min_length"],
|
||||
)
|
||||
skip_counter += count
|
||||
loss = make_update(
|
||||
model, docs, optimizer, objective=loss_func, drop=dropout
|
||||
)
|
||||
loss = make_update(model, docs, optimizer, distance=loss_func)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
if n_save_every and (batch_id % n_save_every == 0):
|
||||
if pretrain_config["n_save_every"] and (
|
||||
batch_id % pretrain_config["n_save_every"] == 0
|
||||
):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
|
@ -201,17 +196,17 @@ def pretrain(
|
|||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
|
||||
def make_update(model, docs, optimizer, distance):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
drop (float): The dropout rate.
|
||||
optimizer (callable): An optimizer.
|
||||
RETURNS loss: A float for the loss.
|
||||
"""
|
||||
predictions, backprop = model.begin_update(docs, drop=drop)
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
|
||||
backprop(gradients, sgd=optimizer)
|
||||
predictions, backprop = model.begin_update(docs)
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance)
|
||||
backprop(gradients)
|
||||
model.finish_update(optimizer)
|
||||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
|
@ -243,12 +238,12 @@ def make_docs(nlp, batch, min_length, max_length):
|
|||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if len(doc) >= min_length and len(doc) < max_length:
|
||||
if min_length <= len(doc) < max_length:
|
||||
docs.append(doc)
|
||||
return docs, skip_count
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
|
||||
"""Compute a mean-squared error loss between the documents' vectors and
|
||||
the prediction.
|
||||
|
||||
|
@ -262,13 +257,6 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
|||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
# TODO: this code originally didn't normalize, but shouldn't normalize=True ?
|
||||
if objective == "L2":
|
||||
distance = L2Distance(normalize=False)
|
||||
elif objective == "cosine":
|
||||
distance = CosineDistance(normalize=False)
|
||||
else:
|
||||
raise ValueError(Errors.E142.format(loss_func=objective))
|
||||
d_target, loss = distance(prediction, target)
|
||||
return loss, d_target
|
||||
|
||||
|
@ -281,7 +269,7 @@ def create_pretraining_model(nlp, tok2vec):
|
|||
"""
|
||||
output_size = nlp.vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size)
|
||||
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
|
||||
)
|
||||
# This is annoying, but the parser etc have the flatten step after
|
||||
# the tok2vec. To load the weights in cleanly, we need to match
|
||||
|
@ -289,11 +277,12 @@ def create_pretraining_model(nlp, tok2vec):
|
|||
# "tok2vec" has to be the same set of processes as what the components do.
|
||||
tok2vec = chain(tok2vec, list2array())
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = build_masked_language_model(nlp.vocab, model)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
model.set_ref("output_layer", output_layer)
|
||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return model
|
||||
mlm_model = build_masked_language_model(nlp.vocab, model)
|
||||
mlm_model.set_ref("tok2vec", tok2vec)
|
||||
mlm_model.set_ref("output_layer", output_layer)
|
||||
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return mlm_model
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
|
|
|
@ -13,6 +13,7 @@ import random
|
|||
from ..gold import GoldCorpus
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..ml import models # don't remove - required to load the built-in architectures
|
||||
|
||||
registry = util.registry
|
||||
|
||||
|
@ -123,7 +124,7 @@ class ConfigSchema(BaseModel):
|
|||
use_gpu=("Use GPU", "option", "g", int),
|
||||
# fmt: on
|
||||
)
|
||||
def train_from_config_cli(
|
||||
def train_cli(
|
||||
train_path,
|
||||
dev_path,
|
||||
config_path,
|
||||
|
@ -132,7 +133,7 @@ def train_from_config_cli(
|
|||
raw_text=None,
|
||||
debug=False,
|
||||
verbose=False,
|
||||
use_gpu=-1
|
||||
use_gpu=-1,
|
||||
):
|
||||
"""
|
||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||
|
@ -156,7 +157,7 @@ def train_from_config_cli(
|
|||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
train_from_config(
|
||||
train(
|
||||
config_path,
|
||||
{"train": train_path, "dev": dev_path},
|
||||
output_path=output_path,
|
||||
|
@ -165,10 +166,11 @@ def train_from_config_cli(
|
|||
)
|
||||
|
||||
|
||||
def train_from_config(
|
||||
def train(
|
||||
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
||||
):
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
# Read the config first without creating objects, to get to the original nlp_config
|
||||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["training"]["seed"])
|
||||
if config["training"]["use_pytorch_for_gpu_memory"]:
|
||||
|
@ -177,8 +179,8 @@ def train_from_config(
|
|||
config = util.load_config(config_path, create_objects=True)
|
||||
msg.info("Creating nlp from config")
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
optimizer = config["optimizer"]
|
||||
training = config["training"]
|
||||
optimizer = training["optimizer"]
|
||||
limit = training["limit"]
|
||||
msg.info("Loading training corpus")
|
||||
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||
|
@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
if len(train_examples) == 0:
|
||||
raise ValueError(Errors.E988)
|
||||
random.shuffle(train_examples)
|
||||
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
|
||||
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"])
|
||||
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
||||
try:
|
||||
first = next(batches)
|
||||
yield first
|
||||
except StopIteration:
|
||||
raise ValueError(Errors.E986)
|
||||
for batch in batches:
|
||||
yield batch
|
||||
epochs_todo -= 1
|
||||
# We intentionally compare exactly to 0 here, so that max_epochs < 1
|
||||
# will not break.
|
||||
if epochs_todo == 0:
|
||||
if epochs_todo == 0:
|
||||
break
|
||||
|
||||
|
||||
|
|
|
@@ -453,8 +453,6 @@ class Errors(object):
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
"provided {found}.")
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
"'cosine'.")
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
"call add_label()?")
E144 = ("Could not find parameter `{param}` when building the entity "
@@ -577,6 +575,8 @@ class Errors(object):

# TODO: fix numbering after merging develop into master

E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or "
"a string, but found {type} instead.")
E988 = ("Could not parse any training examples. Ensure the data is "
@@ -231,10 +231,6 @@ class Language(object):

# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
def tensorizer(self):
return self.get_pipe("tensorizer")

@property
def tagger(self):
return self.get_pipe("tagger")
@@ -0,0 +1 @@
from .models import *
@@ -2,6 +2,5 @@ from .entity_linker import *  # noqa
from .parser import *  # noqa
from .simple_ner import *
from .tagger import *  # noqa
from .tensorizer import *  # noqa
from .textcat import *  # noqa
from .tok2vec import *  # noqa
@@ -1,4 +1,6 @@
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init
import numpy

from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model

def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96):
@@ -24,6 +26,80 @@ def build_cloze_multi_task_model(vocab, tok2vec):
return model

def build_masked_language_model(*args, **kwargs):
# TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828
raise NotImplementedError
def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
"""Convert a model into a BERT-style masked language model"""

random_words = _RandomWords(vocab)

def mlm_forward(model, docs, is_train):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.get_ref("wrapped-model").begin_update(docs)  # drop=drop

def mlm_backward(d_output):
d_output *= 1 - mask
return backprop(d_output)

return output, mlm_backward

mlm_model = Model("masked-language-model", mlm_forward, layers=[wrapped_model])
mlm_model.set_ref("wrapped-model", wrapped_model)

return mlm_model

class _RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum()
self._cache = []

def next(self):
if not self._cache:
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop()
return self.words[index]

def _apply_mask(docs, random_words, mask_prob=0.15):
# This needs to be here to avoid circular imports
from ...tokens import Doc

N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob
i = 0
masked_docs = []
for doc in docs:
words = []
for token in doc:
if not mask[i]:
word = _replace_word(token.text, random_words)
else:
word = token.text
words.append(word)
i += 1
spaces = [bool(w.whitespace_) for w in doc]
# NB: If you change this implementation to instead modify
# the docs in place, take care that the IDs reflect the original
# words. Currently we use the original docs to make the vectors
# for the target, so we don't lose the original tokens. But if
# you modified the docs in place here, you would.
masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
return mask, masked_docs

def _replace_word(word, random_words, mask="[MASK]"):
roll = numpy.random.random()
if roll < 0.8:
return mask
elif roll < 0.9:
return random_words.next()
else:
return word
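The new build_masked_language_model above is the piece the pretrain command wraps around its tok2vec layer. A small sketch of the intended usage, mirroring create_pretraining_model elsewhere in this diff; it assumes nlp is a loaded pipeline with vectors and tok2vec is a Thinc model mapping a batch of Doc objects to per-token vectors:

from thinc.api import Linear, Maxout, chain, list2array
from spacy.ml.models.multi_task import build_masked_language_model

output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
    Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
)
# Flatten the per-doc token vectors so the output layer sees a single array.
tok2vec = chain(tok2vec, list2array())
model = chain(tok2vec, output_layer)

# Roughly 15% of input tokens are masked or swapped for random words, and
# the backward pass only propagates gradients for those positions.
mlm_model = build_masked_language_model(nlp.vocab, model, mask_prob=0.15)
mlm_model.set_ref("tok2vec", tok2vec)
mlm_model.set_ref("output_layer", output_layer)
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])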
@@ -1,10 +0,0 @@
from thinc.api import Linear, zero_init

from ... import util
from ...util import registry

@registry.architectures.register("spacy.Tensorizer.v1")
def build_tensorizer(input_size, output_size):
input_size = util.env_opt("token_vector_width", input_size)
return Linear(output_size, input_size, init_W=zero_init)
@@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO

@registry.architectures.register("spacy.TextCat.v1")
def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
window_size, conv_depth, nO=None):
window_size, conv_depth, dropout, nO=None):
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER))
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX))
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX))
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE))
lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout)

width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
@@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class

@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
nlp = util.load_model(pretrained_vectors)
vectors = nlp.vocab.vectors
vector_dim = vectors.data.shape[1]
@@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
>> reduce_sum()
>> residual(Relu(width, width)) ** 2
>> Linear(nO, width)
>> Dropout(0.0)
>> Logistic()
)
if dropout:
model = model >> Dropout(dropout)
model = model >> Logistic()
return model
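The text-classifier changes above (and the tok2vec architecture changes that follow) thread a dropout setting from the config down to Thinc's HashEmbed tables instead of leaving it hard-coded. A tiny sketch with illustrative sizes; "dropout = null" in the configs corresponds to passing None, which disables it:

from thinc.api import HashEmbed

embed = HashEmbed(nO=96, nV=2000, column=1, dropout=0.1)     # with dropout
no_drop = HashEmbed(nO=96, nV=2000, column=1, dropout=None)  # dropout disabled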
@ -49,6 +49,7 @@ def hash_embed_cnn(
|
|||
maxout_pieces,
|
||||
window_size,
|
||||
subword_features,
|
||||
dropout,
|
||||
):
|
||||
# Does not use character embeddings: set to False by default
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -63,6 +64,7 @@ def hash_embed_cnn(
|
|||
char_embed=False,
|
||||
nM=0,
|
||||
nC=0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
|
@ -76,6 +78,7 @@ def hash_charembed_cnn(
|
|||
window_size,
|
||||
nM,
|
||||
nC,
|
||||
dropout,
|
||||
):
|
||||
# Allows using character embeddings by setting nC, nM and char_embed=True
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -90,12 +93,13 @@ def hash_charembed_cnn(
|
|||
char_embed=True,
|
||||
nM=nM,
|
||||
nC=nC,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
|
||||
def hash_embed_bilstm_v1(
|
||||
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces
|
||||
pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout
|
||||
):
|
||||
# Does not use character embeddings: set to False by default
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -110,12 +114,13 @@ def hash_embed_bilstm_v1(
|
|||
char_embed=False,
|
||||
nM=0,
|
||||
nC=0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
|
||||
def hash_char_embed_bilstm_v1(
|
||||
pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC
|
||||
pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout
|
||||
):
|
||||
# Allows using character embeddings by setting nC, nM and char_embed=True
|
||||
return build_Tok2Vec_model(
|
||||
|
@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1(
|
|||
char_embed=True,
|
||||
nM=nM,
|
||||
nC=nC,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
|
||||
|
@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces):
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.MultiHashEmbed.v1")
|
||||
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
|
||||
def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
||||
if use_subwords:
|
||||
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"))
|
||||
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"))
|
||||
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"))
|
||||
prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout)
|
||||
shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout)
|
||||
|
||||
if pretrained_vectors:
|
||||
glove = StaticVectors(
|
||||
vectors=pretrained_vectors.data,
|
||||
nO=width,
|
||||
column=columns.index(ID),
|
||||
dropout=0.0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
|
@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
|
|||
embed_layer = norm
|
||||
else:
|
||||
if use_subwords and pretrained_vectors:
|
||||
nr_columns = 5
|
||||
concat_columns = glove | norm | prefix | suffix | shape
|
||||
elif use_subwords:
|
||||
nr_columns = 4
|
||||
concat_columns = norm | prefix | suffix | shape
|
||||
else:
|
||||
nr_columns = 2
|
||||
concat_columns = glove | norm
|
||||
|
||||
embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH"))
|
||||
|
@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(columns, width, rows, nM, nC, features):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
|
||||
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
|
||||
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
|
||||
chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
|
||||
with Model.define_operators({">>": chain, "|": concatenate}):
|
||||
embed_layer = chr_embed | features >> with_array(norm)
|
||||
|
@ -238,16 +241,17 @@ def build_Tok2Vec_model(
|
|||
nC,
|
||||
conv_depth,
|
||||
bilstm_depth,
|
||||
dropout,
|
||||
) -> Model:
|
||||
if char_embed:
|
||||
subword_features = False
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM))
|
||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout)
|
||||
if subword_features:
|
||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX))
|
||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX))
|
||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE))
|
||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout)
|
||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout)
|
||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout)
|
||||
else:
|
||||
prefix, suffix, shape = (None, None, None)
|
||||
if pretrained_vectors is not None:
|
||||
|
@ -255,7 +259,7 @@ def build_Tok2Vec_model(
|
|||
vectors=pretrained_vectors.data,
|
||||
nO=width,
|
||||
column=cols.index(ID),
|
||||
dropout=0.0,
|
||||
dropout=dropout,
|
||||
)
|
||||
|
||||
if subword_features:
|
||||
|
|
|
@@ -1,5 +1,5 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
@@ -14,7 +14,6 @@ __all__ = [
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
"Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",
@@ -63,16 +63,6 @@ def default_tagger():
return util.load_config(loc, create_objects=True)["model"]

def default_tensorizer_config():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=False)

def default_tensorizer():
loc = Path(__file__).parent / "tensorizer_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]

def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)
@@ -10,3 +10,4 @@ embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -11,3 +11,4 @@ window_size = 1
maxout_pieces = 3
nM = 64
nC = 8
dropout = null

@@ -13,3 +13,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -13,3 +13,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -10,3 +10,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null

@@ -10,3 +10,4 @@ embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -10,3 +10,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -1,4 +0,0 @@
[model]
@architectures = "spacy.Tensorizer.v1"
input_size=96
output_size=300

@@ -11,3 +11,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

@@ -7,3 +7,4 @@ conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = null

@@ -7,3 +7,4 @@ embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
@@ -44,8 +44,8 @@ class SentenceSegmenter(object):
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
similarity into `Doc` objects.
The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
@@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2)

def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
"""Allocate model, using nO from the first model in the pipeline.

gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
@ -16,7 +16,7 @@ from ..morphology cimport Morphology
|
|||
from ..vocab cimport Vocab
|
||||
|
||||
from .defaults import default_tagger, default_parser, default_ner, default_textcat
|
||||
from .defaults import default_nel, default_senter, default_tensorizer
|
||||
from .defaults import default_nel, default_senter
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
|
@ -238,138 +238,6 @@ class Pipe(object):
|
|||
return self
|
||||
|
||||
|
||||
@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
|
||||
class Tensorizer(Pipe):
|
||||
"""Pre-train position-sensitive vectors for tokens."""
|
||||
|
||||
def __init__(self, vocab, model, **cfg):
|
||||
"""Construct a new statistical model. Weights are not allocated on
|
||||
initialisation.
|
||||
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same
|
||||
`Vocab` instance with the `Doc` objects it will process.
|
||||
**cfg: Config parameters.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.input_models = []
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
def __call__(self, example):
|
||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||
model. Vectors are set to the `Doc.tensor` attribute.
|
||||
|
||||
docs (Doc or iterable): One or more documents to add vectors to.
|
||||
RETURNS (dict or None): Intermediate computations.
|
||||
"""
|
||||
doc = self._get_doc(example)
|
||||
tokvecses = self.predict([doc])
|
||||
self.set_annotations([doc], tokvecses)
|
||||
if isinstance(example, Example):
|
||||
example.doc = doc
|
||||
return example
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||
"""Process `Doc` objects as a stream.
|
||||
|
||||
stream (iterator): A sequence of `Doc` or `Example` objects to process.
|
||||
batch_size (int): Number of `Doc` or `Example` objects to group.
|
||||
YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
|
||||
"""
|
||||
for examples in util.minibatch(stream, size=batch_size):
|
||||
docs = [self._get_doc(ex) for ex in examples]
|
||||
tensors = self.predict(docs)
|
||||
self.set_annotations(docs, tensors)
|
||||
|
||||
if as_example:
|
||||
for ex, doc in zip(examples, docs):
|
||||
ex.doc = doc
|
||||
yield ex
|
||||
else:
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
"""Return a single tensor for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the docs.
|
||||
"""
|
||||
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
|
||||
outputs = self.model(inputs)
|
||||
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
|
||||
|
||||
def set_annotations(self, docs, tensors):
|
||||
"""Set the tensor attribute for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
tensors (object): Vector representation for each token in the docs.
|
||||
"""
|
||||
for doc, tensor in zip(docs, tensors):
|
||||
if tensor.shape[0] != len(doc):
|
||||
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
|
||||
doc.tensor = tensor
|
||||
|
||||
def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
|
||||
"""Update the model.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The dropout rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
"""
|
||||
examples = Example.to_example_objects(examples)
|
||||
inputs = []
|
||||
bp_inputs = []
|
||||
set_dropout_rate(self.model, drop)
|
||||
for tok2vec in self.input_models:
|
||||
set_dropout_rate(tok2vec, drop)
|
||||
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
|
||||
inputs.append(tensor)
|
||||
bp_inputs.append(bp_tensor)
|
||||
inputs = self.model.ops.xp.hstack(inputs)
|
||||
scores, bp_scores = self.model.begin_update(inputs)
|
||||
loss, d_scores = self.get_loss(examples, scores)
|
||||
d_inputs = bp_scores(d_scores, sgd=sgd)
|
||||
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
|
||||
for d_input, bp_input in zip(d_inputs, bp_inputs):
|
||||
bp_input(d_input)
|
||||
if sgd is not None:
|
||||
for tok2vec in self.input_models:
|
||||
tok2vec.finish_update(sgd)
|
||||
self.model.finish_update(sgd)
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.0)
|
||||
losses[self.name] += loss
|
||||
return loss
|
||||
|
    def get_loss(self, examples, prediction):
        examples = Example.to_example_objects(examples)
        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
        target = self.vocab.vectors.data[ids]
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores ** 2).sum()
        return loss, d_scores

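A hedged numeric sketch (not from the diff) of `get_loss()`: the target for each token is its row in the static vector table, the gradient is the scaled difference, and the loss is the summed squared gradient.

import numpy

prediction = numpy.array([[1.0, 0.0], [0.0, 1.0]])   # model output, one row per token
target = numpy.array([[0.5, 0.0], [0.0, 0.5]])       # stand-in for vocab.vectors rows
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()
assert abs(loss - 0.125) < 1e-8
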
    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        get_examples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if pipeline is not None:
            for name, model in pipeline:
                if model.has_ref("tok2vec"):
                    self.input_models.append(model.get_ref("tok2vec"))
        self.model.initialize()
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd


@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
|
||||
class Tagger(Pipe):
|
||||
"""Pipeline component for part-of-speech tagging.
|
||||
|
@ -1708,4 +1576,4 @@ def ner_factory(nlp, model, **cfg):
|
|||
warnings.warn(Warnings.W098.format(name="ner"))
|
||||
return EntityRecognizer.from_nlp(nlp, model, **cfg)
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
|
||||
|
|
|
@@ -123,9 +123,9 @@ def test_overfitting_IO():
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
        {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
        {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
        {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True},
        {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False},
    ],
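A hedged sketch (not part of the diff) of how one of the config dicts above maps to a model: the "@architectures" string names a registered factory and the remaining keys become its arguments. The `spacy.util.registry` import path is an assumption for this development version of spaCy.

from spacy.util import registry

make_textcat = registry.architectures.get("spacy.TextCatBOW.v1")
model = make_textcat(exclusive_classes=True, ngram_size=4, no_output_layer=False)
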
@@ -24,6 +24,7 @@ window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null

[nlp.pipeline.tagger]
factory = "tagger"

@@ -53,6 +54,7 @@ embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
dropout = null
"""

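A hedged sketch (not from the diff): a config string like the one in this test can be parsed with Thinc's `Config` class before a pipeline is built from it; the section names and values below are illustrative only.

from thinc.api import Config

cfg_string = """
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
width = 96
dropout = null
"""
config = Config().from_str(cfg_string)
assert config["nlp"]["pipeline"]["tok2vec"]["model"]["dropout"] is None
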
@@ -70,6 +72,7 @@ def my_parser():
        nC=8,
        conv_depth=2,
        bilstm_depth=0,
        dropout=None,
    )
    parser = build_tb_parser_model(
        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

@@ -1,7 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
from spacy.pipeline import TextCategorizer, SentenceRecognizer
from spacy.pipeline.defaults import default_parser, default_tagger
from spacy.pipeline.defaults import default_textcat, default_senter

from ..util import make_tempdir

@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    assert tagger1_d.to_bytes() == tagger2_d.to_bytes()


def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
    tensorizer = Tensorizer(en_vocab, default_tensorizer())
    tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
    new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
    assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b


def test_serialize_tensorizer_roundtrip_disk(en_vocab):
    tensorizer = Tensorizer(en_vocab, default_tensorizer())
    with make_tempdir() as d:
        file_path = d / "tensorizer"
        tensorizer.to_disk(file_path)
        tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
        assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
            exclude=["vocab"]
        )


def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    textcat = TextCategorizer(

@@ -15,7 +15,7 @@ def test_empty_doc():
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    # TODO: fix tok2vec arguments
    tok2vec = build_Tok2Vec_model(width, embed_size)
    tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)

@@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
        char_embed=False,
        nM=64,
        nC=8,
        dropout=None,
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(batch)

@@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
@pytest.mark.parametrize(
    "tok2vec_config",
    [
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
        {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None},
    ],
)
# fmt: on

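A hedged sketch (not from the diff) of how the parametrized test above is expected to consume one of these dicts; the import path for `build_Tok2Vec_model` is an assumption for this development version of spaCy.

from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.tokens import Doc
from spacy.vocab import Vocab

tok2vec_config = {
    "width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8,
    "pretrained_vectors": None, "window_size": 1, "conv_depth": 2,
    "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True,
    "dropout": None,
}
tok2vec = build_Tok2Vec_model(**tok2vec_config)
tok2vec.initialize()
docs = [Doc(Vocab(), words=["hello", "world"])]
vectors, backprop = tok2vec.begin_update(docs)
assert vectors[0].shape == (2, tok2vec_config["width"])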