mirror of https://github.com/explosion/spaCy.git
769 lines
34 KiB
Python
769 lines
34 KiB
Python
import os
|
|
import tqdm
|
|
from pathlib import Path
|
|
from thinc.api import use_ops
|
|
from timeit import default_timer as timer
|
|
import shutil
|
|
import srsly
|
|
from wasabi import msg
|
|
import contextlib
|
|
import random
|
|
|
|
from ..util import create_default_optimizer
|
|
from ..util import use_gpu as set_gpu
|
|
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
|
from ..gold import GoldCorpus
|
|
from .. import util
|
|
from .. import about
|
|
|
|
|
|
def train(
    # fmt: off
    lang: ("Model language", "positional", None, str),
    output_path: ("Output directory to store model in", "positional", None, Path),
    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
    raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
    base_model: ("Name of model to update (optional)", "option", "b", str) = None,
    pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
    vectors: ("Model to load vectors from", "option", "v", str) = None,
    replace_components: ("Replace components from base model", "flag", "R", bool) = False,
    n_iter: ("Number of iterations", "option", "n", int) = 30,
    n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
    n_examples: ("Number of examples", "option", "ns", int) = 0,
    use_gpu: ("Use GPU", "option", "g", int) = -1,
    version: ("Model version", "option", "V", str) = "0.0.0",
    meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
    init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
    parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
    entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
    noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
    orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
    eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
    gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
    learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
    textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
    textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
    textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
    verbose: ("Display more information for debug", "flag", "VV", bool) = False,
    debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
    # fmt: on
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    After every iteration the model is saved to `model{i}` in the output
    directory together with `accuracy.json` and `meta.json`. When training
    ends (or is aborted), `model-final` and `model-best` are written as well.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exist if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good(f"Created output directory: {output_path}")

    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    # Beam width 1 is always evaluated; extra widths come from the CLI option.
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    default_dir = Path(__file__).parent.parent / "ml" / "models" / "defaults"

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text(f"Training pipeline: {pipeline}")
    disabled_pipes = None
    pipes_added = False
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn(f"Exception: {e}")
        if activated_gpu is not None:
            msg.text(f"Using GPU: {use_gpu}")
        else:
            # Fall back to CPU so the rest of the function treats this as a
            # CPU-only run (timing columns, begin_training device).
            msg.warn(f"Unable to activate GPU: {use_gpu}")
            msg.text("Using CPU only")
            use_gpu = -1
    if base_model:
        msg.text(f"Starting with base model '{base_model}'")
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                f"Model language ('{nlp.lang}') doesn't match language "
                f"specified as `lang` argument ('{lang}') ",
                exits=1,
            )
        if vectors:
            msg.text(f"Loading vectors from model '{vectors}'")
            _load_vectors(nlp, vectors)

        nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            # first, create the model.
            # Bit of a hack after the refactor to get the vectors into a default config
            # use train-from-config instead :-)
            if pipe == "parser":
                config_loc = default_dir / "parser_defaults.cfg"
            elif pipe == "tagger":
                config_loc = default_dir / "tagger_defaults.cfg"
            elif pipe == "ner":
                config_loc = default_dir / "ner_defaults.cfg"
            elif pipe == "textcat":
                config_loc = default_dir / "textcat_defaults.cfg"
            elif pipe == "senter":
                config_loc = default_dir / "senter_defaults.cfg"
            else:
                raise ValueError(f"Component {pipe} currently not supported.")
            pipe_cfg = util.load_config(config_loc, create_objects=False)
            if vectors:
                pretrained_config = {
                    "@architectures": "spacy.VocabVectors.v1",
                    "name": vectors,
                }
                pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config

            if pipe == "parser":
                pipe_cfg["learn_tokens"] = learn_tokens
            elif pipe == "textcat":
                pipe_cfg["exclusive_classes"] = not textcat_multilabel
                pipe_cfg["architecture"] = textcat_arch
                pipe_cfg["positive_label"] = textcat_positive_label

            if pipe not in nlp.pipe_names:
                msg.text(f"Adding component to base model '{pipe}'")
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text(f"Replacing component from base model '{pipe}'")
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    # Compare only the options collected in base_cfg. pipe_cfg
                    # is the whole loaded default config and may hold further
                    # settings (e.g. the "model" section), in which case a
                    # wholesale comparison could never match.
                    provided_cfg = {key: pipe_cfg[key] for key in base_cfg}
                    if base_cfg != provided_cfg:
                        msg.fail(
                            f"The base textcat model configuration does "
                            f"not match the provided training options. "
                            f"Existing cfg: {base_cfg}, provided cfg: {provided_cfg}",
                            exits=1,
                        )
                msg.text(f"Extending component from base model '{pipe}'")
        # NOTE(review): the pipes outside `pipeline` were already disabled by
        # the select_pipes call before the loop, whose return value was
        # discarded -- confirm that this second call still captures them for
        # the restore() in the `finally` block below.
        disabled_pipes = nlp.select_pipes(
            disable=[p for p in nlp.pipe_names if p not in pipeline]
        )
    else:
        msg.text(f"Starting with blank model '{lang}'")
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()

        if vectors:
            msg.text(f"Loading vectors from model '{vectors}'")
            _load_vectors(nlp, vectors)

        for pipe in pipeline:
            # first, create the model.
            # Bit of a hack after the refactor to get the vectors into a default config
            # use train-from-config instead :-)
            if pipe == "parser":
                config_loc = default_dir / "parser_defaults.cfg"
            elif pipe == "tagger":
                config_loc = default_dir / "tagger_defaults.cfg"
            elif pipe == "morphologizer":
                config_loc = default_dir / "morphologizer_defaults.cfg"
            elif pipe == "ner":
                config_loc = default_dir / "ner_defaults.cfg"
            elif pipe == "textcat":
                config_loc = default_dir / "textcat_defaults.cfg"
            elif pipe == "senter":
                config_loc = default_dir / "senter_defaults.cfg"
            else:
                raise ValueError(f"Component {pipe} currently not supported.")
            pipe_cfg = util.load_config(config_loc, create_objects=False)
            if vectors:
                pretrained_config = {
                    "@architectures": "spacy.VocabVectors.v1",
                    "name": vectors,
                }
                pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config

            if pipe == "parser":
                pipe_cfg["learn_tokens"] = learn_tokens
            elif pipe == "textcat":
                pipe_cfg["exclusive_classes"] = not textcat_multilabel
                pipe_cfg["architecture"] = textcat_arch
                pipe_cfg["positive_label"] = textcat_positive_label

            pipe = nlp.create_pipe(pipe, config=pipe_cfg)
            nlp.add_pipe(pipe)

    # Update tag map with provided mapping
    nlp.vocab.morphology.tag_map.update(tag_map)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    f"Can't use multitask objective without '{pipe_name}' in "
                    f"the pipeline"
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text(f"Counting training words (limit={n_examples})")
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer()
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
    nlp._optimizer = None

    # Load in pretrained weights (TODO: this may be broken in the config rewrite)
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text(f"Loaded pretrained tok2vec for: {components}")

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
                f"does not match any label in the training data.",
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            # The f-prefix is required so the label is interpolated into the
            # message instead of printing the placeholder literally.
            msg.fail(
                f"A textcat_positive_label (tpl) '{textcat_positive_label}' was "
                "provided for training data that does not appear to be a "
                "binary classification problem with two labels.",
                exits=1,
            )
        train_data = corpus.train_data(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for ex in train_data:
                train_labels.update(ex.gold.cats.keys())
                if list(ex.gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for ex in train_data:
                train_labels.update(ex.gold.cats.keys())
                if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model:
                    # Switch to multilabel training as soon as one example
                    # does not have exactly one positive label.
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                f"Cannot extend textcat model using data with different "
                f"labels. Base model labels: {textcat_labels}, training data "
                f"labels: {list(train_labels)}",
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                f"Textcat evaluation score: ROC AUC score macro-averaged across "
                f"the labels '{', '.join(textcat_labels)}'"
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                f"Textcat evaluation score: F1-score for the "
                f"label '{textcat_positive_label}'"
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                f"Textcat evaluation score: F1-score macro-averaged across "
                f"the labels '{', '.join(textcat_labels)}'"
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )

    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_data = corpus.train_dataset(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_data, size=batch_sizes):
                    if not batch:
                        continue
                    try:
                        nlp.update(
                            batch,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, f"Original error message: {e}", exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        # NOTE(review): next() presumably raises StopIteration
                        # once the raw batches run out before the training
                        # batches do -- confirm how that case should behave.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    docs = [ex.doc for ex in batch]
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / f"model{i}"
                nlp.to_disk(epoch_model_path)
                # Evaluate a freshly loaded copy of the model saved above.
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_dataset = list(
                        corpus.dev_dataset(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(ex.doc) for ex in dev_dataset)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        # The first timing was taken with the GPU active; load
                        # and evaluate again under numpy ops for the CPU speed.
                        gpu_wps = nwords / (end_time - start_time)
                        with use_ops("numpy"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_dataset = list(
                                corpus.dev_dataset(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                )
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / f"model{i}" / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", f"model{i}")
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / f"model{i}" / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    f"Textcat ROC AUC score is undefined due to "
                                    f"only one value in label '{cat}'."
                                )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            f"Early stopping, best iteration is: {i - iter_since_best}"
                        )
                        msg.text(
                            f"Best score = {best_score}; Final iteration score = {current_score}"
                        )
                        break
    except Exception as e:
        # Deliberately broad: whatever interrupts the loop, the `finally`
        # block below still saves the final and best models.
        msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
    finally:
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (
                final_meta["speed"]["cpu"] is None
                and final_meta["speed"]["gpu"] is None
            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
        msg.good("Created best model", best_model_path)
|
|
|
|
|
|
def _score_for_model(meta):
    """Return the mean score across the pipeline's tasks, for early stopping.

    meta (dict): Model meta with a "pipeline" list of component names and an
        "accuracy" dict holding the metrics of the components listed there.
    RETURNS (float): Mean of the per-component scores, or 0.0 if none of the
        scored components is in the pipeline.
    """
    pipes = meta["pipeline"]
    acc = meta["accuracy"]
    mean_acc = []
    if "tagger" in pipes:
        mean_acc.append(acc["tags_acc"])
    if "morphologizer" in pipes:
        mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2)
    if "parser" in pipes:
        mean_acc.append((acc["uas"] + acc["las"]) / 2)
    if "ner" in pipes:
        mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
    if "textcat" in pipes:
        mean_acc.append(acc["textcat_score"])
    if "senter" in pipes:
        mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3)
    if not mean_acc:
        # Nothing to average: avoid dividing by zero and report a neutral
        # score instead.
        return 0.0
    return sum(mean_acc) / len(mean_acc)
|
|
|
|
|
|
@contextlib.contextmanager
def _create_progress_bar(total):
    """Context manager yielding a tqdm progress bar, or None in log mode.

    total (int): Total passed on to the progress bar.
    YIELDS: None if the LOG_FRIENDLY environment variable is set to a
        non-zero integer, otherwise the tqdm progress bar.
    """
    if int(os.environ.get("LOG_FRIENDLY", 0)):
        yield
    else:
        # Use the bar as a context manager so it is closed when the caller's
        # block ends, including when that block raises.
        with tqdm.tqdm(total=total, leave=False) as pbar:
            yield pbar
|
|
|
|
|
|
def _load_vectors(nlp, vectors):
    """Load the model `vectors` into nlp's vocab and refresh lexeme attributes.

    nlp: Language object whose vocab is shared with the loaded model.
    vectors (str): Name of the model to load the vectors from.
    RETURNS: The model returned by util.load_model.
    """
    vectors_model = util.load_model(vectors, vocab=nlp.vocab)
    # These attrs are expected to be set by data. Others should
    # be set by calling the language functions.
    data_attrs = (CLUSTER, PROB, IS_OOV, LANG)
    for lexeme in nlp.vocab:
        computed = {
            lexeme.vocab.strings[attr_id]: getter(lexeme.orth_)
            for attr_id, getter in nlp.vocab.lex_attr_getters.items()
            if attr_id not in data_attrs
        }
        lexeme.set_attrs(**computed)
        lexeme.is_oov = False
    return vectors_model
|
|
|
|
|
|
def _load_pretrained_tok2vec(nlp, loc):
    """Load pretrained weights for the 'token-to-vector' part of the component
    models, which is typically a CNN. See 'spacy pretrain'. Experimental.

    nlp: Object whose `pipeline` is an iterable of (name, component) pairs.
    loc (Path): Location of the file holding the serialized weights.
    RETURNS (list): Names of the components the weights were loaded into.
    """
    with loc.open("rb") as file_:
        weights_data = file_.read()
    loaded = []
    for name, component in nlp.pipeline:
        if hasattr(component, "model") and component.model.has_ref("tok2vec"):
            # Fetch the reference from the model, the same object the guard
            # above asked via has_ref, rather than from the component.
            component.model.get_ref("tok2vec").from_bytes(weights_data)
            loaded.append(name)
    return loaded
|
|
|
|
|
|
def _collate_best_model(meta, output_path, components):
    """Assemble `model-best` from the best-scoring epoch of each component.

    Copies `model-final` to `model-best`, then replaces each component's
    directory with the one from the epoch `_find_best` picked for it and
    copies that epoch's metrics for the component into the meta.

    meta (dict): Meta to write to `model-best/meta.json`; its "accuracy"
        entry is updated in place.
    output_path (Path): Directory holding the per-epoch models and
        `model-final`.
    components (iterable): Names of the components to collate.
    RETURNS (Path): Path of the created `model-best` directory.
    """
    bests = {}
    meta.setdefault("accuracy", {})
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
    shutil.copytree(str(output_path / "model-final"), str(best_dest))
    for component, best_component_src in bests.items():
        if best_component_src is None:
            # No epoch model was found for this component (e.g. training
            # stopped before the first epoch was saved): keep the copy that
            # came from model-final.
            continue
        shutil.rmtree(str(best_dest / component))
        shutil.copytree(str(best_component_src / component), str(best_dest / component))
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
|
|
|
|
|
|
def _find_best(experiment_dir, component):
    """Find the epoch directory with the highest scores for a component.

    experiment_dir (Path): Directory whose sub-directories (other than
        `model-final`) each hold an `accuracy.json`.
    component (str): Name of the component whose metrics are compared.
    RETURNS (Path): The best-scoring sub-directory, or None if there is no
        candidate directory.
    """
    candidates = []
    for path in experiment_dir.iterdir():
        if not path.is_dir() or path.parts[-1] == "model-final":
            continue
        epoch_accs = srsly.read_json(path / "accuracy.json")
        values = [epoch_accs.get(name, 0.0) for name in _get_metrics(component)]
        # remove per_type dicts from score list for max() comparison
        float_values = [value for value in values if isinstance(value, float)]
        candidates.append((float_values, path))
    if not candidates:
        return None
    _, best_path = max(candidates)
    return best_path
|
|
|
|
|
|
def _get_metrics(component):
    """Return the names of the metrics reported for a pipeline component.

    component (str): Name of the pipeline component.
    RETURNS (tuple): Metric names; only "token_acc" for components without
        an entry of their own.
    """
    metrics_by_component = {
        "parser": ("las", "uas", "las_per_type", "sent_f", "token_acc"),
        "tagger": ("tags_acc", "token_acc"),
        "morphologizer": ("morphs_acc", "pos_acc", "token_acc"),
        "ner": ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc"),
        "senter": ("sent_f", "sent_p", "sent_r", "token_acc"),
        "textcat": ("textcat_score", "token_acc"),
    }
    return metrics_by_component.get(component, ("token_acc",))
|
|
|
|
|
|
def _configure_training_output(pipeline, use_gpu, has_beam_widths):
    """Build the column headers and stat names of the training progress table.

    pipeline (iterable): Names of the pipeline components being trained.
    use_gpu (int): A value >= 0 adds the GPU speed column.
    has_beam_widths (bool): Whether to add the beam width header column.
    RETURNS (tuple): Two key views, (headers, stat names), each without
        duplicates and in first-seen order.
    """
    morph_columns = (
        ["Morph Loss ", " Morph % ", " POS % "],
        ["morph_loss", "morphs_acc", "pos_acc"],
    )
    columns_by_pipe = {
        "tagger": (["Tag Loss ", " Tag % "], ["tag_loss", "tags_acc"]),
        "morphologizer": morph_columns,
        "morphologizertagger": morph_columns,
        "parser": (
            ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"],
            ["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"],
        ),
        "ner": (
            ["NER Loss ", "NER P ", "NER R ", "NER F "],
            ["ner_loss", "ents_p", "ents_r", "ents_f"],
        ),
        "textcat": (
            ["Textcat Loss", "Textcat"],
            ["textcat_loss", "textcat_score"],
        ),
        "senter": (
            ["Senter Loss", "Sent P", "Sent R", "Sent F"],
            ["senter_loss", "sent_p", "sent_r", "sent_f"],
        ),
    }
    headers = ["Itn"]
    stats = []
    for pipe_name in pipeline:
        # Components without an entry contribute no columns.
        pipe_headers, pipe_stats = columns_by_pipe.get(pipe_name, ((), ()))
        headers += pipe_headers
        stats += pipe_stats
    headers += ["Token %", "CPU WPS"]
    stats += ["token_acc", "cpu_wps"]

    if use_gpu >= 0:
        headers.append("GPU WPS")
        stats.append("gpu_wps")

    if has_beam_widths:
        headers.insert(1, "Beam W.")
    # remove duplicates
    return dict.fromkeys(headers, 1).keys(), dict.fromkeys(stats, 1).keys()
|
|
|
|
|
|
def _get_progress(
    itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
):
    """Format one row of the training progress table.

    itn (int): Zero-based iteration index; shown one-based.
    losses (dict): Training losses keyed by component name.
    dev_scores (dict): Evaluation scores; these take precedence over the
        loss and speed values for stats of the same name.
    output_stats (iterable): Names of the stats to report, in column order.
    beam_width (int): Shown as the second column if not None.
    cpu_wps (float): Words per second on CPU.
    gpu_wps (float): Words per second on GPU; a falsy value is shown as 0.
    RETURNS (list): The iteration number, the optional beam width, then one
        formatted string per stat.
    """
    loss_sources = (
        ("dep_loss", "parser"),
        ("ner_loss", "ner"),
        ("tag_loss", "tagger"),
        ("morph_loss", "morphologizer"),
        ("textcat_loss", "textcat"),
        ("senter_loss", "senter"),
    )
    values = dict.fromkeys(output_stats, 0.0)
    for stat_name, pipe_name in loss_sources:
        values[stat_name] = losses.get(pipe_name, 0.0)
    values["cpu_wps"] = cpu_wps
    values["gpu_wps"] = gpu_wps or 0.0
    values.update(dev_scores)
    # Speeds are shown as whole numbers, everything else with three decimals.
    cells = [
        ("{:.0f}" if stat.endswith("_wps") else "{:.3f}").format(values[stat])
        for stat in output_stats
    ]
    prefix = [itn + 1]
    if beam_width is not None:
        prefix.append(beam_width)
    return prefix + cells
|
|
|
|
|
|
def _get_total_speed(speeds):
    """Combine words-per-second speeds of steps that run one after another.

    The per-word times (1 / speed) are summed and the sum is inverted.

    speeds (iterable): Speeds in words per second; entries may be None.
    RETURNS (float): The combined speed. None if `speeds` is empty or holds
        a None entry; 0.0 if any speed is zero.
    """
    speeds = list(speeds)
    if not speeds or any(speed is None for speed in speeds):
        return None
    if any(speed == 0 for speed in speeds):
        # A step that processes no words per second stalls the whole chain;
        # report that instead of dividing by zero.
        return 0.0
    seconds_per_word = sum(1.0 / speed for speed in speeds)
    return 1.0 / seconds_per_word
|