From 257246017572433af7825d561de573dae73828f0 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 16 Feb 2020 17:16:41 +0100 Subject: [PATCH] add tok2vec parameters to train script to facilitate init_tok2vec (#5021) --- spacy/cli/pretrain.py | 14 ++++++++++---- spacy/cli/train.py | 45 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index c1aade2b2..aaec1ea75 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -34,7 +34,7 @@ from .train import _load_pretrained_tok2vec vectors_model=("Name or path to spaCy model with vectors to learn from"), output_dir=("Directory to write models to on each epoch", "positional", None, str), width=("Width of CNN layers", "option", "cw", int), - depth=("Depth of CNN layers", "option", "cd", int), + conv_depth=("Depth of CNN layers", "option", "cd", int), cnn_window=("Window size for CNN layers", "option", "cW", int), cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int), use_chars=("Whether to use character-based embedding", "flag", "chr", bool), @@ -84,7 +84,7 @@ def pretrain( vectors_model, output_dir, width=96, - depth=4, + conv_depth=4, bilstm_depth=0, cnn_pieces=3, sa_depth=0, @@ -132,9 +132,15 @@ def pretrain( msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) + if output_dir.exists() and [p for p in output_dir.iterdir()]: + msg.warn( + "Output directory is not empty", + "It is better to use an empty directory or refer to a new output path, " + "then the new directory will be created for you.", + ) if not output_dir.exists(): output_dir.mkdir() - msg.good("Created output directory") + msg.good("Created output directory: {}".format(output_dir)) srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") @@ -162,7 +168,7 @@ def pretrain( Tok2Vec( width, embed_rows, - conv_depth=depth, + conv_depth=conv_depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. subword_features=not use_chars, # Set to False for Chinese etc diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 82d4da38e..5af93a8f3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -33,6 +33,13 @@ from .. import about pipeline=("Comma-separated names of pipeline components", "option", "p", str), replace_components=("Replace components from base model", "flag", "R", bool), vectors=("Model to load vectors from", "option", "v", str), + width=("Width of CNN layers of Tok2Vec component", "option", "cw", int), + conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int), + cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int), + cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int), + use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool), + bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int), + embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int), n_iter=("Number of iterations", "option", "n", int), n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), n_examples=("Number of examples", "option", "ns", int), @@ -64,6 +71,13 @@ def train( pipeline="tagger,parser,ner", replace_components=False, vectors=None, + width=96, + conv_depth=4, + cnn_window=1, + cnn_pieces=3, + use_chars=False, + bilstm_depth=0, + embed_rows=2000, n_iter=30, n_early_stopping=None, n_examples=0, @@ -116,6 +130,7 @@ def train( ) if not output_path.exists(): output_path.mkdir() + msg.good("Created output directory: {}".format(output_path)) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. @@ -250,7 +265,15 @@ def train( optimizer = create_default_optimizer(Model.ops) else: # Start with a blank model, call begin_training - optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) + cfg = {"device": use_gpu} + cfg["conv_depth"] = conv_depth + cfg["token_vector_width"] = width + cfg["bilstm_depth"] = bilstm_depth + cfg["cnn_maxout_pieces"] = cnn_pieces + cfg["embed_size"] = embed_rows + cfg["conv_window"] = cnn_window + cfg["subword_features"] = not use_chars + optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg) nlp._optimizer = None @@ -375,13 +398,19 @@ def train( if not batch: continue docs, golds = zip(*batch) - nlp.update( - docs, - golds, - sgd=optimizer, - drop=next(dropout_rates), - losses=losses, - ) + try: + nlp.update( + docs, + golds, + sgd=optimizer, + drop=next(dropout_rates), + losses=losses, + ) + except ValueError as e: + msg.warn("Error during training") + if init_tok2vec: + msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?") + msg.fail("Original error message: {}".format(e), exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting.