add tok2vec parameters to train script to facilitate init_tok2vec (#5021)

Sofie Van Landeghem 2020-02-16 17:16:41 +01:00 committed by GitHub
parent a27c77ce62
commit 2572460175
2 changed files with 47 additions and 12 deletions
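In short: pretrain's `depth` argument is renamed to `conv_depth` (the short flag `-cd` is unchanged), `train` gains the same tok2vec architecture options (`-cw`, `-cd`, `-cW`, `-cP`, `-chr`, `-lstm`, `-er`), both commands report the output directory they create, and `train` now exits with a hint instead of a bare traceback when pretrained weights don't fit. The underlying constraint: `--init-tok2vec` can only load weights into a tok2vec with exactly the shape that produced them, so both commands must be run with matching architecture flags. A sketch of the intended workflow (paths, file names, and flag values are illustrative, not taken from the diff):

    # Pretrain with a non-default architecture ...
    python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain_out -cw 128 -cd 8
    # ... then pass the same architecture flags to train, so the tok2vec it
    # builds matches the pretrained weights being loaded:
    python -m spacy train en ./train_out train.json dev.json -cw 128 -cd 8 \
        --init-tok2vec ./pretrain_out/model99.bin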

spacy/cli/pretrain.py

@@ -34,7 +34,7 @@ from .train import _load_pretrained_tok2vec
     vectors_model=("Name or path to spaCy model with vectors to learn from"),
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
-    depth=("Depth of CNN layers", "option", "cd", int),
+    conv_depth=("Depth of CNN layers", "option", "cd", int),
     cnn_window=("Window size for CNN layers", "option", "cW", int),
     cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
     use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
@@ -84,7 +84,7 @@ def pretrain(
     vectors_model,
     output_dir,
     width=96,
-    depth=4,
+    conv_depth=4,
     bilstm_depth=0,
     cnn_pieces=3,
     sa_depth=0,
@@ -132,9 +132,15 @@ def pretrain(
     msg.info("Using GPU" if has_gpu else "Not using GPU")
     output_dir = Path(output_dir)
+    if output_dir.exists() and [p for p in output_dir.iterdir()]:
+        msg.warn(
+            "Output directory is not empty",
+            "It is better to use an empty directory or refer to a new output path, "
+            "then the new directory will be created for you.",
+        )
     if not output_dir.exists():
         output_dir.mkdir()
-        msg.good("Created output directory")
+        msg.good("Created output directory: {}".format(output_dir))
     srsly.write_json(output_dir / "config.json", config)
     msg.good("Saved settings to config.json")
@@ -162,7 +168,7 @@ def pretrain(
         Tok2Vec(
             width,
             embed_rows,
-            conv_depth=depth,
+            conv_depth=conv_depth,
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
             subword_features=not use_chars,  # Set to False for Chinese etc
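For context on why the names must line up: `_load_pretrained_tok2vec` (imported in this file's diff context above) copies the pretrained weights byte-for-byte into each pipe's tok2vec sublayer, which only succeeds if `train` built a tok2vec with the same dimensions. Roughly, the loader in spacy/cli/train.py looks like the following; this is a sketch from memory, not part of this diff:

    def _load_pretrained_tok2vec(nlp, loc):
        # `loc` is a pathlib.Path to the weights written by `spacy pretrain`.
        # from_bytes() restores weights by shape, so the receiving model must
        # have been built with the same width/depth/embedding settings as the
        # pretraining run.
        with loc.open("rb") as file_:
            weights_data = file_.read()
        loaded = []
        for name, component in nlp.pipeline:
            if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
                component.tok2vec.from_bytes(weights_data)
                loaded.append(name)
        return loaded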

spacy/cli/train.py

@@ -33,6 +33,13 @@ from .. import about
     pipeline=("Comma-separated names of pipeline components", "option", "p", str),
     replace_components=("Replace components from base model", "flag", "R", bool),
     vectors=("Model to load vectors from", "option", "v", str),
+    width=("Width of CNN layers of Tok2Vec component", "option", "cw", int),
+    conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int),
+    cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int),
+    cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int),
+    use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool),
+    bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int),
+    embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int),
     n_iter=("Number of iterations", "option", "n", int),
     n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
     n_examples=("Number of examples", "option", "ns", int),
@@ -64,6 +71,13 @@ def train(
     pipeline="tagger,parser,ner",
     replace_components=False,
     vectors=None,
+    width=96,
+    conv_depth=4,
+    cnn_window=1,
+    cnn_pieces=3,
+    use_chars=False,
+    bilstm_depth=0,
+    embed_rows=2000,
     n_iter=30,
     n_early_stopping=None,
     n_examples=0,
@@ -116,6 +130,7 @@ def train(
         )
     if not output_path.exists():
         output_path.mkdir()
+        msg.good("Created output directory: {}".format(output_path))
     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
@@ -250,7 +265,15 @@ def train(
         optimizer = create_default_optimizer(Model.ops)
     else:
         # Start with a blank model, call begin_training
-        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+        cfg = {"device": use_gpu}
+        cfg["conv_depth"] = conv_depth
+        cfg["token_vector_width"] = width
+        cfg["bilstm_depth"] = bilstm_depth
+        cfg["cnn_maxout_pieces"] = cnn_pieces
+        cfg["embed_size"] = embed_rows
+        cfg["conv_window"] = cnn_window
+        cfg["subword_features"] = not use_chars
+        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
     nlp._optimizer = None
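The `cfg` keys above are forwarded by `begin_training` to each component's model builder, so the same overrides also work programmatically. A minimal sketch against the spaCy 2.x API; the pipeline, label, and values are illustrative, not from the diff:

    import spacy

    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    ner.add_label("PERSON")
    nlp.add_pipe(ner)

    # Same keys the diff collects into `cfg`; they shape the Tok2Vec layer
    # that begin_training builds, and must match any pretrained weights
    # loaded into it afterwards.
    optimizer = nlp.begin_training(
        token_vector_width=128,  # matches `spacy pretrain -cw 128`
        conv_depth=8,            # matches `spacy pretrain -cd 8`
        cnn_maxout_pieces=3,
        embed_size=2000,
        conv_window=1,
        subword_features=True,   # False when pretraining used -chr
    )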
@@ -375,13 +398,19 @@ def train(
                 if not batch:
                     continue
                 docs, golds = zip(*batch)
-                nlp.update(
-                    docs,
-                    golds,
-                    sgd=optimizer,
-                    drop=next(dropout_rates),
-                    losses=losses,
-                )
+                try:
+                    nlp.update(
+                        docs,
+                        golds,
+                        sgd=optimizer,
+                        drop=next(dropout_rates),
+                        losses=losses,
+                    )
+                except ValueError as e:
+                    msg.warn("Error during training")
+                    if init_tok2vec:
+                        msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
+                    msg.fail("Original error message: {}".format(e), exits=1)
                 if raw_text:
                     # If raw text is available, perform 'rehearsal' updates,
                     # which use unlabelled data to reduce overfitting.
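Usage note: with mismatched settings (for example, weights pretrained at `-cw 128` loaded into a model built with the default width of 96; values illustrative), `nlp.update` raises a `ValueError` on the first batch. The new `try/except` converts that into the hint above plus the original error message, rather than a raw traceback.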