From 51fa929f47120272bd6b8dfbba1f000833446f0f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:58:04 +0200 Subject: [PATCH] rewrite train_corpus to corpus.train in config --- extra/experiments/onto-joint/defaults.cfg | 6 ++-- .../ptb-joint-pos-dep/defaults.cfg | 6 ++-- spacy/cli/templates/quickstart_training.jinja | 6 ++-- spacy/cli/train.py | 4 +-- spacy/default_config.cfg | 6 ++-- spacy/schemas.py | 3 +- .../tests/serialize/test_serialize_config.py | 16 +++++---- website/docs/api/corpus.md | 2 +- website/docs/api/data-formats.md | 35 +++++++++---------- website/docs/api/top-level.md | 4 +-- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 12 files changed, 50 insertions(+), 42 deletions(-) diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 7954b57b5..97eebe6b4 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -21,14 +21,16 @@ eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 8f9c5666e..03e2f5bd7 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -20,14 +20,16 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.read_train] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.read_dev] +[training.corpus.dev] @readers = 
"spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 199aae217..39d4d875d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -195,12 +195,14 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 2000 }} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} max_length = 0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ae4a8455e..2c2eeb88b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -92,8 +92,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["train_corpus"] - dev_corpus = T_cfg["dev_corpus"] + train_corpus = T_cfg["corpus"]["train"] + dev_corpus = T_cfg["corpus"]["dev"] batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7cd71453f..61f3dfe25 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -44,7 +44,9 @@ frozen_components = [] [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries @@ -56,7 +58,7 @@ max_length = 0 # Limitation on number of training examples limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries diff --git a/spacy/schemas.py 
b/spacy/schemas.py index 0dd2b9204..d8bcf3c1d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - train_corpus: Reader = Field(..., title="Reader for the training data") - dev_corpus: Reader = Field(..., title="Reader for the dev data") + corpus: Reader = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 0ab212fda..d113ac2a5 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -19,11 +19,13 @@ dev = "" [training] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} @@ -300,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths.train}" + assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["train_corpus"]["path"] == "" + assert interpolated["training"]["corpus"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" + assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert 
config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["train_corpus"]["path"] == "" + assert interpolated2["training"]["corpus"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["train_corpus"]["path"] == "" + assert nlp2.config["training"]["corpus"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 0f49b02e3..c25ce1651 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 79ecb08b3..74d612862 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). 
~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). 
~~Callable[[Language], Iterator[Example]]~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. 
Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..be7994d5d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 9776dab1b..3a6bd4551 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. 
> [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 65cfb563b..bba2e2853 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. > #### config.cfg > > ```ini -> [training.train_corpus] +> [training.corpus.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ```