diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 7954b57b5..97eebe6b4 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -21,14 +21,16 @@ eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 8f9c5666e..03e2f5bd7 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -20,14 +20,16 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.read_train] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.read_dev] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 199aae217..39d4d875d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -195,12 +195,14 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 2000 }} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} max_length = 0 diff --git a/spacy/cli/train.py 
b/spacy/cli/train.py index ae4a8455e..2c2eeb88b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -92,8 +92,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["train_corpus"] - dev_corpus = T_cfg["dev_corpus"] + train_corpus = T_cfg["corpus"]["train"] + dev_corpus = T_cfg["corpus"]["dev"] batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7cd71453f..61f3dfe25 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -44,7 +44,9 @@ frozen_components = [] [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries @@ -56,7 +58,7 @@ max_length = 0 # Limitation on number of training examples limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries diff --git a/spacy/schemas.py b/spacy/schemas.py index 0dd2b9204..d8bcf3c1d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - train_corpus: Reader = Field(..., title="Reader for the training data") - dev_corpus: Reader = Field(..., title="Reader for the dev data") + corpus: Reader = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") diff --git 
a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 0ab212fda..d113ac2a5 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -19,11 +19,13 @@ dev = "" [training] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} @@ -300,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths.train}" + assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["train_corpus"]["path"] == "" + assert interpolated["training"]["corpus"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" + assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["train_corpus"]["path"] == "" + assert interpolated2["training"]["corpus"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["train_corpus"]["path"] == "" + assert nlp2.config["training"]["corpus"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 0f49b02e3..c25ce1651 
100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 79ecb08b3..74d612862 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). 
Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). 
~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. 
~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..be7994d5d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 9776dab1b..3a6bd4551 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 65cfb563b..bba2e2853 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. > #### config.cfg > > ```ini -> [training.train_corpus] +> [training.corpus.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ```