From 154752f9c24f9536e4aeeee9505813b8a11f534f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 15 Sep 2020 00:32:49 +0200 Subject: [PATCH] Update docs and consistency [ci skip] --- spacy/training/corpus.py | 15 ++++--- website/docs/api/cli.md | 59 +++++++++++++------------ website/docs/api/corpus.md | 76 ++++++++++++++++++++++++++++++++ website/docs/api/data-formats.md | 67 +++++++--------------------- website/docs/api/top-level.md | 76 +++++++++++++++++++++++++++++--- website/docs/usage/v3.md | 2 +- 6 files changed, 201 insertions(+), 94 deletions(-) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 20e4507aa..11f098993 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -22,9 +22,10 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit) + @util.registry.readers("spacy.JsonlReader.v1") def create_jsonl_reader( - path: Path, min_length: int=0, max_length: int = 0, limit: int = 0 + path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0 ) -> Callable[["Language"], Iterable[Doc]]: return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit) @@ -52,7 +53,6 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: return locs - class Corpus: """Iterate Example objects from a file or directory of DocBin (.spacy) formatted data files. @@ -162,20 +162,21 @@ class Corpus: class JsonlTexts: - """Iterate Doc objects from a file or directory of jsonl + """Iterate Doc objects from a file or directory of jsonl formatted raw text files. path (Path): The directory or filename to read from. min_length (int): Minimum document length (in tokens). Shorter documents will be skipped. Defaults to 0, which indicates no limit. - + max_length (int): Maximum document length (in tokens). Longer documents will be skipped. Defaults to 0, which indicates no limit. 
limit (int): Limit corpus to a subset of examples, e.g. for debugging. Defaults to 0, which indicates no limit. - DOCS: https://nightly.spacy.io/api/corpus + DOCS: https://nightly.spacy.io/api/corpus#jsonltexts """ + file_type = "jsonl" def __init__( @@ -195,9 +196,9 @@ class JsonlTexts: """Yield examples from the data. nlp (Language): The current nlp object. - YIELDS (Doc): The docs. + YIELDS (Example): The example objects. - DOCS: https://nightly.spacy.io/api/corpus#call + DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call """ for loc in walk_corpus(self.path, "jsonl"): records = srsly.read_jsonl(loc) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index c27efb2e4..8449d23e1 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -791,20 +791,19 @@ auto-generated by setting `--pretraining` on ```cli -$ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--resume-path] [--epoch-resume] [overrides] +$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides] ``` -| Name | Description | -| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ | -| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | -| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. 
~~Optional[Path] \(option)~~ | -| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | -| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | -| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. | +| Name | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | +| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. 
~~Any (option/flag)~~ | +| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. | ## evaluate {#evaluate new="2" tag="command"} @@ -886,8 +885,8 @@ deploying custom spaCy pipelines. ### project clone {#project-clone tag="command"} Clone a project template from a Git repository. Calls into `git` under the hood -and uses the sparse checkout feature, so you're only downloading what you need. -By default, spaCy's +and can use the sparse checkout feature if available, so you're only downloading +what you need. By default, spaCy's [project templates repo](https://github.com/explosion/projects) is used, but you can provide any other repo (public or private) that you have access to using the `--repo` option. @@ -895,7 +894,7 @@ can provide any other repo (public or private) that you have access to using the ```cli -$ python -m spacy project clone [name] [dest] [--repo] [--branch] +$ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] ``` > #### Example @@ -910,14 +909,15 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] > $ python -m spacy project clone template --repo https://github.com/your_org/your_repo > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ | -| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ | -| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ | -| `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ | -| `--help`, `-h` | Show help message and available arguments. 
~~bool (flag)~~ |
-| **CREATES** | The cloned [project directory](/usage/projects#project-files). |
+| Name | Description |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ |
+| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ |
+| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ |
+| `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ |
+| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | The cloned [project directory](/usage/projects#project-files). |

### project assets {#project-assets tag="command"}

```cli
$ python -m spacy project assets [project_dir]
```

> #### Example
>
> ```cli
-> $ python -m spacy project assets
+> $ python -m spacy project assets [--sparse]
> ```

-| Name | Description |
-| -------------- | --------------------------------------------------------------------------------------- |
-| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
-| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | Downloaded or copied assets defined in the `project.yml`. 
|
+| Name | Description |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |

### project run {#project-run tag="command"}

diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index f6f6bbf68..0f49b02e3 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -94,3 +94,79 @@ Yield examples from the data.
| ---------- | -------------------------------------- |
| `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~ |
+
+## JsonlTexts {#jsonltexts tag="class"}
+
+Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
+formatted raw text files. Can be used to read the raw text corpus for language
+model [pretraining](/usage/embeddings-transformers#pretraining) from a JSONL
+file.
+
+> #### Tip: Writing JSONL
+>
+> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
+> handy `write_jsonl` helper that takes a file path and list of dictionaries and
+> writes out JSONL-formatted data. 
+
+>
+> ```python
+> import srsly
+> data = [{"text": "Some text"}, {"text": "More..."}]
+> srsly.write_jsonl("/path/to/text.jsonl", data)
+> ```
+
+```json
+### Example
+{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
+{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
+{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
+```
+
+### JsonlTexts.\_\_init\_\_ {#jsonltexts-init tag="method"}
+
+Initialize the reader.
+
+> #### Example
+>
+> ```python
+> from spacy.training import JsonlTexts
+>
+> corpus = JsonlTexts("./data/texts.jsonl")
+> ```
+>
+> ```ini
+> ### Example config
+> [pretraining.corpus]
+> @readers = "spacy.JsonlReader.v1"
+> path = "corpus/raw_text.jsonl"
+> min_length = 0
+> max_length = 0
+> limit = 0
+> ```
+
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | The directory or filename to read from. Expects newline-delimited JSON with a key `"text"` for each record. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
+| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
+| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+
+### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
+
+Yield examples from the data. 
+ +> #### Example +> +> ```python +> from spacy.training import JsonlTexts +> import spacy +> +> corpus = JsonlTexts("./texts.jsonl") +> nlp = spacy.blank("en") +> data = corpus(nlp) +> ``` + +| Name | Description | +| ---------- | -------------------------------------- | +| `nlp` | The current `nlp` object. ~~Language~~ | +| **YIELDS** | The examples. ~~Example~~ | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3d78df39d..79ecb08b3 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -4,7 +4,6 @@ teaser: Details on spaCy's input and output data formats menu: - ['Training Config', 'config'] - ['Training Data', 'training'] - - ['Pretraining Data', 'pretraining'] - ['Vocabulary', 'vocab-jsonl'] - ['Pipeline Meta', 'meta'] --- @@ -131,7 +130,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. 
~~int~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | @@ -143,28 +142,26 @@ process that are used when you run [`spacy train`](/api/cli#train). | `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} This section is optional and defines settings and controls for -[language model pretraining](/usage/training#pretraining). It's used when you -run [`spacy pretrain`](/api/cli#pretrain). +[language model pretraining](/usage/embeddings-transformers#pretraining). It's +used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Description | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -| `max_epochs` | Maximum number of epochs. Defaults to `1000`. 
~~int~~ | -| `min_length` | Minimum length of examples. Defaults to `5`. ~~int~~ | -| `max_length` | Maximum length of examples. Defaults to `500`. ~~int~~ | -| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | -| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | -| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ | -| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ | -| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | -| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `corpus` | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ | +| `batcher` | Batcher for the training data. 
~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | +| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | ## Training data {#training} @@ -369,40 +366,6 @@ gold_dict = {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}} example = Example.from_dict(doc, gold_dict) ``` -## Pretraining data {#pretraining} - -The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the -"token-to-vector" embedding layer of pipeline components from raw text. Raw text -can be provided as a `.jsonl` (newline-delimited JSON) file containing one input -text per line (roughly paragraph length is good). Optionally, custom -tokenization can be provided. The JSONL format means that the texts can be read -in line-by-line, while still making it easy to represent newlines in the data. - -> #### Tip: Writing JSONL -> -> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a -> handy `write_jsonl` helper that takes a file path and list of dictionaries and -> writes out JSONL-formatted data. -> -> ```python -> import srsly -> data = [{"text": "Some text"}, {"text": "More..."}] -> srsly.write_jsonl("/path/to/text.jsonl", data) -> ``` - -| Key | Description | -| -------- | --------------------------------------------------------------------- | -| `text` | The raw input text. Is not required if `tokens` is available. ~~str~~ | -| `tokens` | Optional tokenization, one string per token. ~~List[str]~~ | - -```json -### Example -{"text": "Can I ask where you work now and what you do, and if you enjoy it?"} -{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} -{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. 
Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."} -{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]} -``` - ## Lexical data for vocabulary {#vocab-jsonl new="2"} To populate a pipeline's vocabulary, you can use the diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 38e2299fa..deae39f3d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -5,6 +5,7 @@ menu: - ['displacy', 'displacy'] - ['registry', 'registry'] - ['Loggers', 'loggers'] + - ['Readers', 'readers'] - ['Batchers', 'batchers'] - ['Data & Alignment', 'gold'] - ['Utility Functions', 'util'] @@ -363,7 +364,7 @@ results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers listed here, you can also [implement your own](/usage/training#custom-logging). -#### spacy.ConsoleLogger {#ConsoleLogger tag="registered function"} +#### ConsoleLogger {#ConsoleLogger tag="registered function"} > #### Example config > @@ -409,7 +410,7 @@ start decreasing across epochs. -#### spacy.WandbLogger {#WandbLogger tag="registered function"} +#### WandbLogger {#WandbLogger tag="registered function"} > #### Installation > @@ -451,6 +452,71 @@ remain in the config file stored on your local system. | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | +## Readers {#readers source="spacy/training/corpus.py" new="3"} + +Corpus readers are registered functions that load data and return a function +that takes the current `nlp` object and yields [`Example`](/api/example) objects +that can be used for [training](/usage/training) and +[pretraining](/usage/embeddings-transformers#pretraining). 
You can replace it
+with your own registered function in the
+[`@readers` registry](/api/top-level#registry) to customize the data loading and
+streaming.
+
+### Corpus {#corpus}
+
+The `Corpus` reader manages annotated corpora and can be used for training and
+development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
+the [`Corpus`](/api/corpus) class.
+
+> #### Example config
+>
+> ```ini
+> [paths]
+> train = "corpus/train.spacy"
+>
+> [training.train_corpus]
+> @readers = "spacy.Corpus.v1"
+> path = ${paths.train}
+> gold_preproc = false
+> max_length = 0
+> limit = 0
+> ```
+
+| Name | Description |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
+|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
+| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
+| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+
+### JsonlReader {#jsonlreader}
+
+Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
+file of texts keyed by `"text"`. Can be used to read the raw text corpus for
+language model [pretraining](/usage/embeddings-transformers#pretraining) from a
+JSONL file. Also see the [`JsonlTexts`](/api/corpus#jsonltexts) class. 
+ +> #### Example config +> +> ```ini +> [paths] +> pretrain = "corpus/raw_text.jsonl" +> +> [pretraining.corpus] +> @readers = "spacy.JsonlReader.v1" +> path = ${paths.pretrain} +> min_length = 0 +> max_length = 0 +> limit = 0 +> ``` + +| Name | Description | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects newline-delimited JSON with a key `"text"` for each record. ~~Union[str, Path]~~ | +| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | +| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | + ## Batchers {#batchers source="spacy/training/batchers.py" new="3"} A data batcher implements a batching strategy that essentially turns a stream of @@ -465,7 +531,7 @@ Instead of using one of the built-in batchers listed here, you can also [implement your own](/usage/training#custom-code-readers-batchers), which may or may not use a custom schedule. -#### batch_by_words {#batch_by_words tag="registered function"} +### batch_by_words {#batch_by_words tag="registered function"} Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by @@ -492,7 +558,7 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument | `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. 
~~Optional[Callable[[Any], int]]~~ |

-#### batch_by_sequence {#batch_by_sequence tag="registered function"}
+### batch_by_sequence {#batch_by_sequence tag="registered function"}

> #### Example config
>
Create a batcher that creates batches of the specified size.

| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |

-#### batch_by_padded {#batch_by_padded tag="registered function"}
+### batch_by_padded {#batch_by_padded tag="registered function"}

> #### Example config
>
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 87f832b88..4d30477e1 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -383,7 +383,7 @@ hints. The new version of spaCy's machine learning library
types for models and arrays, and a custom `mypy` plugin that can be used to
type-check model definitions.

-For data validation, spacy v3.0 adopts
+For data validation, spaCy v3.0 adopts
[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
-lets you to register **custom functions with typed arguments**, reference them
+lets you register **custom functions with typed arguments**, reference them