Merge pull request #6521 from explosion/feature/config-stdin

Allow reading config from stdin in spacy train
This commit is contained in:
Ines Montani 2020-12-08 22:07:43 +11:00 committed by GitHub
commit 8921364579
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 170 additions and 126 deletions

View File

@ -272,7 +272,11 @@ def show_validation_error(
msg.fail(title)
print(err.text.strip())
if hint_fill and "value_error.missing" in err.error_types:
config_path = file_path if file_path is not None else "config.cfg"
config_path = (
file_path
if file_path is not None and str(file_path) != "-"
else "config.cfg"
)
msg.text(
"If your config contains missing values, you can run the 'init "
"fill-config' command to fill in all the defaults, if possible:",

View File

@ -19,7 +19,7 @@ from .. import util
def debug_config_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")

View File

@ -37,7 +37,7 @@ BLANK_MODEL_THRESHOLD = 2000
def debug_data_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),

View File

@ -22,7 +22,7 @@ from .. import util
def debug_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),

View File

@ -62,7 +62,7 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
def init_pipeline_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
@ -88,7 +88,7 @@ def init_pipeline_cli(
def init_labels_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the labels"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),

View File

@ -17,7 +17,7 @@ from ..util import load_config
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
@ -79,7 +79,7 @@ def pretrain_cli(
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or not config_path.exists():
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
if resume_path:

View File

@ -18,7 +18,7 @@ from .. import util
def train_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
@ -41,7 +41,7 @@ def train_cli(
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
# Make sure all files and paths exists if they are needed
if not config_path or not config_path.exists():
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
if output_path is not None and not output_path.exists():
output_path.mkdir(parents=True)

View File

@ -465,18 +465,24 @@ def load_config(
) -> Config:
"""Load a config file. Takes care of path validation and section order.
path (Union[str, Path]): Path to the config file.
path (Union[str, Path]): Path to the config file or "-" to read from stdin.
overrides: (Dict[str, Any]): Config overrides as nested dict or
dict keyed by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
config_path = ensure_path(path)
if not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
config = Config(section_order=CONFIG_SECTION_ORDER)
if str(config_path) == "-": # read from standard input
return config.from_str(
sys.stdin.read(), overrides=overrides, interpolate=interpolate
)
else:
if not config_path or not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
return config.from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
def load_config_from_str(

View File

@ -126,7 +126,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
| Name | Description |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
@ -223,16 +223,16 @@ After generating the labels, you can provide them to components that accept a
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The best trained pipeline and the final checkpoint (if training is terminated). |
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files: one JSON file per component, written to `output_path`. |
## convert {#convert tag="command"}
@ -428,7 +428,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
| Name | Description |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ |
| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
@ -600,16 +600,16 @@ will not be available.
</Accordion>
| Name | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |
| Name | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |
### debug profile {#debug-profile tag="command"}
@ -742,22 +742,22 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
</Accordion>
| Name | Description |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ |
| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ |
| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ |
| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ |
| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ |
| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ |
| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Debugging information. |
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ |
| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ |
| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ |
| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ |
| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ |
| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ |
| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Debugging information. |
## train {#train tag="command"}
@ -787,16 +787,16 @@ in the section `[paths]`.
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final trained pipeline and the best trained pipeline. |
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final trained pipeline and the best trained pipeline. |
## pretrain {#pretrain new="2.1" tag="command,experimental"}
@ -827,17 +827,17 @@ auto-generated by setting `--pretraining` on
$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
```
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
## evaluate {#evaluate new="2" tag="command"}

View File

@ -264,6 +264,26 @@ defined in the config file.
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
```
### Reading from standard input {#config-stdin}
Setting the config path to `-` on the command line lets you read the config from
standard input and pipe it forward from a different process, like
[`init config`](/api/cli#init-config) or your own custom script. This is
especially useful for quick experiments, as it lets you generate a config on the
fly without having to save to and load from disk.
> #### 💡 Tip: Writing to stdout
>
> When you run `init config`, you can set the output path to `-` to write to
> stdout. In a custom script, you can print the string config, e.g.
> `print(nlp.config.to_str())`.
```cli
$ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
```
<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
@ -378,7 +398,8 @@ weights and [resume training](/api/language#resume_training).
If you don't want a component to be updated, you can **freeze** it by adding it
to the `frozen_components` list in the `[training]` block. Frozen components are
**not updated** during training and are included in the final trained pipeline
as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
as-is. They are also excluded when calling
[`nlp.initialize`](/api/language#initialize).
> #### Note on frozen components
>
@ -551,8 +572,8 @@ or TensorFlow, make **custom modifications** to the `nlp` object, create custom
optimizers or schedules, or **stream in data** and preprocess it on the fly
while training.
Each custom function can have any number of arguments that are passed in via
the [config](#config), just the built-in functions. If your function defines
Each custom function can have any number of arguments that are passed in via the
[config](#config), just like the built-in functions. If your function defines
**default argument values**, spaCy is able to auto-fill your config when you run
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
given parameter is always explicitly set in the config, avoid setting a default
@ -958,10 +979,10 @@ data assets, track changes and share your end-to-end processes with your team.
</Infobox>
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
storage**, especially when packing multiple documents together. You can also
create `Doc` objects manually, so you can write your own custom logic to convert
and store existing annotations for use in spaCy.
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in storage**,
especially when packing multiple documents together. You can also create `Doc`
objects manually, so you can write your own custom logic to convert and store
existing annotations for use in spaCy.
```python
### Training data from Doc objects {highlight="6-9"}
@ -1300,10 +1321,10 @@ mapping so they know which worker owns which parameter.
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
version identifier so that the owner can decide whether to discard the gradient.
Workers use the gradients they receive and the ones they compute locally to
update the parameters they own, and then broadcast the updated array and a new
version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates, they do not have to pull

View File

@ -120,52 +120,65 @@ function parseArgs(raw) {
return result
}
function convertLine(line, i) {
console.log(line, i)
const cliRegex = /^(\$ )?python -m spacy/
if (cliRegex.test(line)) {
const text = line.replace(cliRegex, '')
const args = parseArgs(text)
const cmd = Object.keys(args).map((key, i) => {
const value = args[key]
return value === null || value === true || i === 0 ? key : `${key} ${value}`
})
return (
<Fragment key={line}>
<span data-prompt={i === 0 ? '$' : null} className={classes.cliArgSubtle}>
python -m
</span>{' '}
<span>spacy</span>{' '}
{cmd.map((item, j) => {
const isCmd = j === 0
const url = isCmd ? `/api/cli#${item.replace(' ', '-')}` : null
const isAbstract = isString(item) && /^\[(.+)\]$/.test(item)
const itemClassNames = classNames(classes.cliArg, {
[classes.cliArgHighlight]: isCmd,
[classes.cliArgEmphasis]: isAbstract,
})
const text = isAbstract ? item.slice(1, -1) : item
return (
<Fragment key={j}>
{j !== 0 && ' '}
<span className={itemClassNames}>
<OptionalLink hidden hideIcon to={url}>
{text}
</OptionalLink>
</span>
</Fragment>
)
})}
</Fragment>
)
}
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
return htmlToReact(htmlLine)
}
function formatCode(html, lang, prompt) {
if (lang === 'cli') {
const cliRegex = /^(\$ )?python -m spacy/
const lines = html
.trim()
.split('\n')
.map((line, i) => {
if (cliRegex.test(line)) {
const text = line.replace(cliRegex, '')
const args = parseArgs(text)
const cmd = Object.keys(args).map((key, i) => {
const value = args[key]
return value === null || value === true || i === 0 ? key : `${key} ${value}`
})
return (
<Fragment key={i}>
<span data-prompt="$" className={classes.cliArgSubtle}>
python -m
</span>{' '}
<span>spacy</span>{' '}
{cmd.map((item, j) => {
const isCmd = j === 0
const url = isCmd ? `/api/cli#${item.replace(' ', '-')}` : null
const isAbstract = isString(item) && /^\[(.+)\]$/.test(item)
const itemClassNames = classNames(classes.cliArg, {
[classes.cliArgHighlight]: isCmd,
[classes.cliArgEmphasis]: isAbstract,
})
const text = isAbstract ? item.slice(1, -1) : item
return (
<Fragment key={j}>
{j !== 0 && ' '}
<span className={itemClassNames}>
<OptionalLink hidden hideIcon to={url}>
{text}
</OptionalLink>
</span>
</Fragment>
)
})}
.map(line =>
line
.split(' | ')
.map((l, i) => convertLine(l, i))
.map((l, j) => (
<Fragment>
{j !== 0 && <span> | </span>}
{l}
</Fragment>
)
}
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
return htmlToReact(htmlLine)
})
))
)
return lines.map((line, i) => (
<Fragment key={i}>
{i !== 0 && <br />}