mirror of https://github.com/explosion/spaCy.git
Merge pull request #6521 from explosion/feature/config-stdin
Allow reading config from stdin in spacy train
This commit is contained in:
commit
8921364579
|
@ -272,7 +272,11 @@ def show_validation_error(
|
|||
msg.fail(title)
|
||||
print(err.text.strip())
|
||||
if hint_fill and "value_error.missing" in err.error_types:
|
||||
config_path = file_path if file_path is not None else "config.cfg"
|
||||
config_path = (
|
||||
file_path
|
||||
if file_path is not None and str(file_path) != "-"
|
||||
else "config.cfg"
|
||||
)
|
||||
msg.text(
|
||||
"If your config contains missing values, you can run the 'init "
|
||||
"fill-config' command to fill in all the defaults, if possible:",
|
||||
|
|
|
@ -19,7 +19,7 @@ from .. import util
|
|||
def debug_config_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
|
||||
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
|
||||
|
|
|
@ -37,7 +37,7 @@ BLANK_MODEL_THRESHOLD = 2000
|
|||
def debug_data_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
||||
|
|
|
@ -22,7 +22,7 @@ from .. import util
|
|||
def debug_model_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
|
||||
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
|
||||
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
||||
|
|
|
@ -62,7 +62,7 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
|
|||
def init_pipeline_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
output_path: Path = Arg(..., help="Output directory for the prepared data"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
|
@ -88,7 +88,7 @@ def init_pipeline_cli(
|
|||
def init_labels_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
output_path: Path = Arg(..., help="Output directory for the labels"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
|
|
|
@ -17,7 +17,7 @@ from ..util import load_config
|
|||
def pretrain_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
|
||||
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
|
@ -79,7 +79,7 @@ def pretrain_cli(
|
|||
|
||||
|
||||
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
||||
if not config_path or not config_path.exists():
|
||||
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
if resume_path:
|
||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
|||
def train_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
|
@ -41,7 +41,7 @@ def train_cli(
|
|||
"""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
# Make sure all files and paths exists if they are needed
|
||||
if not config_path or not config_path.exists():
|
||||
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
if output_path is not None and not output_path.exists():
|
||||
output_path.mkdir(parents=True)
|
||||
|
|
|
@ -465,18 +465,24 @@ def load_config(
|
|||
) -> Config:
|
||||
"""Load a config file. Takes care of path validation and section order.
|
||||
|
||||
path (Union[str, Path]): Path to the config file.
|
||||
path (Union[str, Path]): Path to the config file or "-" to read from stdin.
|
||||
overrides: (Dict[str, Any]): Config overrides as nested dict or
|
||||
dict keyed by section values in dot notation.
|
||||
interpolate (bool): Whether to interpolate and resolve variables.
|
||||
RETURNS (Config): The loaded config.
|
||||
"""
|
||||
config_path = ensure_path(path)
|
||||
if not config_path.exists() or not config_path.is_file():
|
||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||
return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
|
||||
config_path, overrides=overrides, interpolate=interpolate
|
||||
)
|
||||
config = Config(section_order=CONFIG_SECTION_ORDER)
|
||||
if str(config_path) == "-": # read from standard input
|
||||
return config.from_str(
|
||||
sys.stdin.read(), overrides=overrides, interpolate=interpolate
|
||||
)
|
||||
else:
|
||||
if not config_path or not config_path.exists() or not config_path.is_file():
|
||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||
return config.from_disk(
|
||||
config_path, overrides=overrides, interpolate=interpolate
|
||||
)
|
||||
|
||||
|
||||
def load_config_from_str(
|
||||
|
|
|
@ -126,7 +126,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
|
|||
|
||||
| Name | Description |
|
||||
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
|
||||
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
|
||||
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
|
||||
|
@ -223,16 +223,16 @@ After generating the labels, you can provide them to components that accept a
|
|||
$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The best trained pipeline and the final checkpoint (if training is terminated). |
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The label files, one JSON file per component, written to `output_path`. |
|
||||
|
||||
## convert {#convert tag="command"}
|
||||
|
||||
|
@ -428,7 +428,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
|
|||
|
||||
| Name | Description |
|
||||
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ |
|
||||
| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
|
||||
|
@ -600,16 +600,16 @@ will not be available.
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Name | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
| Name | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
|
||||
### debug profile {#debug-profile tag="command"}
|
||||
|
||||
|
@ -742,22 +742,22 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
|
||||
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
|
||||
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
|
||||
| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ |
|
||||
| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ |
|
||||
| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ |
|
||||
| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ |
|
||||
| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ |
|
||||
| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ |
|
||||
| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
|
||||
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
|
||||
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
|
||||
| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ |
|
||||
| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ |
|
||||
| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ |
|
||||
| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ |
|
||||
| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ |
|
||||
| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ |
|
||||
| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
|
||||
## train {#train tag="command"}
|
||||
|
||||
|
@ -787,16 +787,16 @@ in the section `[paths]`.
|
|||
$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The final trained pipeline and the best trained pipeline. |
|
||||
| Name | Description |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The final trained pipeline and the best trained pipeline. |
|
||||
|
||||
## pretrain {#pretrain new="2.1" tag="command,experimental"}
|
||||
|
||||
|
@ -827,17 +827,17 @@ auto-generated by setting `--pretraining` on
|
|||
$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
|
||||
## evaluate {#evaluate new="2" tag="command"}
|
||||
|
||||
|
|
|
@ -264,6 +264,26 @@ defined in the config file.
|
|||
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
|
||||
```
|
||||
|
||||
### Reading from standard input {#config-stdin}
|
||||
|
||||
Setting the config path to `-` on the command line lets you read the config from
|
||||
standard input and pipe it forward from a different process, like
|
||||
[`init config`](/api/cli#init-config) or your own custom script. This is
|
||||
especially useful for quick experiments, as it lets you generate a config on the
|
||||
fly without having to save to and load from disk.
|
||||
|
||||
> #### 💡 Tip: Writing to stdout
|
||||
>
|
||||
> When you run `init config`, you can set the output path to `-` to write to
|
||||
> stdout. In a custom script, you can print the string config, e.g.
|
||||
> `print(nlp.config.to_str())`.
|
||||
|
||||
```cli
|
||||
$ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
|
||||
```
|
||||
|
||||
<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
|
||||
|
||||
### Using variable interpolation {#config-interpolation}
|
||||
|
||||
Another very useful feature of the config system is that it supports variable
|
||||
|
@ -378,7 +398,8 @@ weights and [resume training](/api/language#resume_training).
|
|||
If you don't want a component to be updated, you can **freeze** it by adding it
|
||||
to the `frozen_components` list in the `[training]` block. Frozen components are
|
||||
**not updated** during training and are included in the final trained pipeline
|
||||
as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
|
||||
as-is. They are also excluded when calling
|
||||
[`nlp.initialize`](/api/language#initialize).
|
||||
|
||||
> #### Note on frozen components
|
||||
>
|
||||
|
@ -551,8 +572,8 @@ or TensorFlow, make **custom modifications** to the `nlp` object, create custom
|
|||
optimizers or schedules, or **stream in data** and preprocess it on the fly
|
||||
while training.
|
||||
|
||||
Each custom function can have any number of arguments that are passed in via
|
||||
the [config](#config), just the built-in functions. If your function defines
|
||||
Each custom function can have any number of arguments that are passed in via the
|
||||
[config](#config), just like the built-in functions. If your function defines
|
||||
**default argument values**, spaCy is able to auto-fill your config when you run
|
||||
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
|
||||
given parameter is always explicitly set in the config, avoid setting a default
|
||||
|
@ -958,10 +979,10 @@ data assets, track changes and share your end-to-end processes with your team.
|
|||
</Infobox>
|
||||
|
||||
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
|
||||
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
|
||||
storage**, especially when packing multiple documents together. You can also
|
||||
create `Doc` objects manually, so you can write your own custom logic to convert
|
||||
and store existing annotations for use in spaCy.
|
||||
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in storage**,
|
||||
especially when packing multiple documents together. You can also create `Doc`
|
||||
objects manually, so you can write your own custom logic to convert and store
|
||||
existing annotations for use in spaCy.
|
||||
|
||||
```python
|
||||
### Training data from Doc objects {highlight="6-9"}
|
||||
|
@ -1300,10 +1321,10 @@ mapping so they know which worker owns which parameter.
|
|||
As training proceeds, every worker will be computing gradients for **all** of
|
||||
the model parameters. When they compute gradients for parameters they don't own,
|
||||
they'll **send them to the worker** that does own that parameter, along with a
|
||||
version identifier so that the owner can decide whether to discard the
|
||||
gradient. Workers use the gradients they receive and the ones they compute
|
||||
locally to update the parameters they own, and then broadcast the updated array
|
||||
and a new version ID to the other workers.
|
||||
version identifier so that the owner can decide whether to discard the gradient.
|
||||
Workers use the gradients they receive and the ones they compute locally to
|
||||
update the parameters they own, and then broadcast the updated array and a new
|
||||
version ID to the other workers.
|
||||
|
||||
This training procedure is **asynchronous** and **non-blocking**. Workers always
|
||||
push their gradient increments and parameter updates, they do not have to pull
|
||||
|
|
|
@ -120,52 +120,65 @@ function parseArgs(raw) {
|
|||
return result
|
||||
}
|
||||
|
||||
function convertLine(line, i) {
|
||||
console.log(line, i)
|
||||
const cliRegex = /^(\$ )?python -m spacy/
|
||||
if (cliRegex.test(line)) {
|
||||
const text = line.replace(cliRegex, '')
|
||||
const args = parseArgs(text)
|
||||
const cmd = Object.keys(args).map((key, i) => {
|
||||
const value = args[key]
|
||||
return value === null || value === true || i === 0 ? key : `${key} ${value}`
|
||||
})
|
||||
return (
|
||||
<Fragment key={line}>
|
||||
<span data-prompt={i === 0 ? '$' : null} className={classes.cliArgSubtle}>
|
||||
python -m
|
||||
</span>{' '}
|
||||
<span>spacy</span>{' '}
|
||||
{cmd.map((item, j) => {
|
||||
const isCmd = j === 0
|
||||
const url = isCmd ? `/api/cli#${item.replace(' ', '-')}` : null
|
||||
const isAbstract = isString(item) && /^\[(.+)\]$/.test(item)
|
||||
const itemClassNames = classNames(classes.cliArg, {
|
||||
[classes.cliArgHighlight]: isCmd,
|
||||
[classes.cliArgEmphasis]: isAbstract,
|
||||
})
|
||||
const text = isAbstract ? item.slice(1, -1) : item
|
||||
return (
|
||||
<Fragment key={j}>
|
||||
{j !== 0 && ' '}
|
||||
<span className={itemClassNames}>
|
||||
<OptionalLink hidden hideIcon to={url}>
|
||||
{text}
|
||||
</OptionalLink>
|
||||
</span>
|
||||
</Fragment>
|
||||
)
|
||||
})}
|
||||
</Fragment>
|
||||
)
|
||||
}
|
||||
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
|
||||
return htmlToReact(htmlLine)
|
||||
}
|
||||
|
||||
function formatCode(html, lang, prompt) {
|
||||
if (lang === 'cli') {
|
||||
const cliRegex = /^(\$ )?python -m spacy/
|
||||
const lines = html
|
||||
.trim()
|
||||
.split('\n')
|
||||
.map((line, i) => {
|
||||
if (cliRegex.test(line)) {
|
||||
const text = line.replace(cliRegex, '')
|
||||
const args = parseArgs(text)
|
||||
const cmd = Object.keys(args).map((key, i) => {
|
||||
const value = args[key]
|
||||
return value === null || value === true || i === 0 ? key : `${key} ${value}`
|
||||
})
|
||||
return (
|
||||
<Fragment key={i}>
|
||||
<span data-prompt="$" className={classes.cliArgSubtle}>
|
||||
python -m
|
||||
</span>{' '}
|
||||
<span>spacy</span>{' '}
|
||||
{cmd.map((item, j) => {
|
||||
const isCmd = j === 0
|
||||
const url = isCmd ? `/api/cli#${item.replace(' ', '-')}` : null
|
||||
const isAbstract = isString(item) && /^\[(.+)\]$/.test(item)
|
||||
const itemClassNames = classNames(classes.cliArg, {
|
||||
[classes.cliArgHighlight]: isCmd,
|
||||
[classes.cliArgEmphasis]: isAbstract,
|
||||
})
|
||||
const text = isAbstract ? item.slice(1, -1) : item
|
||||
return (
|
||||
<Fragment key={j}>
|
||||
{j !== 0 && ' '}
|
||||
<span className={itemClassNames}>
|
||||
<OptionalLink hidden hideIcon to={url}>
|
||||
{text}
|
||||
</OptionalLink>
|
||||
</span>
|
||||
</Fragment>
|
||||
)
|
||||
})}
|
||||
.map(line =>
|
||||
line
|
||||
.split(' | ')
|
||||
.map((l, i) => convertLine(l, i))
|
||||
.map((l, j) => (
|
||||
<Fragment>
|
||||
{j !== 0 && <span> | </span>}
|
||||
{l}
|
||||
</Fragment>
|
||||
)
|
||||
}
|
||||
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
|
||||
return htmlToReact(htmlLine)
|
||||
})
|
||||
))
|
||||
)
|
||||
return lines.map((line, i) => (
|
||||
<Fragment key={i}>
|
||||
{i !== 0 && <br />}
|
||||
|
|
Loading…
Reference in New Issue