From 5fb8b7037aa67400a76a0545f85c3c15a2491fb6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 12 Jan 2021 17:17:00 +0100 Subject: [PATCH 1/2] Expand initialize/training config validation Validate both `[initialize]` and `[training]` in `debug config` and `nlp.initialize()` with separate config validation error blocks that indicate which block of the config is being validated. --- spacy/cli/debug_config.py | 7 ++++++- spacy/training/initialize.py | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index ed0a43921..5f9759c8c 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -7,7 +7,7 @@ import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli -from ..schemas import ConfigSchemaTraining +from ..schemas import ConfigSchemaInit, ConfigSchemaTraining from ..util import registry from .. import util @@ -55,6 +55,11 @@ def debug_config( config = util.load_config(config_path, overrides=overrides) nlp = util.load_model_from_config(config) config = nlp.config.interpolate() + msg.divider("Config validation for [initialize]") + with show_validation_error(config_path): + T = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + msg.divider("Config validation for [training]") + with show_validation_error(config_path): T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] util.resolve_dot_names(config, dot_names) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index e8a0a46de..c75c92136 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,4 +1,5 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING +from pydantic import BaseModel from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from 
pathlib import Path @@ -12,7 +13,7 @@ import tqdm from ..lookups import Lookups from ..vectors import Vectors from ..errors import Errors -from ..schemas import ConfigSchemaTraining +from ..schemas import ConfigSchemaInit, ConfigSchemaTraining from ..util import registry, load_model_from_config, resolve_dot_names, logger from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB @@ -23,6 +24,9 @@ if TYPE_CHECKING: def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": raw_config = config config = raw_config.interpolate() + # Validate config before accessing values + _validate_config_block(config, "initialize", ConfigSchemaInit) + _validate_config_block(config, "training", ConfigSchemaTraining) if config["training"]["seed"] is not None: fix_random_seed(config["training"]["seed"]) allocator = config["training"]["gpu_allocator"] @@ -269,3 +273,13 @@ def ensure_shape(lines): length = len(captured) yield f"{length} {width}" yield from captured + + +def _validate_config_block(config: Config, block: str, schema: BaseModel): + try: + registry.resolve(config[block], validate=True, schema=schema) + except ConfigValidationError as e: + title = f"Config validation error for [{block}]" + desc = "For more information run: python -m spacy debug config config.cfg" + err = ConfigValidationError.from_error(e, title=title, desc=desc) + raise err from None From 681a6195f740daa65bc5e2f7176523617dbef610 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 14 Jan 2021 16:57:57 +0100 Subject: [PATCH 2/2] Validate seed and gpu_allocator manually --- spacy/errors.py | 2 ++ spacy/training/initialize.py | 20 +++++--------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index a286a5ac3..8cbcbe6d9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -730,6 +730,8 @@ class Errors: "DocBin (.spacy) format. 
If your data is in spaCy v2's JSON " "training format, convert it using `python -m spacy convert " "file.json .`.") + E1015 = ("Can't initialize model from config: no {value} found. For more " + "information, run: python -m spacy debug config config.cfg") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index c75c92136..1947e7c27 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -1,5 +1,4 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING -from pydantic import BaseModel from thinc.api import Config, fix_random_seed, set_gpu_allocator from thinc.api import ConfigValidationError from pathlib import Path @@ -13,7 +12,7 @@ import tqdm from ..lookups import Lookups from ..vectors import Vectors from ..errors import Errors -from ..schemas import ConfigSchemaInit, ConfigSchemaTraining +from ..schemas import ConfigSchemaTraining from ..util import registry, load_model_from_config, resolve_dot_names, logger from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB @@ -24,9 +23,10 @@ if TYPE_CHECKING: def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": raw_config = config config = raw_config.interpolate() - # Validate config before accessing values - _validate_config_block(config, "initialize", ConfigSchemaInit) - _validate_config_block(config, "training", ConfigSchemaTraining) + if "seed" not in config["training"]: + raise ValueError(Errors.E1015.format(value="[training] seed")) + if "gpu_allocator" not in config["training"]: + raise ValueError(Errors.E1015.format(value="[training] gpu_allocator")) if config["training"]["seed"] is not None: fix_random_seed(config["training"]["seed"]) allocator = config["training"]["gpu_allocator"] @@ -273,13 +273,3 @@ def ensure_shape(lines): length = len(captured) yield f"{length} {width}" yield from captured - - -def _validate_config_block(config: Config, block: str, schema: 
BaseModel): - try: - registry.resolve(config[block], validate=True, schema=schema) - except ConfigValidationError as e: - title = f"Config validation error for [{block}]" - desc = "For more information run: python -m spacy debug config config.cfg" - err = ConfigValidationError.from_error(e, title=title, desc=desc) - raise err from None