diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
index 477df11fc..95c2f28bd 100644
--- a/examples/experiments/onto-joint/defaults.cfg
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -48,7 +48,7 @@ use_averages = false
 eps = 1e-8
 #learn_rate = 0.001
 
-[optimizer.learn_rate]
+[training.optimizer.learn_rate]
 @schedules = "warmup_linear.v1"
 warmup_steps = 250
 total_steps = 20000
@@ -56,8 +56,11 @@ initial_rate = 0.001
 
 [nlp]
 lang = "en"
+base_model = null
 vectors = null
 
+[nlp.pipeline]
+
 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
 
diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg
index cc63ba7a1..211339603 100644
--- a/examples/experiments/onto-joint/pretrain.cfg
+++ b/examples/experiments/onto-joint/pretrain.cfg
@@ -25,6 +25,11 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
 discard_oversize = false
+omit_extra_lookups = false
+batch_by = "words"
+use_gpu = -1
+raw_text = null
+tag_map = null
 
 [training.batch_size]
 @schedules = "compounding.v1"
@@ -72,6 +77,9 @@ learn_rate = 0.001
 [nlp]
 lang = "en"
 vectors = null
+base_model = null
+
+[nlp.pipeline]
 
 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 4596a9d32..125584643 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List
+from typing import Dict, Any, Union, List, Optional
 from pathlib import Path
 from wasabi import msg
 import srsly
@@ -11,6 +11,7 @@ from configparser import InterpolationError
 import sys
 
 from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file
 
 
 PROJECT_FILE = "project.yml"
@@ -172,3 +173,16 @@ def show_validation_error(title: str = "Config validation error"):
         msg.fail(title, spaced=True)
         print(str(e).replace("Config validation error", "").strip())
         sys.exit(1)
+
+
+def import_code(code_path: Optional[Union[Path, str]]) -> None:
+    """Helper to import Python file provided in training commands / commands
+    using the config. This makes custom registered functions available.
+    """
+    if code_path is not None:
+        if not Path(code_path).exists():
+            msg.fail("Path to Python code not found", code_path, exits=1)
+        try:
+            import_file("python_code", code_path)
+        except Exception as e:
+            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
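Note: the new `import_code` helper centralizes the logic that `train_cli` previously inlined (see the `spacy/cli/train.py` hunks further down), so every config-based command can offer `--code-path`. A minimal usage sketch — the file name and its contents are hypothetical, not part of this diff:

```python
from spacy.cli._util import import_code

# Hypothetical user file that registers custom functions (e.g. architectures
# or other functions decorated with spaCy's registry decorators). Passing
# None is a no-op; a missing path or a failing import exits with a
# CLI-friendly message instead of a raw traceback.
import_code("./custom_functions.py")
```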
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index df3236511..84ee8a757 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,15 +1,18 @@
-from typing import Optional, List, Sequence, Dict, Any, Tuple
+from typing import List, Sequence, Dict, Any, Tuple, Optional
 from pathlib import Path
 from collections import Counter
 import sys
 import srsly
 from wasabi import Printer, MESSAGES
+import typer
 
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
+from ._util import import_code
+from ..schemas import ConfigSchema
 from ..gold import Corpus, Example
 from ..syntax import nonproj
 from ..language import Language
-from ..util import load_model, get_lang_class
+from .. import util
 
 
 # Minimum number of expected occurrences of NER label in data to train new label
@@ -24,12 +27,11 @@ BLANK_MODEL_THRESHOLD = 2000
 @app.command("debug-data")
 def debug_data_cli(
     # fmt: off
-    lang: str = Arg(..., help="Model language"),
+    ctx: typer.Context,  # This is only used to read additional arguments
     train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
     dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
-    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
-    base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
-    pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@@ -40,13 +42,13 @@ def debug_data_cli(
     stats, and find problems like invalid entity annotations, cyclic
     dependencies, low data labels and more.
     """
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
     debug_data(
-        lang,
         train_path,
         dev_path,
-        tag_map_path=tag_map_path,
-        base_model=base_model,
-        pipeline=[p.strip() for p in pipeline.split(",")],
+        config_path,
+        config_overrides=overrides,
         ignore_warnings=ignore_warnings,
         verbose=verbose,
         no_format=no_format,
@@ -55,13 +57,11 @@ def debug_data_cli(
 
 
 def debug_data(
-    lang: str,
     train_path: Path,
     dev_path: Path,
+    config_path: Path,
     *,
-    tag_map_path: Optional[Path] = None,
-    base_model: Optional[str] = None,
-    pipeline: List[str] = ["tagger", "parser", "ner"],
+    config_overrides: Dict[str, Any] = {},
     ignore_warnings: bool = False,
     verbose: bool = False,
     no_format: bool = True,
@@ -75,25 +75,27 @@ def debug_data(
         msg.fail("Training data not found", train_path, exits=1)
     if not dev_path.exists():
         msg.fail("Development data not found", dev_path, exits=1)
-
+    if not config_path.exists():
+        msg.fail("Config file not found", config_path, exits=1)
+    with show_validation_error():
+        config = util.load_config(
+            config_path,
+            create_objects=False,
+            schema=ConfigSchema,
+            overrides=config_overrides,
+        )
+    nlp = util.load_model_from_config(config["nlp"])
+    lang = config["nlp"]["lang"]
+    base_model = config["nlp"]["base_model"]
+    pipeline = list(config["nlp"]["pipeline"].keys())
+    tag_map_path = util.ensure_path(config["training"]["tag_map"])
     tag_map = {}
     if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
-
-    # Initialize the model and pipeline
-    if base_model:
-        nlp = load_model(base_model)
-    else:
-        lang_cls = get_lang_class(lang)
-        nlp = lang_cls()
     # Update tag map with provided mapping
     nlp.vocab.morphology.tag_map.update(tag_map)
 
-    msg.divider("Data format validation")
-
-    # TODO: Validate data format using the JSON schema
-    # TODO: update once the new format is ready
-    # TODO: move validation to GoldCorpus in order to be able to load from dir
+    msg.divider("Data file validation")
 
     # Create the gold corpus to be able to better analyze data
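Note: `debug-data` now derives everything it previously took as separate CLI arguments from the config. A rough sketch of that flow outside the CLI, mirroring the hunk above — the config path and the override are placeholders, and the dotted override key assumes the format produced by `parse_config_overrides`:

```python
from spacy import util
from spacy.schemas import ConfigSchema

# Load the config as a plain dict (create_objects=False), validated against
# the schema, with CLI-style overrides applied on top.
config = util.load_config(
    "config.cfg",
    create_objects=False,
    schema=ConfigSchema,
    overrides={"nlp.lang": "en"},
)
nlp = util.load_model_from_config(config["nlp"])
# What used to be --base-model, --pipeline and --tag-map-path:
base_model = config["nlp"]["base_model"]
pipeline = list(config["nlp"]["pipeline"].keys())
tag_map_path = util.ensure_path(config["training"]["tag_map"])
```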
     loading_train_error_message = ""
@@ -380,7 +382,7 @@ def debug_data(
         if gold_dev_data["n_nonproj"] > 0:
             n_nonproj = gold_dev_data["n_nonproj"]
             msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
-        msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
+        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
         msg.info(f"{len(labels_train)} label(s) in projectivized train data")
         labels_with_counts = _format_labels(
             gold_train_unpreprocessed_data["deps"].most_common(), counts=True
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index d1fa71fd7..c509bb945 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict, Any
 import random
 import numpy
 import time
@@ -11,8 +11,11 @@ from thinc.api import CosineDistance, L2Distance
 from wasabi import msg
 import srsly
 from functools import partial
+import typer
 
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code
+from ..schemas import ConfigSchema
 from ..errors import Errors
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
@@ -24,10 +27,11 @@ from .. import util
 @app.command("pretrain")
 def pretrain_cli(
     # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
     texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
     output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
-    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
     # fmt: on
@@ -51,11 +55,13 @@ def pretrain_cli(
     all settings are the same between pretraining and training. Ideally,
     this is done by using the same config file for both commands.
     """
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
     pretrain(
         texts_loc,
         output_dir,
         config_path,
-        use_gpu=use_gpu,
+        config_overrides=overrides,
         resume_path=resume_path,
         epoch_resume=epoch_resume,
     )
@@ -65,24 +71,34 @@ def pretrain(
     texts_loc: Path,
     output_dir: Path,
     config_path: Path,
-    use_gpu: int = -1,
+    config_overrides: Dict[str, Any] = {},
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
 ):
-    verify_cli_args(**locals())
+    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
+    msg.info(f"Loading config from: {config_path}")
+    with show_validation_error():
+        config = util.load_config(
+            config_path,
+            create_objects=False,
+            validate=True,
+            schema=ConfigSchema,
+            overrides=config_overrides,
+        )
 
     if not output_dir.exists():
         output_dir.mkdir()
         msg.good(f"Created output directory: {output_dir}")
 
+    use_gpu = config["training"]["use_gpu"]
     if use_gpu >= 0:
         msg.info("Using GPU")
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
 
-    msg.info(f"Loading config from: {config_path}")
-    config = util.load_config(config_path, create_objects=False)
-    fix_random_seed(config["pretraining"]["seed"])
+    seed = config["pretraining"]["seed"]
+    if seed is not None:
+        fix_random_seed(seed)
     if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
         use_pytorch_for_gpu_memory()
@@ -360,9 +376,7 @@ def _smart_round(figure, width=10, max_decimal=4):
     return format_str % figure
 
 
-def verify_cli_args(
-    texts_loc, output_dir, config_path, use_gpu, resume_path, epoch_resume
-):
+def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_dir.exists() and [p for p in output_dir.iterdir()]:
@@ -401,10 +415,3 @@
             f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
             exits=True,
         )
-    config = util.load_config(config_path, create_objects=False)
-    if config["pretraining"]["objective"]["type"] == "vectors":
-        if not config["nlp"]["vectors"]:
-            msg.fail(
-                "Must specify nlp.vectors if pretraining.objective.type is vectors",
-                exits=True,
-            )
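Note: the GPU device and the random seed for pretraining now come from the config instead of CLI flags. A minimal sketch of the new resolution logic — the config dict below is a made-up stand-in for the loaded config, and the thinc imports are assumed to match what `pretrain.py` already uses:

```python
from thinc.api import fix_random_seed, require_gpu

# Hypothetical stand-in for the loaded config dict
config = {
    "training": {"use_gpu": -1},
    "pretraining": {"seed": 0},
}

use_gpu = config["training"]["use_gpu"]  # replaces the removed --use-gpu flag
if use_gpu >= 0:
    require_gpu(use_gpu)

seed = config["pretraining"]["seed"]
if seed is not None:  # seed is Optional now; None skips seeding entirely
    fix_random_seed(seed)
```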
""" + overrides = parse_config_overrides(ctx.args) + import_code(code_path) pretrain( texts_loc, output_dir, config_path, - use_gpu=use_gpu, + config_overrides=overrides, resume_path=resume_path, epoch_resume=epoch_resume, ) @@ -65,24 +71,34 @@ def pretrain( texts_loc: Path, output_dir: Path, config_path: Path, - use_gpu: int = -1, + config_overrides: Dict[str, Any] = {}, resume_path: Optional[Path] = None, epoch_resume: Optional[int] = None, ): - verify_cli_args(**locals()) + verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume) + msg.info(f"Loading config from: {config_path}") + with show_validation_error(): + config = util.load_config( + config_path, + create_objects=False, + validate=True, + schema=ConfigSchema, + overrides=config_overrides, + ) if not output_dir.exists(): output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") + use_gpu = config["training"]["use_gpu"] if use_gpu >= 0: msg.info("Using GPU") require_gpu(use_gpu) else: msg.info("Using CPU") - msg.info(f"Loading config from: {config_path}") - config = util.load_config(config_path, create_objects=False) - fix_random_seed(config["pretraining"]["seed"]) + seed = config["pretraining"]["seed"] + if seed is not None: + fix_random_seed(seed) if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]: use_pytorch_for_gpu_memory() @@ -360,9 +376,7 @@ def _smart_round(figure, width=10, max_decimal=4): return format_str % figure -def verify_cli_args( - texts_loc, output_dir, config_path, use_gpu, resume_path, epoch_resume -): +def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if output_dir.exists() and [p for p in output_dir.iterdir()]: @@ -401,10 +415,3 @@ def verify_cli_args( f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid", exits=True, ) - config = util.load_config(config_path, create_objects=False) - if config["pretraining"]["objective"]["type"] == "vectors": - if not config["nlp"]["vectors"]: - msg.fail( - "Must specify nlp.vectors if pretraining.objective.type is vectors", - exits=True, - ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 4707a8e98..feebc30d4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -11,6 +11,7 @@ import random import typer from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error +from ._util import import_code from ..gold import Corpus, Example from ..lookups import Lookups from .. 
import util @@ -53,17 +54,10 @@ def train_cli( """ util.set_env_log(verbose) verify_cli_args( - train_path=train_path, - dev_path=dev_path, - config_path=config_path, - code_path=code_path, + train_path=train_path, dev_path=dev_path, config_path=config_path, ) overrides = parse_config_overrides(ctx.args) - if code_path is not None: - try: - util.import_file("python_code", code_path) - except Exception as e: - msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) + import_code(code_path) train( config_path, {"train": train_path, "dev": dev_path}, @@ -503,7 +497,6 @@ def verify_cli_args( dev_path: Path, config_path: Path, output_path: Optional[Path] = None, - code_path: Optional[Path] = None, ): # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): @@ -524,9 +517,6 @@ def verify_cli_args( "the specified output path doesn't exist, the directory will be " "created for you.", ) - if code_path is not None: - if not code_path.exists(): - msg.fail("Path to Python code not found", code_path, exits=1) def verify_textcat_config(nlp, nlp_config): diff --git a/spacy/schemas.py b/spacy/schemas.py index e0776c56e..10280afa6 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional, Sequence, Any from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool -from pydantic import FilePath, DirectoryPath +from pydantic import FilePath, DirectoryPath, root_validator from collections import defaultdict from thinc.api import Model, Optimizer @@ -242,6 +242,7 @@ class ConfigSchemaPipeline(BaseModel): class ConfigSchemaNlp(BaseModel): lang: StrictStr = Field(..., title="The base language to use") + base_model: Optional[StrictStr] = Field(..., title="The base model to use") vectors: Optional[DirectoryPath] = Field(..., title="Path to vectors") pipeline: Optional[ConfigSchemaPipeline] @@ -250,9 +251,40 @@ class ConfigSchemaNlp(BaseModel): arbitrary_types_allowed = True +class ConfigSchemaPretrain(BaseModel): + # fmt: off + max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") + min_length: StrictInt = Field(..., title="Minimum length of examples") + max_length: StrictInt = Field(..., title="Maximum length of examples") + dropout: StrictFloat = Field(..., title="Dropout rate") + n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") + batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule") + seed: Optional[StrictInt] = Field(..., title="Random seed") + use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch") + tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. nlp.pipeline.tok2vec.model") + optimizer: Optimizer = Field(..., title="The optimizer to use") + # TODO: use a more detailed schema for this? 
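Note: the `root_validator` added to `ConfigSchema` replaces the ad-hoc check removed from `verify_cli_args` in `pretrain.py`, so the vectors/objective dependency is enforced at schema-validation time. A standalone illustration of the pattern — not spaCy's actual schema, and assuming pydantic v1, which is what this code targets:

```python
from typing import Optional
from pydantic import BaseModel, ValidationError, root_validator


class Pretrain(BaseModel):
    objective_type: str


class TopLevelConfig(BaseModel):
    vectors: Optional[str]
    pretraining: Optional[Pretrain]

    @root_validator
    def check_vectors(cls, values):
        # Cross-field check: a "vectors" objective requires vectors to be set
        pt = values.get("pretraining")
        if pt and pt.objective_type == "vectors" and not values.get("vectors"):
            raise ValueError("vectors must be set when the objective is 'vectors'")
        return values


try:
    TopLevelConfig(vectors=None, pretraining=Pretrain(objective_type="vectors"))
except ValidationError as err:
    print(err)
```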
+    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
+    pretraining: Optional[ConfigSchemaPretrain]
+
+    @root_validator
+    def validate_config(cls, values):
+        """Perform additional validation for settings with dependencies."""
+        pt = values.get("pretraining")
+        if pt and pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
+            err = "Need nlp.vectors if pretraining.objective.type is vectors"
+            raise ValueError(err)
+        return values
 
     class Config:
         extra = "allow"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 870a980f2..85346de90 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -12,6 +12,8 @@ nlp_config_string = """
 [nlp]
 lang = "en"
 
+[nlp.pipeline]
+
 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
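Note: the recurring `+[nlp.pipeline]` additions in the example configs and the test fixture declare the parent section explicitly, presumably so `config["nlp"]["pipeline"]` is always present even before any component blocks are filled in. A minimal `[nlp]` block in the new shape, written as a Python string in the same style as the test fixture (abbreviated — a real config also needs the `[nlp.pipeline.tok2vec.model]` sub-sections):

```python
# Abbreviated example of the new [nlp] layout used throughout this diff
nlp_config_example = """
[nlp]
lang = "en"
base_model = null
vectors = null

[nlp.pipeline]

[nlp.pipeline.tok2vec]
factory = "tok2vec"
"""
```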