diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6cf4d79c8..45520978b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,13 +1,12 @@ -from typing import Optional, Dict, List, Union, Sequence +from typing import Optional, Dict from timeit import default_timer as timer import srsly import tqdm -from pydantic import BaseModel, FilePath from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import Model, use_pytorch_for_gpu_memory, require_gpu, fix_random_seed +from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed import random from ._app import app, Arg, Opt @@ -15,108 +14,15 @@ from ..gold import Corpus, Example from ..lookups import Lookups from .. import util from ..errors import Errors +from ..schemas import ConfigSchema + # Don't remove - required to load the built-in architectures from ..ml import models # noqa: F401 -# from ..schemas import ConfigSchema # TODO: include? - registry = util.registry -CONFIG_STR = """ -[training] -patience = 10 -eval_frequency = 10 -dropout = 0.2 -init_tok2vec = null -max_epochs = 100 -orth_variant_level = 0.0 -gold_preproc = false -max_length = 0 -use_gpu = 0 -scores = ["ents_p", "ents_r", "ents_f"] -score_weights = {"ents_f": 1.0} -limit = 0 - -[training.batch_size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 - -[optimizer] -@optimizers = "Adam.v1" -learn_rate = 0.001 -beta1 = 0.9 -beta2 = 0.999 - -[nlp] -lang = "en" -vectors = null - -[nlp.pipeline.tok2vec] -factory = "tok2vec" - -[nlp.pipeline.ner] -factory = "ner" - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 -hidden_width = 64 -maxout_pieces = 3 - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 128 -depth = 4 
-window_size = 1 -embed_size = 10000 -maxout_pieces = 3 -subword_features = true -""" - - -class PipelineComponent(BaseModel): - factory: str - model: Model - - class Config: - arbitrary_types_allowed = True - - -class ConfigSchema(BaseModel): - optimizer: Optional["Optimizer"] - - class training(BaseModel): - patience: int = 10 - eval_frequency: int = 100 - dropout: float = 0.2 - init_tok2vec: Optional[FilePath] = None - max_epochs: int = 100 - orth_variant_level: float = 0.0 - gold_preproc: bool = False - max_length: int = 0 - use_gpu: int = 0 - scores: List[str] = ["ents_p", "ents_r", "ents_f"] - score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} - limit: int = 0 - batch_size: Union[Sequence[int], int] - - class nlp(BaseModel): - lang: str - vectors: Optional[str] - pipeline: Optional[Dict[str, PipelineComponent]] - - class Config: - extra = "allow" - @app.command("train") def train_cli( @@ -126,12 +32,7 @@ def train_cli( config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental."), - raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), - tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), - omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), # fmt: on ): """ @@ -141,33 +42,12 @@ def train_cli( """ util.set_env_log(verbose) verify_cli_args(**locals()) - - if raw_text is not None: - raw_text = list(srsly.read_jsonl(raw_text)) - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - - weights_data = None - if init_tok2vec is not None: - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - - if use_gpu >= 0: - msg.info("Using GPU: {use_gpu}") - require_gpu(use_gpu) - else: - msg.info("Using CPU") - - train( - config_path, - {"train": train_path, "dev": dev_path}, - output_path=output_path, - raw_text=raw_text, - tag_map=tag_map, - weights_data=weights_data, - omit_extra_lookups=omit_extra_lookups, - ) + if code_path is not None: + try: + util.import_file("python_code", code_path) + except Exception as e: + msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) + train(config_path, {"train": train_path, "dev": dev_path}, output_path=output_path) def train( @@ -175,19 +55,24 @@ def train( data_paths: Dict[str, Path], raw_text: Optional[Path] = None, output_path: Optional[Path] = None, - tag_map: Optional[Path] = None, weights_data: Optional[bytes] = None, - omit_extra_lookups: bool = False, ) -> None: msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config - config = util.load_config(config_path, create_objects=False) + config = util.load_config(config_path, 
create_objects=False, schema=ConfigSchema) + use_gpu = config["training"]["use_gpu"] + if use_gpu >= 0: + msg.info(f"Using GPU: {use_gpu}") + require_gpu(use_gpu) + else: + msg.info("Using CPU") + raw_text, tag_map, weights_data = load_from_paths(config) fix_random_seed(config["training"]["seed"]) if config["training"].get("use_pytorch_for_gpu_memory"): # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() nlp_config = config["nlp"] - config = util.load_config(config_path, create_objects=True) + config = util.load_config(config_path, create_objects=True, schema=ConfigSchema) training = config["training"] msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) @@ -216,7 +101,7 @@ def train( # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed - if omit_extra_lookups: + if config["training"]["omit_extra_lookups"]: nlp.vocab.lookups_extra = Lookups() nlp.vocab.lookups_extra.add_table("lexeme_cluster") nlp.vocab.lookups_extra.add_table("lexeme_prob") @@ -556,18 +441,36 @@ def update_meta(training, nlp, info): nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] +def load_from_paths(config): + # TODO: separate checks from loading + raw_text = util.ensure_path(config["training"]["raw_text"]) + if raw_text is not None: + if not raw_text.exists(): + msg.fail("Can't find raw text", raw_text, exits=1) + raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) + tag_map = {} + tag_map_path = util.ensure_path(config["training"]["tag_map"]) + if tag_map_path is not None: + if not tag_map_path.exists(): + msg.fail("Can't find tag map path", tag_map_path, exits=1) + tag_map = srsly.read_json(config["training"]["tag_map"]) + weights_data = None + init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) + if init_tok2vec is not None: + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + 
with init_tok2vec.open("rb") as file_: + weights_data = file_.read() + return raw_text, tag_map, weights_data + + def verify_cli_args( - train_path, - dev_path, - config_path, - output_path=None, - code_path=None, - init_tok2vec=None, - raw_text=None, - verbose=False, - use_gpu=-1, - tag_map_path=None, - omit_extra_lookups=False, + train_path: Path, + dev_path: Path, + config_path: Path, + output_path: Optional[Path] = None, + code_path: Optional[Path] = None, + verbose: bool = False, ): # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): @@ -591,12 +494,6 @@ def verify_cli_args( if code_path is not None: if not code_path.exists(): msg.fail("Path to Python code not found", code_path, exits=1) - try: - util.import_file("python_code", code_path) - except Exception as e: - msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) - if init_tok2vec is not None and not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) def verify_textcat_config(nlp, nlp_config): diff --git a/spacy/schemas.py b/spacy/schemas.py index c67814dfd..d8df33ed5 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,9 +1,10 @@ from typing import Dict, List, Union, Optional, Sequence, Any from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator -from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath +from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from pydantic import FilePath, DirectoryPath from collections import defaultdict -from thinc.api import Model +from thinc.api import Model, Optimizer from .attrs import NAMES @@ -173,41 +174,6 @@ class ModelMetaSchema(BaseModel): # JSON training format -class PipelineComponent(BaseModel): - factory: str - model: Model - - class Config: - arbitrary_types_allowed = True - - -class ConfigSchema(BaseModel): - optimizer: Optional["Optimizer"] - - class training(BaseModel): - patience: 
int = 10 - eval_frequency: int = 100 - dropout: float = 0.2 - init_tok2vec: Optional[FilePath] = None - max_epochs: int = 100 - orth_variant_level: float = 0.0 - gold_preproc: bool = False - max_length: int = 0 - use_gpu: int = 0 - scores: List[str] = ["ents_p", "ents_r", "ents_f"] - score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} - limit: int = 0 - batch_size: Union[Sequence[int], int] - - class nlp(BaseModel): - lang: str - vectors: Optional[str] - pipeline: Optional[Dict[str, PipelineComponent]] - - class Config: - extra = "allow" - - class TrainingSchema(BaseModel): # TODO: write @@ -216,6 +182,76 @@ class TrainingSchema(BaseModel): extra = "forbid" +# Config schema +# We're not setting any defaults here (which is too messy) and are making all +# fields required, so we can raise validation errors for missing values. To +# provide a default, we include a separate .cfg file with all values and +# check that against this schema in the test suite to make sure it's always +# up to date. 
+ + +class ConfigSchemaTraining(BaseModel): + # fmt: off + gold_preproc: StrictBool = Field(..., title="Whether to train on gold-standard sentences and tokens") + max_length: StrictInt = Field(..., title="Maximum length of examples (longer examples are divided into sentences if possible)") + limit: StrictInt = Field(..., title="Number of examples to use (0 for all)") + orth_variant_level: StrictFloat = Field(..., title="Orth variants for data augmentation") + dropout: StrictFloat = Field(..., title="Dropout rate") + patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") + max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") + max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for") + eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)") + seed: StrictInt = Field(..., title="Random seed") + accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") + use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch") + use_gpu: StrictInt = Field(..., title="GPU ID or -1 for CPU") + scores: List[StrictStr] = Field(..., title="Score types to be printed in overview") + score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model") + init_tok2vec: Optional[FilePath] = Field(..., title="Path to pretrained tok2vec weights") + discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size") + omit_extra_lookups: StrictBool = Field(..., title="Don't include extra lookups in model") + batch_by: StrictStr = Field(..., title="Batch examples by type") + raw_text: Optional[FilePath] = Field(..., title="Raw text") + tag_map: Optional[FilePath] = Field(..., title="Path to JSON-formatted tag map") + batch_size: Union[Sequence[int], int] = Field(..., title="The batch 
size or batch size schedule") + optimizer: Optimizer = Field(..., title="The optimizer to use") + # fmt: on + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + +class ConfigSchemaNlpComponent(BaseModel): + factory: StrictStr = Field(..., title="Component factory name") + model: Model = Field(..., title="Component model") + # TODO: add config schema / types for components so we can fill and validate + # component options like learn_tokens, min_action_freq etc. + + class Config: + extra = "allow" + arbitrary_types_allowed = True + + +class ConfigSchemaNlp(BaseModel): + lang: StrictStr = Field(..., title="The base language to use") + vectors: Optional[DirectoryPath] = Field(..., title="Path to vectors") + pipeline: Optional[Dict[str, ConfigSchemaNlpComponent]] + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + +class ConfigSchema(BaseModel): + training: ConfigSchemaTraining + nlp: ConfigSchemaNlp + + class Config: + extra = "allow" + arbitrary_types_allowed = True + + # Project config Schema diff --git a/spacy/util.py b/spacy/util.py index 4ed002f37..c91c2af25 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Union, Type, Dict, Any import os import importlib import importlib.util @@ -6,6 +6,8 @@ import re from pathlib import Path import thinc from thinc.api import NumpyOps, get_current_ops, Adam, Config +from thinc.config import EmptySchema +from pydantic import BaseModel import functools import itertools import numpy.random @@ -20,6 +22,7 @@ import subprocess from contextlib import contextmanager import tempfile import shutil +import hashlib import shlex try: @@ -326,20 +329,29 @@ def get_base_version(version): return Version(version).base_version -def load_config(path, create_objects=False): +def load_config( + path: Union[Path, str], + *, + create_objects: bool = False, + schema: Type[BaseModel] = EmptySchema, + validate: bool = True, +) -> Dict[str, 
Any]: """Load a Thinc-formatted config file, optionally filling in objects where the config references registry entries. See "Thinc config files" for details. path (str / Path): Path to the config file create_objects (bool): Whether to automatically create objects when the config references registry entries. Defaults to False. - + schema (BaseModel): Optional pydantic base schema to use for validation. RETURNS (dict): The objects from the config file. """ config = thinc.config.Config().from_disk(path) if create_objects: - return registry.make_from_config(config, validate=True) + return registry.make_from_config(config, validate=validate, schema=schema) else: + # Just fill config here so we can validate and fail early + if validate and schema: + registry.fill_config(config, validate=validate, schema=schema) return config