diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 5ca565d88..78d828719 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -130,20 +131,15 @@ def init_vocab(


 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9a8b9d1d7..5e990611e 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0

 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 0ab27f499..083b6a702 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null

 [system]
 seed = 0
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}

 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index bbd595308..122a7803a 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -32,7 +32,7 @@ learn_rate = 0.001

 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 6553892d3..b98498b8b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
 from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer
@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):

 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):

 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on

     class Config:
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]

     class Config:
         extra = "forbid"
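
Note for anyone updating an existing config by hand: after this change the vocab resources live under [initialize.vocab] rather than [training], and paths.raw is renamed to paths.raw_text. A minimal sketch of the relevant sections as a user config might now look (the "en_core_web_lg" vectors value is only a placeholder for whatever vectors source you use):

    [paths]
    raw_text = null
    init_tok2vec = null
    vocab_data = null

    [initialize.vocab]
    data = ${paths.vocab_data}
    lookups = null
    vectors = "en_core_web_lg"
    init_tok2vec = ${paths.init_tok2vec}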