Update config

Ines Montani 2020-09-28 12:05:23 +02:00
parent 9f6ad06452
commit 1590de11b1
5 changed files with 26 additions and 37 deletions


@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -130,20 +131,15 @@ def init_vocab(
 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():
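
Taken together, the two hunks above move all vocab-related resources (data, lookups, vectors, init_tok2vec) onto the resolved [initialize.vocab] block, while [training] keeps only the corpus dot-names. Below is a minimal sketch of the resulting resolution pattern; it assumes the helpers are importable from spacy.util and spacy.schemas as in the spaCy v3 code base, and the standalone wrapper function is purely illustrative.

from spacy.util import registry, resolve_dot_names
from spacy.schemas import ConfigSchemaTraining, ConfigSchemaInit

def resolve_init_settings(nlp):
    # Fill in defaults and interpolate ${...} variables, as in init_pipeline above
    config = nlp.config.interpolate()
    # [training] now only knows where the corpora live
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    # Everything vocab-related comes from [initialize.vocab]
    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    V = I["vocab"]
    return train_corpus, dev_corpus, V["data"], V["vectors"], V["init_tok2vec"]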


@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0

 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
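
The quickstart template now renders the vectors setting under [initialize.vocab] instead of [training]. A quick way to see what the new block expands to is to render just this snippet with jinja2; the template variables (use_transformer, optimize, word_vectors) come from the template above, while the vector package name and the plain jinja2.Template call are illustrative assumptions (the CLI renders the template with its own environment).

from jinja2 import Template

# Only the [initialize] part of the template, as added in the hunk above
snippet = """
[initialize]

[initialize.vocab]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
"""

# A CPU pipeline optimized for accuracy with word vectors keeps the vectors;
# any other combination renders vectors = null
print(Template(snippet).render(use_transformer=False, optimize="accuracy",
                               word_vectors="en_core_web_lg"))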


@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null

 [system]
 seed = 0
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}

 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
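
In the default config the same move shows up as two renamed/new [paths] variables (raw_text, vocab_data) and an [initialize.vocab] block that pulls from them, while [training] loses its copies. A minimal sketch of how the interpolation behaves, using thinc's Config and only the keys visible in the hunks above (the real default config contains many more settings):

from thinc.api import Config

cfg_str = """
[paths]
vocab_data = null
init_tok2vec = null

[initialize]

[initialize.vocab]
data = ${paths.vocab_data}
init_tok2vec = ${paths.init_tok2vec}
"""

# from_str interpolates the ${paths.*} references by default, so unset
# paths simply come through as None
config = Config().from_str(cfg_str)
print(config["initialize"]["vocab"]["data"])          # None
print(config["initialize"]["vocab"]["init_tok2vec"])  # None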


@@ -32,7 +32,7 @@ learn_rate = 0.001

 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0
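
The pretraining config only needs the rename: the JSONL corpus reader now points at ${paths.raw_text} rather than ${paths.raw}. A small sketch of the practical effect, assuming thinc's Config.from_str with an overrides dict (the file name raw_corpus.jsonl is just an illustrative value):

from thinc.api import Config

cfg_str = """
[paths]
raw_text = null

[corpora]

[corpora.pretrain]
@readers = "spacy.JsonlReader.v1"
path = ${paths.raw_text}
min_length = 5
max_length = 500
limit = 0
"""

# Overrides are applied before interpolation, so the new variable name is
# the one to set from now on (paths.raw_text, not paths.raw)
config = Config().from_str(cfg_str, overrides={"paths.raw_text": "raw_corpus.jsonl"})
print(config["corpora"]["pretrain"]["path"])  # raw_corpus.jsonl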


@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
 from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer
@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):

 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):

 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on

     class Config:
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]

     class Config:
         extra = "forbid"
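
On the schema side, ConfigSchemaTraining loses the vocab-related fields, ConfigSchemaInitVocab drops raw_text and tightens data to StrictStr, and because the models set extra = "forbid", stale keys now fail validation instead of being silently accepted. A cut-down pydantic (v1-style) sketch of that behaviour; it reproduces only a subset of the fields shown above and is not the real spaCy schema:

from typing import Optional
from pydantic import BaseModel, Field, StrictStr, ValidationError

class InitVocabSketch(BaseModel):
    # Subset of ConfigSchemaInitVocab after this commit
    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")

    class Config:
        extra = "forbid"

# raw_text is no longer a field, so extra = "forbid" rejects configs that still set it
try:
    InitVocabSketch(data=None, vectors=None, init_tok2vec=None, raw_text="corpus.jsonl")
except ValidationError as err:
    print(err)  # "extra fields not permitted"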