Update config

Ines Montani 2020-09-28 12:05:23 +02:00
parent 9f6ad06452
commit 1590de11b1
5 changed files with 26 additions and 37 deletions


@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -130,20 +131,15 @@ def init_vocab(
 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():
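
Taken together, the two hunks above move all vocab-related resources (data, lookups, vectors, init_tok2vec) onto the resolved [initialize.vocab] block, while [training] keeps only the corpus dot-names. Below is a minimal sketch of the resulting resolution pattern; it assumes the helpers are importable from spacy.util and spacy.schemas as in the spaCy v3 code base, and the standalone wrapper function is purely illustrative.

from spacy.util import registry, resolve_dot_names
from spacy.schemas import ConfigSchemaTraining, ConfigSchemaInit

def resolve_init_settings(nlp):
    # Fill in defaults and interpolate ${...} variables, as in init_pipeline above
    config = nlp.config.interpolate()
    # [training] now only knows where the corpora live
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    # Everything vocab-related comes from [initialize.vocab]
    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    V = I["vocab"]
    return train_corpus, dev_corpus, V["data"], V["vectors"], V["init_tok2vec"]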


@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0

 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
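
The quickstart template now renders the vectors setting under [initialize.vocab] instead of [training]. A quick way to see what the new block expands to is to render just this snippet with jinja2; the template variables (use_transformer, optimize, word_vectors) come from the template above, while the vector package name and the plain jinja2.Template call are illustrative assumptions (the CLI renders the template with its own environment).

from jinja2 import Template

# Only the [initialize] part of the template, as added in the hunk above
snippet = """
[initialize]

[initialize.vocab]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
"""

# A CPU pipeline optimized for accuracy with word vectors keeps the vectors;
# any other combination renders vectors = null
print(Template(snippet).render(use_transformer=False, optimize="accuracy",
                               word_vectors="en_core_web_lg"))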


@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null

 [system]
 seed = 0
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}

 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
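
In the default config the same move shows up as two renamed/new [paths] variables (raw_text, vocab_data) and an [initialize.vocab] block that pulls from them, while [training] loses its copies. A minimal sketch of how the interpolation behaves, using thinc's Config and only the keys visible in the hunks above (the real default config contains many more settings):

from thinc.api import Config

cfg_str = """
[paths]
vocab_data = null
init_tok2vec = null

[initialize]

[initialize.vocab]
data = ${paths.vocab_data}
init_tok2vec = ${paths.init_tok2vec}
"""

# from_str interpolates the ${paths.*} references by default, so unset
# paths simply come through as None
config = Config().from_str(cfg_str)
print(config["initialize"]["vocab"]["data"])          # None
print(config["initialize"]["vocab"]["init_tok2vec"])  # None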


@@ -32,7 +32,7 @@ learn_rate = 0.001

 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0
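
The pretraining config only needs the rename: the JSONL corpus reader now points at ${paths.raw_text} rather than ${paths.raw}. A small sketch of the practical effect, assuming thinc's Config.from_str with an overrides dict (the file name raw_corpus.jsonl is just an illustrative value):

from thinc.api import Config

cfg_str = """
[paths]
raw_text = null

[corpora]

[corpora.pretrain]
@readers = "spacy.JsonlReader.v1"
path = ${paths.raw_text}
min_length = 5
max_length = 500
limit = 0
"""

# Overrides are applied before interpolation, so the new variable name is
# the one to set from now on (paths.raw_text, not paths.raw)
config = Config().from_str(cfg_str, overrides={"paths.raw_text": "raw_corpus.jsonl"})
print(config["corpora"]["pretrain"]["path"])  # raw_corpus.jsonl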


@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
 from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer
@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):

 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):

 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on

     class Config:
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]

     class Config:
         extra = "forbid"
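
On the schema side, ConfigSchemaTraining loses the vocab-related fields, ConfigSchemaInitVocab drops raw_text and tightens data to StrictStr, and because the models set extra = "forbid", stale keys now fail validation instead of being silently accepted. A cut-down pydantic (v1-style) sketch of that behaviour; it reproduces only a subset of the fields shown above and is not the real spaCy schema:

from typing import Optional
from pydantic import BaseModel, Field, StrictStr, ValidationError

class InitVocabSketch(BaseModel):
    # Subset of ConfigSchemaInitVocab after this commit
    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")

    class Config:
        extra = "forbid"

# raw_text is no longer a field, so extra = "forbid" rejects configs that still set it
try:
    InitVocabSketch(data=None, vectors=None, init_tok2vec=None, raw_text="corpus.jsonl")
except ValidationError as err:
    print(err)  # "extra fields not permitted"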