mirror of https://github.com/explosion/spaCy.git
Update config
parent 9f6ad06452
commit 1590de11b1

@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training

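For orientation, a minimal sketch (not spaCy source) of the block the updated code reads from: vocab data, lookups and vectors now come out of the resolved [initialize.vocab] section rather than [training]. The config string below is a trimmed, illustrative stand-in for the real defaults.

from thinc.api import Config

# Trimmed, illustrative config: only the block that init_pipeline now consults.
cfg = Config().from_str("""
[initialize]

[initialize.vocab]
data = null
lookups = null
vectors = null
""")
V = cfg["initialize"]["vocab"]
print(V["data"], V["lookups"], V["vectors"])  # -> None None None
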
@@ -130,20 +131,15 @@ def init_vocab(
 
 
 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():

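The interesting part of this hunk is the guard: the vectors check now looks at the vocab block that is passed in directly. A standalone restatement with a hypothetical helper name (not spaCy code):

from typing import Any, Dict, Optional

def check_pretrain_vectors(pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]) -> Optional[str]:
    # A "vectors" pretraining objective only makes sense if vectors are
    # configured for the vocab; otherwise report an error message.
    if pretrain_config["objective"].get("type") == "vectors" and not vocab_config.get("vectors"):
        return "Need initialize.vectors if pretraining.objective.type is vectors"
    return None

print(check_pretrain_vectors({"objective": {"type": "vectors"}}, {"vectors": None}))
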
@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0
 
 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}

@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}

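Restated in plain Python with a hypothetical helper, the quickstart template now only fills in vectors under [initialize.vocab]: transformer pipelines, efficiency-optimized pipelines and setups without word vectors all get null.

from typing import Optional

def vocab_vectors(use_transformer: bool, optimize: str, word_vectors: Optional[str]) -> Optional[str]:
    # Mirrors the Jinja condition above; returns the value for initialize.vocab.vectors.
    if use_transformer or optimize == "efficiency" or not word_vectors:
        return None
    return word_vectors

print(vocab_vectors(False, "accuracy", "en_core_web_lg"))  # -> en_core_web_lg
print(vocab_vectors(True, "accuracy", "en_core_web_lg"))   # -> None
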
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null
 
 [system]
 seed = 0

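The new paths.raw_text and paths.vocab_data entries are ordinary interpolation variables, so other sections can reference them with ${...}. A minimal sketch using thinc's Config (values are placeholders):

from thinc.api import Config

cfg_str = """
[paths]
raw_text = null
vocab_data = "my_vocab.jsonl"

[initialize]

[initialize.vocab]
data = ${paths.vocab_data}
"""
# interpolate() substitutes ${section.key} references with their values.
cfg = Config().from_str(cfg_str).interpolate()
print(cfg["initialize"]["vocab"]["data"])  # -> my_vocab.jsonl
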
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0

@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}
 
 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}

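Taken together, the two default_config.cfg hunks above move the vocab-related settings out of [training] and into [initialize.vocab], wire data to the new paths.vocab_data variable, and drop raw_text from the vocab block (raw text is now referenced via paths.raw_text, see the [corpora.pretrain] hunk below). A plain-Python summary of what moved, for illustration only:

removed_from_training = {"init_tok2vec", "raw_text", "vectors", "lookups"}
initialize_vocab_keys = {"data", "lookups", "vectors", "init_tok2vec"}
print(sorted(removed_from_training & initialize_vocab_keys))  # settings that simply moved
print(sorted(removed_from_training - initialize_vocab_keys))  # raw_text is handled via paths.raw_text
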
@@ -32,7 +32,7 @@ learn_rate = 0.001
 
 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0

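The pretraining corpus keeps the same JSONL reader; only the variable it points at changes from paths.raw to paths.raw_text. For reference, a sketch of the expected file format, typically one JSON object with a "text" field per line, written and read with srsly, the JSONL helper spaCy itself uses (file name and contents are placeholders):

import srsly

srsly.write_jsonl("raw_text.jsonl", [{"text": "First raw sentence."}, {"text": "Second one."}])
for record in srsly.read_jsonl("raw_text.jsonl"):
    print(record["text"])
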
@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import root_validator
from thinc.config import Promise
from collections import defaultdict
from thinc.api import Optimizer

@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):
 
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")

@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")

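spaCy's config schemas reject unknown keys, so removing vectors, lookups, init_tok2vec and raw_text from ConfigSchemaTraining means a config that still sets them under [training] now fails validation instead of being silently accepted. A toy illustration in pydantic v1 style, matching the schemas in this diff (the real schema has many more fields):

from pydantic import BaseModel, Field, StrictStr, ValidationError

class TrainingBlock(BaseModel):
    dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
    train_corpus: StrictStr = Field(..., title="Path in the config to the training data")

    class Config:
        extra = "forbid"

TrainingBlock(dev_corpus="corpora.dev", train_corpus="corpora.train")  # ok
try:
    TrainingBlock(dev_corpus="corpora.dev", train_corpus="corpora.train", vectors=None)
except ValidationError as err:
    print(err.errors()[0]["msg"])  # extra fields not permitted
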
@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):
 
 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on
 
     class Config:

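The substantive change in ConfigSchemaInitVocab is data: Optional[str] becoming Optional[StrictStr]. StrictStr refuses values that pydantic would otherwise coerce to strings; a quick toy illustration (not the real schema):

from typing import Optional
from pydantic import BaseModel, StrictStr, ValidationError

class VocabBlock(BaseModel):
    data: Optional[StrictStr] = None

print(VocabBlock(data="vocab.jsonl").data)  # fine, already a string
try:
    VocabBlock(data=123)  # a plain str field would coerce this to "123"
except ValidationError:
    print("rejected: StrictStr does not coerce non-string values")
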
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]
 
     class Config:
         extra = "forbid"

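ConfigSchemaInit ties the pieces together: the [initialize.vocab] block is validated by its own sub-model, and component blocks are keyed by strict strings. A compressed sketch of that nesting with toy classes (not the real schemas):

from typing import Any, Dict, Optional
from pydantic import BaseModel, StrictStr

class InitVocab(BaseModel):
    data: Optional[StrictStr] = None
    vectors: Optional[StrictStr] = None

class Init(BaseModel):
    vocab: InitVocab
    components: Dict[StrictStr, Any] = {}

    class Config:
        extra = "forbid"

init = Init(vocab={"data": "vocab.jsonl"}, components={"ner": {"labels": []}})
print(init.vocab.data, list(init.components))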