mirror of https://github.com/explosion/spaCy.git
Tidy up and remove raw text (rehearsal) for now
This commit is contained in:
parent
1590de11b1
commit
a5f2cc0509
|
@ -42,20 +42,6 @@ def init_pipeline_cli(
|
||||||
msg.good(f"Saved initialized pipeline to {output_path}")
|
msg.good(f"Saved initialized pipeline to {output_path}")
|
||||||
|
|
||||||
|
|
||||||
def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
    """Tell whether the pipeline saved under init_path has to be (re)built.

    The training config is loaded from config_path with the given overrides
    and compared, as a string, against the interpolated "config.cfg" stored
    inside init_path.

    init_path (Path): Directory expected to hold an initialized pipeline.
    config_path (Path): Path of the training config to load.
    overrides (Dict): Overrides passed on to util.load_config.
    RETURNS (bool): True if init_path or its "config.cfg" is missing, or if
        the two serialized configs differ; False if they are identical.
    """
    # Loaded first, so an unreadable training config surfaces before any
    # filesystem checks on the init directory.
    config = util.load_config(config_path, overrides=overrides)
    saved_cfg_path = init_path / "config.cfg"
    if not init_path.exists() or not saved_cfg_path.exists():
        return True
    init_cfg = util.load_config(saved_cfg_path, interpolate=True)
    return config.to_str() != init_cfg.to_str()
|
|
||||||
|
|
||||||
|
|
||||||
def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
|
def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
|
||||||
raw_config = config
|
raw_config = config
|
||||||
config = raw_config.interpolate()
|
config = raw_config.interpolate()
|
||||||
|
|
|
@ -10,13 +10,12 @@ import random
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from .init_pipeline import init_pipeline, must_initialize
|
from .init_pipeline import init_pipeline
|
||||||
from .init_pipeline import create_before_to_disk_callback
|
from .init_pipeline import create_before_to_disk_callback
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code
|
from ._util import import_code
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.example import Example
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..util import resolve_dot_names, registry
|
from ..util import resolve_dot_names, registry
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
|
@ -69,24 +68,39 @@ def train_cli(
|
||||||
def init_nlp(
    config: Config, output_path: Optional[Path], init_path: Optional[Path]
) -> Language:
    """Return an initialized pipeline for training, reusing a saved one if possible.

    Three cases are handled, in this order:
    1. init_path is given: the pipeline is loaded from it. If its config no
       longer matches the training config, a failure is reported through
       msg.fail(..., exits=1) (presumably terminating the process).
    2. output_path is given: "<output_path>/model-initial" is used as a
       cache. The pipeline is initialized and saved there if the directory
       does not exist, loaded from it otherwise, and re-initialized and
       saved again if the config has changed.
    3. Neither is given: a warning is printed and a freshly initialized
       pipeline is returned without being saved.

    config (Config): The training config.
    output_path (Optional[Path]): Training output directory, if any.
    init_path (Optional[Path]): Directory of a pre-initialized pipeline, if any.
    RETURNS (Language): The initialized nlp object.
    """
    if init_path is not None:
        nlp = util.load_model(init_path)
        if must_reinitialize(config, nlp.config):
            # A user-provided pipeline is never overwritten; the user has to
            # regenerate it explicitly.
            msg.fail(
                f"Config has changed: can't use initialized pipeline from "
                f"{init_path}. Please re-run 'spacy init nlp'.",
                exits=1,
            )
        msg.good(f"Loaded initialized pipeline from {init_path}")
        return nlp
    if output_path is not None:
        output_init_path = output_path / "model-initial"
        if not output_init_path.exists():
            msg.info(f"Initializing the pipeline in {output_init_path}")
            nlp = init_pipeline(config)
            nlp.to_disk(output_init_path)
            msg.good(f"Saved initialized pipeline to {output_init_path}")
        else:
            nlp = util.load_model(output_init_path)
            if must_reinitialize(config, nlp.config):
                # The cached pipeline is stale: rebuild it and refresh the cache.
                msg.warn("Config has changed: need to re-initialize pipeline")
                nlp = init_pipeline(config)
                nlp.to_disk(output_init_path)
                msg.good(f"Re-initialized pipeline in {output_init_path}")
            else:
                msg.good(f"Loaded initialized pipeline from {output_init_path}")
        return nlp
    msg.warn(
        "Not saving initialized model: no output directory specified. "
        "To speed up training, spaCy can save the initialized nlp object with "
        "the vocabulary, vectors and label scheme. To take advantage of this, "
        "provide an output directory or use the 'spacy init nlp' command."
    )
    return init_pipeline(config)
|
||||||
|
|
||||||
|
|
||||||
|
@ -101,8 +115,8 @@ def train(
|
||||||
if use_gpu >= 0 and allocator:
|
if use_gpu >= 0 and allocator:
|
||||||
set_gpu_allocator(allocator)
|
set_gpu_allocator(allocator)
|
||||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
|
dot_names = [T["train_corpus"], T["dev_corpus"]]
|
||||||
train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
|
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
||||||
optimizer = T["optimizer"]
|
optimizer = T["optimizer"]
|
||||||
score_weights = T["score_weights"]
|
score_weights = T["score_weights"]
|
||||||
batcher = T["batcher"]
|
batcher = T["batcher"]
|
||||||
|
@ -121,7 +135,6 @@ def train(
|
||||||
patience=T["patience"],
|
patience=T["patience"],
|
||||||
max_steps=T["max_steps"],
|
max_steps=T["max_steps"],
|
||||||
eval_frequency=T["eval_frequency"],
|
eval_frequency=T["eval_frequency"],
|
||||||
raw_text=raw_text,
|
|
||||||
exclude=frozen_components,
|
exclude=frozen_components,
|
||||||
)
|
)
|
||||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
msg.info(f"Pipeline: {nlp.pipe_names}")
|
||||||
|
@ -171,6 +184,11 @@ def train(
|
||||||
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
msg.good(f"Saved pipeline to output directory {final_model_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def must_reinitialize(train_config: Config, init_config: Config) -> bool:
    """Tell whether a saved pipeline is out of date for the training config.

    Both configs are interpolated and serialized to strings; any difference
    between the two strings counts as a change.

    train_config (Config): The config used for the current training run.
    init_config (Config): The config stored with the initialized pipeline.
    RETURNS (bool): True if the configs differ (the pipeline has to be
        re-initialized), False if they are identical.
    """
    # TODO: do this better and more fine-grained
    # Callers treat True as "config has changed", so the comparison must be
    # an inequality.
    return train_config.interpolate().to_str() != init_config.interpolate().to_str()
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp: Language, vectors: str) -> None:
|
def add_vectors(nlp: Language, vectors: str) -> None:
|
||||||
title = f"Config validation error for vectors {vectors}"
|
title = f"Config validation error for vectors {vectors}"
|
||||||
desc = (
|
desc = (
|
||||||
|
@ -235,7 +253,6 @@ def train_while_improving(
|
||||||
accumulate_gradient: int,
|
accumulate_gradient: int,
|
||||||
patience: int,
|
patience: int,
|
||||||
max_steps: int,
|
max_steps: int,
|
||||||
raw_text: List[Dict[str, str]],
|
|
||||||
exclude: List[str],
|
exclude: List[str],
|
||||||
):
|
):
|
||||||
"""Train until an evaluation stops improving. Works as a generator,
|
"""Train until an evaluation stops improving. Works as a generator,
|
||||||
|
@ -282,27 +299,14 @@ def train_while_improving(
|
||||||
dropouts = dropout
|
dropouts = dropout
|
||||||
results = []
|
results = []
|
||||||
losses = {}
|
losses = {}
|
||||||
if raw_text:
|
|
||||||
random.shuffle(raw_text)
|
|
||||||
raw_examples = [
|
|
||||||
Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
|
|
||||||
]
|
|
||||||
raw_batches = util.minibatch(raw_examples, size=8)
|
|
||||||
|
|
||||||
words_seen = 0
|
words_seen = 0
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
for step, (epoch, batch) in enumerate(train_data):
|
for step, (epoch, batch) in enumerate(train_data):
|
||||||
dropout = next(dropouts)
|
dropout = next(dropouts)
|
||||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||||
|
|
||||||
nlp.update(
|
nlp.update(
|
||||||
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
|
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
|
||||||
)
|
)
|
||||||
if raw_text:
|
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
|
||||||
# which use unlabelled data to reduce overfitting.
|
|
||||||
raw_batch = list(next(raw_batches))
|
|
||||||
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
|
|
||||||
# TODO: refactor this so we don't have to run it separately in here
|
# TODO: refactor this so we don't have to run it separately in here
|
||||||
for name, proc in nlp.pipeline:
|
for name, proc in nlp.pipeline:
|
||||||
if (
|
if (
|
||||||
|
@ -386,15 +390,6 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
|
||||||
def load_from_paths(
|
def load_from_paths(
|
||||||
config: Config,
|
config: Config,
|
||||||
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
|
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
|
||||||
import srsly
|
|
||||||
# TODO: separate checks from loading
|
|
||||||
raw_text = util.ensure_path(config["training"]["raw_text"])
|
|
||||||
if raw_text is not None:
|
|
||||||
if not raw_text.exists():
|
|
||||||
msg.fail("Can't find raw text", raw_text, exits=1)
|
|
||||||
raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
|
|
||||||
tag_map = {}
|
|
||||||
morph_rules = {}
|
|
||||||
weights_data = None
|
weights_data = None
|
||||||
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
|
init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
|
||||||
if init_tok2vec is not None:
|
if init_tok2vec is not None:
|
||||||
|
@ -402,4 +397,4 @@ def load_from_paths(
|
||||||
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
|
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
|
||||||
with init_tok2vec.open("rb") as file_:
|
with init_tok2vec.open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
return raw_text, tag_map, morph_rules, weights_data
|
return None, {}, {}, weights_data
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
[paths]
|
[paths]
|
||||||
train = ""
|
train = ""
|
||||||
dev = ""
|
dev = ""
|
||||||
raw_text = null
|
|
||||||
init_tok2vec = null
|
init_tok2vec = null
|
||||||
vocab_data = null
|
vocab_data = null
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue