spaCy/spacy/training/initialize.py

277 lines
11 KiB
Python
Raw Normal View History

from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
2020-09-28 13:09:59 +00:00
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
import srsly
2020-09-29 08:58:50 +00:00
import numpy
import tarfile
import gzip
import zipfile
import tqdm
Support large/infinite training corpora (#7208) * Support infinite generators for training corpora Support a training corpus with an infinite generator in the `spacy train` training loop: * Revert `create_train_batches` to the state where an infinite generator can be used as the corpus in the first epoch of exactly one epoch without resulting in a memory leak (`max_epochs != 1` will still result in a memory leak) * Move the shuffling for the first epoch into the corpus reader, renaming it to `spacy.Corpus.v2`. * Switch to training option for shuffling in memory Training loop: * Add option `training.shuffle_train_corpus_in_memory` that controls whether the corpus is loaded in memory once and shuffled in the training loop * Revert changes to `create_train_batches` and rename to `create_train_batches_with_shuffling` for use with `spacy.Corpus.v1` and a corpus that should be loaded in memory * Add `create_train_batches_without_shuffling` for a corpus that should not be shuffled in the training loop: the corpus is merely batched during training Corpus readers: * Restore `spacy.Corpus.v1` * Add `spacy.ShuffledCorpus.v1` for a corpus shuffled in memory in the reader instead of the training loop * In combination with `shuffle_train_corpus_in_memory = False`, each epoch could result in a different augmentation * Refactor create_train_batches, validation * Rename config setting to `training.shuffle_train_corpus` * Refactor to use a single `create_train_batches` method with a `shuffle` option * Only validate `get_examples` in initialize step if: * labels are required * labels are not provided * Switch back to max_epochs=-1 for streaming train corpus * Use first 100 examples for stream train corpus init * Always check validate_get_examples in initialize
2021-04-08 08:08:04 +00:00
from itertools import islice
2020-09-28 13:09:59 +00:00
from .pretrain import get_tok2vec_ref
2020-09-28 13:09:59 +00:00
from ..lookups import Lookups
2020-09-29 08:58:50 +00:00
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..schemas import ConfigSchemaTraining
2020-09-29 14:08:39 +00:00
from ..util import registry, load_model_from_config, resolve_dot_names, logger
from ..util import load_model, ensure_path, get_sourced_components
from ..util import OOV_RANK, DEFAULT_OOV_PROB
2020-09-28 13:09:59 +00:00
if TYPE_CHECKING:
from ..language import Language # noqa: F401
2020-09-28 13:09:59 +00:00
2020-09-29 14:08:39 +00:00
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Create an nlp object from a training config and initialize its pipeline.

    config (Config): The training config. It is interpolated here; the
        uninterpolated original is what gets passed to load_model_from_config.
    use_gpu (int): The GPU allocator from [training] is only set when this
        value is >= 0 and an allocator is configured.
    RETURNS (Language): The initialized nlp object.
    """
    raw_config = config
    config = raw_config.interpolate()
    # Both settings have to be present, even if their value is None.
    required_settings = (
        ("seed", "[training] seed"),
        ("gpu_allocator", "[training] gpu_allocator"),
    )
    for key, label in required_settings:
        if key not in config["training"]:
            raise ValueError(Errors.E1015.format(value=label))
    seed = config["training"]["seed"]
    if seed is not None:
        fix_random_seed(seed)
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    corpus_keys = ("train_corpus", "dev_corpus")
    dot_names = [T[key] for key in corpus_keys]
    # The corpora have to be given as dot names (strings) so that they can
    # be resolved against the config below.
    for key, dot_name in zip(corpus_keys, dot_names):
        if not isinstance(dot_name, str):
            raise ConfigValidationError(
                desc=Errors.E897.format(
                    field=f"training.{key}", type=type(dot_name)
                )
            )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()

    def first_examples():
        # Only the first 100 examples of the (streamed) train corpus
        return islice(train_corpus(nlp), 100)

    def all_examples():
        return train_corpus(nlp)

    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        if T["max_epochs"] == -1:
            logger.debug(
                "Due to streamed train corpus, using only first 100 examples for "
                "initialization. If necessary, provide all labels in [initialize]. "
                "More info: https://spacy.io/api/cli#init_labels"
            )
            get_examples = first_examples
        else:
            get_examples = all_examples
        nlp.initialize(get_examples, sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        name_is_frozen = name in frozen_components
        # e.g. tok2vec/transformer
        for listener in getattr(proc, "listening_components", []):
            # Don't warn about components not in the pipeline
            if listener not in nlp.pipe_names:
                continue
            listener_is_frozen = listener in frozen_components
            if listener_is_frozen and not name_is_frozen:
                logger.warning(Warnings.W087.format(name=name, listener=listener))
            # We always check this regardless, in case user freezes tok2vec
            if name_is_frozen and not listener_is_frozen:
                logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> None:
    """Populate the vocab of an nlp object in place.

    The function modifies nlp.vocab and does not return anything (the return
    annotation previously claimed "Language", which no code path produced).

    nlp (Language): The nlp object whose vocab is updated.
    data (Optional[Path]): Location of a JSONL file with lexeme attributes.
        Every entry needs an "orth" key; entries that contain a "settings"
        key are skipped.
    lookups (Optional[Lookups]): Lookups to assign to nlp.vocab.lookups.
    vectors (Optional[str]): Name or path handed to load_vectors_into_model.
    """
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        # Reset the rank of all existing lexemes before applying the data
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        # The OOV probability is set just below the lowest known probability,
        # or to the default if the vocab is empty.
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    logger.info("Finished initializing nlp object")
2020-09-28 13:09:59 +00:00
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The nlp object that receives the vectors.
    name (Union[str, Path]): Passed to load_model to load the pipeline that
        provides the vectors.
    add_strings (bool): Whether to also add the strings of the vector keys
        to the string store of nlp.vocab.
    RAISES (ConfigValidationError): If loading the vectors pipeline fails
        config validation; re-raised with a vectors-specific title and hint.
    """
    try:
        source_nlp = load_model(name)
    except ConfigValidationError as e:
        raise ConfigValidationError.from_error(
            e,
            title=f"Config validation error for vectors {name}",
            desc=(
                "This typically means that there's a problem in the config.cfg included "
                "with the packaged vectors. Make sure that the vectors package you're "
                "loading is compatible with the current version of spaCy."
            ),
        ) from None
    source_vocab = source_nlp.vocab
    # Warn (but carry on) if the loaded pipeline has no vector keys at all
    if len(source_vocab.vectors.keys()) == 0:
        logger.warning(Warnings.W112.format(name=name))
    nlp.vocab.vectors = source_vocab.vectors
    if not add_strings:
        return
    # I guess we should add the strings from the vectors_nlp model?
    # E.g. if someone does a similarity query, they might expect the strings.
    for key in nlp.vocab.vectors.key2row:
        if key in source_vocab.strings:
            nlp.vocab.strings.add(source_vocab.strings[key])
def init_tok2vec(
    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
) -> bool:
    """Load pretrained tok2vec weights - cf. CLI command 'pretrain'.

    nlp (Language): The nlp object containing the layer to load weights into.
    pretrain_config (Dict[str, Any]): Passed to get_tok2vec_ref to find the
        target layer.
    init_config (Dict[str, Any]): Its "init_tok2vec" entry is the location of
        the weights file, or None.
    RETURNS (bool): True if weights were loaded, False if no location is set.
    RAISES (ConfigValidationError): If the location is set but doesn't exist.
    """
    weights_path = ensure_path(init_config["init_tok2vec"])
    if weights_path is None:
        return False
    if not weights_path.exists():
        msg = f"can't find pretrained tok2vec: {weights_path}"
        raise ConfigValidationError(
            config=nlp.config,
            errors=[{"loc": ["initialize", "init_tok2vec"], "msg": msg}],
        )
    with weights_path.open("rb") as file_:
        weights_data = file_.read()
    tok2vec_layer = get_tok2vec_ref(nlp, pretrain_config)
    tok2vec_layer.from_bytes(weights_data)
    logger.info(f"Loaded pretrained weights from {weights_path}")
    return True
2020-09-29 08:58:50 +00:00
def convert_vectors(
    nlp: "Language",
    vectors_loc: Optional[Path],
    *,
    truncate: int,
    prune: int,
    name: Optional[str] = None,
) -> None:
    """Read vectors from a file and install them on nlp.vocab in place.

    nlp (Language): The nlp object whose vocab receives the vectors.
    vectors_loc (Optional[Path]): Location of the vectors. A file name ending
        in ".npz" is loaded with numpy.load, anything else is parsed with
        read_vectors. If None, no vectors data is loaded.
    truncate (int): Passed to read_vectors as the row limit.
    prune (int): If >= 1, nlp.vocab.prune_vectors is called with this value.
    name (Optional[str]): Name for the vectors table. If None, a name is
        built from the "lang" and "name" entries of nlp.meta.
    """
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        # Open the file in a context manager so the handle is closed again
        # once the data has been handed to Vectors.
        with vectors_loc.open("rb") as file_:
            nlp.vocab.vectors = Vectors(data=numpy.load(file_))
        for lex in nlp.vocab:
            # Skip lexemes without a usable rank (0 or the OOV marker)
            if lex.rank and lex.rank != OOV_RANK:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            logger.info(f"Reading vectors from {vectors_loc}")
            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
            logger.info(f"Loaded vectors from {vectors_loc}")
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    # The result is discarded; presumably the lookup adds the
                    # word to the vocab as a side effect - TODO confirm
                    nlp.vocab[word]
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        # TODO: Is this correct? Does this matter?
        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune >= 1:
        nlp.vocab.prune_vectors(prune)
def read_vectors(vectors_loc: Path, truncate_vectors: int):
    """Parse a text vectors file into a data array and a list of keys.

    vectors_loc (Path): Location of the vectors file, read via ensure_shape.
    truncate_vectors (int): If >= 1, the array is allocated with this many
        rows and reading stops after that many lines.
    RETURNS (tuple): The float array of vectors and the list of words, in
        file order.
    RAISES (ValueError): If a line doesn't contain the expected number of
        values.
    """
    lines = ensure_shape(vectors_loc)
    # The first item is always the "rows width" header
    header = next(lines)
    shape = tuple(map(int, header.split()))
    if truncate_vectors >= 1:
        shape = (truncate_vectors, shape[1])
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    width = vectors_data.shape[1]
    last_row = truncate_vectors - 1
    vectors_keys = []
    for row, raw_line in enumerate(tqdm.tqdm(lines)):
        # Split from the right so that only the trailing values are split off
        # and everything before them is kept together as the word
        word, *values = raw_line.rstrip().rsplit(" ", width)
        if len(values) != width:
            raise ValueError(Errors.E094.format(line_num=row, loc=vectors_loc))
        vectors_data[row] = numpy.asarray(values, dtype="f")
        vectors_keys.append(word)
        if row == last_row:
            break
    return vectors_data, vectors_keys
def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz, .zip or unzipped files.

    For gzip and zip input, the returned object is a generator of decoded
    text lines that closes the underlying handles once it's exhausted or
    closed. For zip archives, only the first member is read.

    loc (Union[str, Path]): Location of the file to open.
    RETURNS: An open tar archive, a generator of utf8-decoded lines, or a
        text file handle, depending on the type of file.
    """
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        # NOTE(review): iterating over a TarFile yields archive members, not
        # lines of text - confirm that callers handle this case.
        return tarfile.open(str(loc), "r:gz")
    elif loc.parts[-1].endswith("gz"):
        # Open eagerly so that errors surface here rather than on first read
        return _decode_lines(gzip.open(str(loc), "r"))
    elif loc.parts[-1].endswith("zip"):
        zip_file = zipfile.ZipFile(str(loc))
        names = zip_file.namelist()
        file_ = zip_file.open(names[0])
        return _decode_lines(file_, zip_file)
    else:
        return loc.open("r", encoding="utf8")


def _decode_lines(file_, *owners):
    """Yield utf8-decoded lines from a binary file, then close it and its owners."""
    try:
        for line in file_:
            yield line.decode("utf8")
    finally:
        file_.close()
        for owner in owners:
            owner.close()
def ensure_shape(vectors_loc):
    """Yield the lines of a vectors file, always starting with a shape line.

    If the first line of the file consists only of integers, it's taken to be
    the shape and the file is passed through as it is. Otherwise the shape is
    worked out from the data and yielded first, followed by the whole file,
    so that the reader doesn't have to deal with the problem.
    """
    rows = open_file(vectors_loc)
    head = next(rows)
    try:
        for token in head.split():
            int(token)
    except ValueError:
        has_shape = False
    else:
        has_shape = True
    if has_shape:
        # All good, give the data
        yield head
        yield from rows
        return
    # Figure out the shape, make it the first value, and then give the
    # rest of the data. The first column of a row is the word itself.
    width = len(head.split()) - 1
    length = 1 + sum(1 for _ in rows)
    yield f"{length} {width}"
    # Reading the lines in again from file. This to avoid having to
    # store all the results in a list in memory
    yield from open_file(vectors_loc)