Fix logging

Ines Montani 2020-09-29 16:08:39 +02:00
parent 63d1598137
commit aa2a6882d0
2 changed files with 22 additions and 24 deletions


@@ -19,13 +19,18 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize.vocab] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
-    convert_vectors(
-        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
-    )
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "


@@ -2,7 +2,6 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
-from wasabi import Printer
 import srsly
 import numpy
 import tarfile
@@ -14,16 +13,15 @@ from .loop import create_before_to_disk_callback
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..schemas import ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
-    msg = Printer(no_print=silent)
+def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
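
The import change is the heart of the refactor: instead of each init function constructing its own wasabi Printer and threading a silent flag through to it, the module imports the single logger that spaCy's util module already defines. Because loggers are process-global and keyed by name, every importer shares one level setting, which is what lets the CLI's --verbose flag control output everywhere. A sketch of that layout (module split hypothetical):

    import logging

    # in util.py: define the shared logger once
    logger = logging.getLogger("spacy")

    # in initialize.py: import it alongside the other helpers, e.g.
    # from ..util import registry, load_model_from_config, resolve_dot_names, logger
    def init_nlp_demo() -> None:
        logger.info("Set up nlp object from config")
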
@@ -34,7 +32,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
+    logger.info("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -46,14 +44,14 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
+            logger.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    msg.good("Initialized pipeline components")
+    logger.info("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
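
One wrinkle: logging.Logger has no good() method (that is wasabi Printer API), so the old msg.good(...) success messages map onto logger.info(...), the closest standard severity. If a distinct "good" level were ever wanted, the standard library supports registering one; a sketch using only documented logging hooks (the GOOD name and number are illustrative, nothing spaCy defines):

    import logging

    GOOD = 25  # between INFO (20) and WARNING (30)
    logging.addLevelName(GOOD, "GOOD")

    logger = logging.getLogger("demo")
    logging.basicConfig(format="%(levelname)s: %(message)s")
    logger.setLevel(GOOD)
    logger.log(GOOD, "Initialized pipeline components")  # GOOD: Initialized pipeline components
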
@@ -72,12 +70,10 @@ def init_vocab(
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    silent: bool = True,
 ) -> "Language":
-    msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -93,11 +89,11 @@ def init_vocab(
     else:
         oov_prob = DEFAULT_OOV_PROB
     nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
+    logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
+        logger.info(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
@@ -209,9 +205,7 @@ def convert_vectors(
     truncate: int,
     prune: int,
     name: Optional[str] = None,
-    silent: bool = True,
 ) -> None:
-    msg = Printer(no_print=silent)
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -220,9 +214,9 @@ def convert_vectors(
             nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
-            msg.good(f"Loaded vectors from {vectors_loc}")
+            logger.info(f"Reading vectors from {vectors_loc}")
+            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            logger.info(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
     if vector_keys is not None:
@@ -239,7 +233,6 @@ def convert_vectors(
     nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1:
         nlp.vocab.prune_vectors(prune)
-    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 
 
 def read_vectors(vectors_loc: Path, truncate_vectors: int):
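
With the silent parameters gone, callers control chattiness by adjusting the shared logger rather than passing a flag into every function; the one user-facing success message ("Successfully converted ...") moves out of convert_vectors and into the CLI command in the first file, keeping wasabi output in the command layer and plain log records in the library. A sketch of the two caller styles, assuming only the stdlib (convert_vectors_demo is hypothetical):

    import logging

    logger = logging.getLogger("demo")
    logging.basicConfig(format="%(message)s")

    def convert_vectors_demo() -> None:
        logger.info("Reading vectors from vectors.txt")

    logger.setLevel(logging.INFO)    # verbose caller: the message appears
    convert_vectors_demo()
    logger.setLevel(logging.ERROR)   # quiet caller: info is suppressed
    convert_vectors_demo()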