mirror of https://github.com/explosion/spaCy.git
Fix logging
This commit is contained in:
parent
63d1598137
commit
aa2a6882d0
|
@ -19,13 +19,18 @@ def init_vectors_cli(
|
||||||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
|
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||||
|
you can use in the [initialize.vocab] block of your config to initialize
|
||||||
|
a model with vectors.
|
||||||
|
"""
|
||||||
|
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
|
||||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
convert_vectors(
|
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
||||||
nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
)
|
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
msg.good(
|
msg.good(
|
||||||
"Saved nlp object with vectors to output directory. You can now use the "
|
"Saved nlp object with vectors to output directory. You can now use the "
|
||||||
|
|
|
@ -2,7 +2,6 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
|
||||||
from thinc.api import Config, fix_random_seed, set_gpu_allocator
|
from thinc.api import Config, fix_random_seed, set_gpu_allocator
|
||||||
from thinc.api import ConfigValidationError
|
from thinc.api import ConfigValidationError
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
|
||||||
import srsly
|
import srsly
|
||||||
import numpy
|
import numpy
|
||||||
import tarfile
|
import tarfile
|
||||||
|
@ -14,16 +13,15 @@ from .loop import create_before_to_disk_callback
|
||||||
from ..lookups import Lookups
|
from ..lookups import Lookups
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..util import registry, load_model_from_config, resolve_dot_names
|
from ..util import registry, load_model_from_config, resolve_dot_names, logger
|
||||||
from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
|
from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from ..language import Language # noqa: F401
|
from ..language import Language # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
|
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
||||||
msg = Printer(no_print=silent)
|
|
||||||
raw_config = config
|
raw_config = config
|
||||||
config = raw_config.interpolate()
|
config = raw_config.interpolate()
|
||||||
if config["training"]["seed"] is not None:
|
if config["training"]["seed"] is not None:
|
||||||
|
@ -34,7 +32,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
|
||||||
# Use original config here before it's resolved to functions
|
# Use original config here before it's resolved to functions
|
||||||
sourced_components = get_sourced_components(config)
|
sourced_components = get_sourced_components(config)
|
||||||
nlp = load_model_from_config(raw_config, auto_fill=True)
|
nlp = load_model_from_config(raw_config, auto_fill=True)
|
||||||
msg.good("Set up nlp object from config")
|
logger.info("Set up nlp object from config")
|
||||||
config = nlp.config.interpolate()
|
config = nlp.config.interpolate()
|
||||||
# Resolve all training-relevant sections using the filled nlp config
|
# Resolve all training-relevant sections using the filled nlp config
|
||||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||||
|
@ -46,14 +44,14 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
|
||||||
frozen_components = T["frozen_components"]
|
frozen_components = T["frozen_components"]
|
||||||
# Sourced components that require resume_training
|
# Sourced components that require resume_training
|
||||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
resume_components = [p for p in sourced_components if p not in frozen_components]
|
||||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||||
if resume_components:
|
if resume_components:
|
||||||
with nlp.select_pipes(enable=resume_components):
|
with nlp.select_pipes(enable=resume_components):
|
||||||
msg.info(f"Resuming training for: {resume_components}")
|
logger.info(f"Resuming training for: {resume_components}")
|
||||||
nlp.resume_training(sgd=optimizer)
|
nlp.resume_training(sgd=optimizer)
|
||||||
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
||||||
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||||
msg.good("Initialized pipeline components")
|
logger.good("Initialized pipeline components")
|
||||||
# Verify the config after calling 'initialize' to ensure labels
|
# Verify the config after calling 'initialize' to ensure labels
|
||||||
# are properly initialized
|
# are properly initialized
|
||||||
verify_config(nlp)
|
verify_config(nlp)
|
||||||
|
@ -72,12 +70,10 @@ def init_vocab(
|
||||||
data: Optional[Path] = None,
|
data: Optional[Path] = None,
|
||||||
lookups: Optional[Lookups] = None,
|
lookups: Optional[Lookups] = None,
|
||||||
vectors: Optional[str] = None,
|
vectors: Optional[str] = None,
|
||||||
silent: bool = True,
|
|
||||||
) -> "Language":
|
) -> "Language":
|
||||||
msg = Printer(no_print=silent)
|
|
||||||
if lookups:
|
if lookups:
|
||||||
nlp.vocab.lookups = lookups
|
nlp.vocab.lookups = lookups
|
||||||
msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
||||||
data_path = ensure_path(data)
|
data_path = ensure_path(data)
|
||||||
if data_path is not None:
|
if data_path is not None:
|
||||||
lex_attrs = srsly.read_jsonl(data_path)
|
lex_attrs = srsly.read_jsonl(data_path)
|
||||||
|
@ -93,11 +89,11 @@ def init_vocab(
|
||||||
else:
|
else:
|
||||||
oov_prob = DEFAULT_OOV_PROB
|
oov_prob = DEFAULT_OOV_PROB
|
||||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||||
msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
||||||
msg.good("Created vocabulary")
|
logger.good("Created vocabulary")
|
||||||
if vectors is not None:
|
if vectors is not None:
|
||||||
load_vectors_into_model(nlp, vectors)
|
load_vectors_into_model(nlp, vectors)
|
||||||
msg.good(f"Added vectors: {vectors}")
|
logger.good(f"Added vectors: {vectors}")
|
||||||
|
|
||||||
|
|
||||||
def load_vectors_into_model(
|
def load_vectors_into_model(
|
||||||
|
@ -209,9 +205,7 @@ def convert_vectors(
|
||||||
truncate: int,
|
truncate: int,
|
||||||
prune: int,
|
prune: int,
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
silent: bool = True,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
msg = Printer(no_print=silent)
|
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||||
|
@ -220,9 +214,9 @@ def convert_vectors(
|
||||||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||||
else:
|
else:
|
||||||
if vectors_loc:
|
if vectors_loc:
|
||||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
logger.info(f"Reading vectors from {vectors_loc}")
|
||||||
vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
|
vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
|
||||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
logger.info(f"Loaded vectors from {vectors_loc}")
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = (None, None)
|
vectors_data, vector_keys = (None, None)
|
||||||
if vector_keys is not None:
|
if vector_keys is not None:
|
||||||
|
@ -239,7 +233,6 @@ def convert_vectors(
|
||||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||||
if prune >= 1:
|
if prune >= 1:
|
||||||
nlp.vocab.prune_vectors(prune)
|
nlp.vocab.prune_vectors(prune)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc: Path, truncate_vectors: int):
|
def read_vectors(vectors_loc: Path, truncate_vectors: int):
|
||||||
|
|
Loading…
Reference in New Issue