diff --git a/spacy/errors.py b/spacy/errors.py
index 9d9a716d2..bf3628ce9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,10 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E901 = ("Failed to remove existing output directory: {path}. If your "
+            "config and the components you train change between runs, a "
+            "non-empty output directory can lead to stale pipeline data. To "
+            "solve this, remove the existing directories in the output directory.")
     E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0d4414964..67f61567e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+from wasabi import Printer
 import random
-import wasabi
 import sys
+import shutil
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
+DIR_MODEL_BEST = "model-best"
+DIR_MODEL_LAST = "model-last"
+
+
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -38,7 +43,7 @@ def train(
     RETURNS (Path / None): The path to the final exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
-    msg = wasabi.Printer(no_print=True)
+    msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -69,6 +74,7 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
+    clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
     if frozen_components:
         stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
@@ -83,7 +89,7 @@ def train(
                     update_meta(T, nlp, info)
                 with nlp.use_params(optimizer.averages):
                     nlp = before_to_disk(nlp)
-                    nlp.to_disk(output_path / "model-best")
+                    nlp.to_disk(output_path / DIR_MODEL_BEST)
     except Exception as e:
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
@@ -100,7 +106,7 @@ def train(
     finally:
         finalize_logger()
         if output_path is not None:
-            final_model_path = output_path / "model-last"
+            final_model_path = output_path / DIR_MODEL_LAST
             if optimizer.averages:
                 with nlp.use_params(optimizer.averages):
                     nlp.to_disk(final_model_path)
@@ -305,3 +311,32 @@ def create_before_to_disk_callback(
         return modified_nlp
 
     return before_to_disk
+
+
+def clean_output_dir(path: Optional[Union[str, Path]]) -> None:
+    """Remove stale model directories from an existing output directory.
+
+    Only the model-best and model-last subdirectories are removed. This
+    ensures that they aren't just being overwritten by nlp.to_disk, which
+    could preserve existing subdirectories (e.g. components that don't exist
+    anymore).
+
+    path (Optional[Union[str, Path]]): The output directory. Nothing is
+        removed if it's None or doesn't exist.
+    RAISES (IOError): If an existing model directory can't be removed.
+    """
+    if path is None:
+        return
+    # Coerce to Path so that plain strings work as the annotation promises.
+    path = Path(path)
+    if not path.exists():
+        return
+    for subdir in (path / DIR_MODEL_BEST, path / DIR_MODEL_LAST):
+        if not subdir.exists():
+            continue
+        try:
+            shutil.rmtree(str(subdir))
+        except OSError as e:
+            raise IOError(Errors.E901.format(path=path)) from e
+        # Log outside the try block so only the removal itself is guarded.
+        logger.debug(f"Removed existing output directory: {subdir}")