From a1c5b694be117ac92e21f9860309821ad6da06f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jun 2020 02:22:13 +0200 Subject: [PATCH] Small fixes to train defaults --- spacy/cli/train_from_config.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index ec099b294..f24feffab 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -156,17 +156,18 @@ def train_cli( msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) - if output_path is not None and not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty.", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) + if output_path is not None: + if not output_path.exists(): + output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) tag_map = {} @@ -210,7 +211,8 @@ def train( # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) - if config["training"]["use_pytorch_for_gpu_memory"]: + if config["training"].get("use_pytorch_for_gpu_memory"): + # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) @@ -374,7 +376,7 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=cfg["noise_level"], + noise_level=0.0, # I think this is deprecated? orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"],