From 6723d76f24a55f24ef1632ac8be46567a984d0ef Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 29 Aug 2022 10:23:05 +0200 Subject: [PATCH] Add ConsoleLogger.v2 (#11214) * Init * Change logger to ConsoleLogger.v2 * adjust naming * More naming adjustments * Fix output_file reference error * ignore type * Add basic test for logger * Hopefully fix mypy issue * mypy ignore line * Update mypy line Co-authored-by: Adriane Boyd * Update test method name Co-authored-by: Adriane Boyd * Change file saving logic * Fix finalize method * increase spacy-legacy version in requirements * Update docs * small adjustments Co-authored-by: Adriane Boyd --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/tests/training/test_logger.py | 30 ++++++++ spacy/training/loggers.py | 102 +++++++++++++++++++++------- website/docs/api/legacy.md | 53 +++++++++++++++ website/docs/api/top-level.md | 57 +++++++++------- 6 files changed, 198 insertions(+), 48 deletions(-) create mode 100644 spacy/tests/training/test_logger.py diff --git a/requirements.txt b/requirements.txt index 437dd415a..3b8d66e0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.9,<3.1.0 +spacy-legacy>=3.0.10,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.cfg b/setup.cfg index bf4890a68..5fd820a96 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ setup_requires = thinc>=8.1.0,<8.2.0 install_requires = # Our libraries - spacy-legacy>=3.0.9,<3.1.0 + spacy-legacy>=3.0.10,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 diff --git a/spacy/tests/training/test_logger.py b/spacy/tests/training/test_logger.py new file mode 100644 index 000000000..0dfd0cbf4 --- /dev/null +++ b/spacy/tests/training/test_logger.py @@ -0,0 +1,30 @@ +import pytest +import spacy + +from spacy.training import loggers + + +@pytest.fixture() +def nlp(): + nlp = spacy.blank("en") + nlp.add_pipe("ner") + return nlp + + +@pytest.fixture() +def info(): + return { + "losses": {"ner": 100}, + "other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80}, + "epoch": 100, + "step": 125, + "score": 85, + } + + +def test_console_logger(nlp, info): + console_logger = loggers.console_logger( + progress_bar=True, console_output=True, output_file=None + ) + log_step, finalize = console_logger(nlp) + log_step(info) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index edd0f1959..408ea7140 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -1,10 +1,13 @@ -from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO +from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union from wasabi import Printer +from pathlib import Path import tqdm import sys +import srsly from ..util import registry from ..errors import Errors +from .. import util if TYPE_CHECKING: from ..language import Language # noqa: F401 @@ -23,13 +26,44 @@ def setup_table( return final_cols, final_widths, ["r" for _ in final_widths] -@registry.loggers("spacy.ConsoleLogger.v1") -def console_logger(progress_bar: bool = False): +@registry.loggers("spacy.ConsoleLogger.v2") +def console_logger( + progress_bar: bool = False, + console_output: bool = True, + output_file: Optional[Union[str, Path]] = None, +): + """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file. + progress_bar (bool): Whether the logger should print the progress bar. + console_output (bool): Whether the logger should print the logs on the console. + output_file (Optional[Union[str, Path]]): The file to save the training logs to. + """ + _log_exist = False + if output_file: + output_file = util.ensure_path(output_file) # type: ignore + if output_file.exists(): # type: ignore + _log_exist = True + if not output_file.parents[0].exists(): # type: ignore + output_file.parents[0].mkdir(parents=True) # type: ignore + def setup_printer( nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: write = lambda text: print(text, file=stdout, flush=True) msg = Printer(no_print=True) + + nonlocal output_file + output_stream = None + if _log_exist: + write( + msg.warn( + f"Saving logs is disabled because {output_file} already exists." + ) + ) + output_file = None + elif output_file: + write(msg.info(f"Saving results to {output_file}")) + output_stream = open(output_file, "w", encoding="utf-8") + # ensure that only trainable components are logged logged_pipes = [ name @@ -40,13 +74,15 @@ def console_logger(progress_bar: bool = False): score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] - spacing = 2 - table_header, table_widths, table_aligns = setup_table( - cols=["E", "#"] + loss_cols + score_cols + ["Score"], - widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6], - ) - write(msg.row(table_header, widths=table_widths, spacing=spacing)) - write(msg.row(["-" * width for width in table_widths], spacing=spacing)) + + if console_output: + spacing = 2 + table_header, table_widths, table_aligns = setup_table( + cols=["E", "#"] + loss_cols + score_cols + ["Score"], + widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6], + ) + write(msg.row(table_header, widths=table_widths, spacing=spacing)) + write(msg.row(["-" * width for width in table_widths], spacing=spacing)) progress = None def log_step(info: Optional[Dict[str, Any]]) -> None: @@ -57,12 +93,15 @@ def console_logger(progress_bar: bool = False): if progress is not None: progress.update(1) return - losses = [ - "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in logged_pipes - ] + + losses = [] + log_losses = {} + for pipe_name in logged_pipes: + losses.append("{0:.2f}".format(float(info["losses"][pipe_name]))) + log_losses[pipe_name] = float(info["losses"][pipe_name]) scores = [] + log_scores = {} for col in score_cols: score = info["other_scores"].get(col, 0.0) try: @@ -73,6 +112,7 @@ def console_logger(progress_bar: bool = False): if col != "speed": score *= 100 scores.append("{0:.2f}".format(score)) + log_scores[str(col)] = score data = ( [info["epoch"], info["step"]] @@ -80,20 +120,36 @@ def console_logger(progress_bar: bool = False): + scores + ["{0:.2f}".format(float(info["score"]))] ) + + if output_stream: + # Write to log file per log_step + log_data = { + "epoch": info["epoch"], + "step": info["step"], + "losses": log_losses, + "scores": log_scores, + "score": float(info["score"]), + } + output_stream.write(srsly.json_dumps(log_data) + "\n") + if progress is not None: progress.close() - write( - msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing) - ) - if progress_bar: - # Set disable=None, so that it disables on non-TTY - progress = tqdm.tqdm( - total=eval_frequency, disable=None, leave=False, file=stderr + if console_output: + write( + msg.row( + data, widths=table_widths, aligns=table_aligns, spacing=spacing + ) ) - progress.set_description(f"Epoch {info['epoch']+1}") + if progress_bar: + # Set disable=None, so that it disables on non-TTY + progress = tqdm.tqdm( + total=eval_frequency, disable=None, leave=False, file=stderr + ) + progress.set_description(f"Epoch {info['epoch']+1}") def finalize() -> None: - pass + if output_stream: + output_stream.close() return log_step, finalize diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md index 31d178b67..d9167c76f 100644 --- a/website/docs/api/legacy.md +++ b/website/docs/api/legacy.md @@ -248,6 +248,59 @@ added to an existing vectors table. See more details in ## Loggers {#loggers} +These functions are available from `@spacy.registry.loggers`. + +### spacy.ConsoleLogger.v1 {#ConsoleLogger_v1} + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.ConsoleLogger.v1" +> progress_bar = true +> ``` + +Writes the results of a training step to the console in a tabular format. + + + +```cli +$ python -m spacy train config.cfg +``` + +``` +ℹ Using CPU +ℹ Loading config and nlp from: config.cfg +ℹ Pipeline: ['tok2vec', 'tagger'] +ℹ Start training +ℹ Training. Initial learn rate: 0.0 + +E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE +--- ------ ------------ ----------- ------- ------ + 0 0 0.00 86.20 0.22 0.00 + 0 200 3.08 18968.78 34.00 0.34 + 0 400 31.81 22539.06 33.64 0.34 + 0 600 92.13 22794.91 43.80 0.44 + 0 800 183.62 21541.39 56.05 0.56 + 0 1000 352.49 25461.82 65.15 0.65 + 0 1200 422.87 23708.82 71.84 0.72 + 0 1400 601.92 24994.79 76.57 0.77 + 0 1600 662.57 22268.02 80.20 0.80 + 0 1800 1101.50 28413.77 82.56 0.83 + 0 2000 1253.43 28736.36 85.00 0.85 + 0 2200 1411.02 28237.53 87.42 0.87 + 0 2400 1605.35 28439.95 88.70 0.89 +``` + +Note that the cumulative loss keeps increasing within one epoch, but should +start decreasing across epochs. + + + +| Name | Description | +| -------------- | --------------------------------------------------------- | +| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ | + Logging utilities for spaCy are implemented in the [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the functions are typically available from `@spacy.registry.loggers`. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 1e1925442..c3dc42f1a 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -275,8 +275,8 @@ Render a dependency parse tree or named entity visualization. ### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"} -Generate dependency parse in `{'words': [], 'arcs': []}` format. -For use with the `manual=True` argument in `displacy.render`. +Generate dependency parse in `{'words': [], 'arcs': []}` format. For use with +the `manual=True` argument in `displacy.render`. > #### Example > @@ -297,8 +297,8 @@ For use with the `manual=True` argument in `displacy.render`. ### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"} -Generate named entities in `[{start: i, end: i, label: 'label'}]` format. -For use with the `manual=True` argument in `displacy.render`. +Generate named entities in `[{start: i, end: i, label: 'label'}]` format. For +use with the `manual=True` argument in `displacy.render`. > #### Example > @@ -319,8 +319,8 @@ For use with the `manual=True` argument in `displacy.render`. ### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"} -Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. -For use with the `manual=True` argument in `displacy.render`. +Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. For +use with the `manual=True` argument in `displacy.render`. > #### Example > @@ -505,7 +505,7 @@ finished. To log each training step, a and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the -console in tabular format. The +console in tabular format and saves them to a `jsonl` file. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as a dependency of spaCy, enables other loggers, such as one that sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. @@ -513,16 +513,20 @@ a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). -#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} +#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"} > #### Example config > > ```ini > [training.logger] -> @loggers = "spacy.ConsoleLogger.v1" +> @loggers = "spacy.ConsoleLogger.v2" +> progress_bar = true +> console_output = true +> output_file = "training_log.jsonl" > ``` -Writes the results of a training step to the console in a tabular format. +Writes the results of a training step to the console in a tabular format and +saves them to a `jsonl` file. @@ -536,22 +540,23 @@ $ python -m spacy train config.cfg ℹ Pipeline: ['tok2vec', 'tagger'] ℹ Start training ℹ Training. Initial learn rate: 0.0 +ℹ Saving results to training_log.jsonl E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE --- ------ ------------ ----------- ------- ------ - 1 0 0.00 86.20 0.22 0.00 - 1 200 3.08 18968.78 34.00 0.34 - 1 400 31.81 22539.06 33.64 0.34 - 1 600 92.13 22794.91 43.80 0.44 - 1 800 183.62 21541.39 56.05 0.56 - 1 1000 352.49 25461.82 65.15 0.65 - 1 1200 422.87 23708.82 71.84 0.72 - 1 1400 601.92 24994.79 76.57 0.77 - 1 1600 662.57 22268.02 80.20 0.80 - 1 1800 1101.50 28413.77 82.56 0.83 - 1 2000 1253.43 28736.36 85.00 0.85 - 1 2200 1411.02 28237.53 87.42 0.87 - 1 2400 1605.35 28439.95 88.70 0.89 + 0 0 0.00 86.20 0.22 0.00 + 0 200 3.08 18968.78 34.00 0.34 + 0 400 31.81 22539.06 33.64 0.34 + 0 600 92.13 22794.91 43.80 0.44 + 0 800 183.62 21541.39 56.05 0.56 + 0 1000 352.49 25461.82 65.15 0.65 + 0 1200 422.87 23708.82 71.84 0.72 + 0 1400 601.92 24994.79 76.57 0.77 + 0 1600 662.57 22268.02 80.20 0.80 + 0 1800 1101.50 28413.77 82.56 0.83 + 0 2000 1253.43 28736.36 85.00 0.85 + 0 2200 1411.02 28237.53 87.42 0.87 + 0 2400 1605.35 28439.95 88.70 0.89 ``` Note that the cumulative loss keeps increasing within one epoch, but should @@ -559,6 +564,12 @@ start decreasing across epochs. +| Name | Description | +| ---------------- | --------------------------------------------------------------------- | +| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ | +| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ | +| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ | + ## Readers {#readers} ### File readers {#file-readers source="github.com/explosion/srsly" new="3"}