From 6723d76f24a55f24ef1632ac8be46567a984d0ef Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 29 Aug 2022 10:23:05 +0200
Subject: [PATCH] Add ConsoleLogger.v2 (#11214)

* Init

* Change logger to ConsoleLogger.v2

* adjust naming

* More naming adjustments

* Fix output_file reference error

* ignore type

* Add basic test for logger

* Hopefully fix mypy issue

* mypy ignore line

* Update mypy line

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update test method name

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Change file saving logic

* Fix finalize method

* increase spacy-legacy version in requirements

* Update docs

* small adjustments

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 requirements.txt                    |   2 +-
 setup.cfg                           |   2 +-
 spacy/tests/training/test_logger.py |  30 ++++++++
 spacy/training/loggers.py           | 102 +++++++++++++++++++++-------
 website/docs/api/legacy.md          |  53 +++++++++++++++
 website/docs/api/top-level.md       |  57 +++++++++-------
 6 files changed, 198 insertions(+), 48 deletions(-)
 create mode 100644 spacy/tests/training/test_logger.py

diff --git a/requirements.txt b/requirements.txt
index 437dd415a..3b8d66e0e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.9,<3.1.0
+spacy-legacy>=3.0.10,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index bf4890a68..5fd820a96 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ setup_requires =
     thinc>=8.1.0,<8.2.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.9,<3.1.0
+    spacy-legacy>=3.0.10,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
diff --git a/spacy/tests/training/test_logger.py b/spacy/tests/training/test_logger.py
new file mode 100644
index 000000000..0dfd0cbf4
--- /dev/null
+++ b/spacy/tests/training/test_logger.py
@@ -0,0 +1,30 @@
+import pytest
+import spacy
+
+from spacy.training import loggers
+
+
+@pytest.fixture()
+def nlp():
+    nlp = spacy.blank("en")
+    nlp.add_pipe("ner")
+    return nlp
+
+
+@pytest.fixture()
+def info():
+    return {
+        "losses": {"ner": 100},
+        "other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80},
+        "epoch": 100,
+        "step": 125,
+        "score": 85,
+    }
+
+
+def test_console_logger(nlp, info):
+    console_logger = loggers.console_logger(
+        progress_bar=True, console_output=True, output_file=None
+    )
+    log_step, finalize = console_logger(nlp)
+    log_step(info)
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index edd0f1959..408ea7140 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -1,10 +1,13 @@
-from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
+from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union
 from wasabi import Printer
+from pathlib import Path
 import tqdm
 import sys
+import srsly
 
 from ..util import registry
 from ..errors import Errors
+from .. import util
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -23,13 +26,44 @@ def setup_table(
     return final_cols, final_widths, ["r" for _ in final_widths]
 
 
-@registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger(progress_bar: bool = False):
+@registry.loggers("spacy.ConsoleLogger.v2")
+def console_logger(
+    progress_bar: bool = False,
+    console_output: bool = True,
+    output_file: Optional[Union[str, Path]] = None,
+):
+    """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
+    progress_bar (bool): Whether the logger should print the progress bar.
+    console_output (bool): Whether the logger should print the logs on the console.
+    output_file (Optional[Union[str, Path]]): The file to save the training logs to.
+    """
+    _log_exist = False
+    if output_file:
+        output_file = util.ensure_path(output_file)  # type: ignore
+        if output_file.exists():  # type: ignore
+            _log_exist = True
+        if not output_file.parents[0].exists():  # type: ignore
+            output_file.parents[0].mkdir(parents=True)  # type: ignore
+
     def setup_printer(
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         write = lambda text: print(text, file=stdout, flush=True)
         msg = Printer(no_print=True)
+
+        nonlocal output_file
+        output_stream = None
+        if _log_exist:
+            write(
+                msg.warn(
+                    f"Saving logs is disabled because {output_file} already exists."
+                )
+            )
+            output_file = None
+        elif output_file:
+            write(msg.info(f"Saving results to {output_file}"))
+            output_stream = open(output_file, "w", encoding="utf-8")
+
         # ensure that only trainable components are logged
         logged_pipes = [
             name
@@ -40,13 +74,15 @@ def console_logger(progress_bar: bool = False):
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
         loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
-        spacing = 2
-        table_header, table_widths, table_aligns = setup_table(
-            cols=["E", "#"] + loss_cols + score_cols + ["Score"],
-            widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
-        )
-        write(msg.row(table_header, widths=table_widths, spacing=spacing))
-        write(msg.row(["-" * width for width in table_widths], spacing=spacing))
+
+        if console_output:
+            spacing = 2
+            table_header, table_widths, table_aligns = setup_table(
+                cols=["E", "#"] + loss_cols + score_cols + ["Score"],
+                widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
+            )
+            write(msg.row(table_header, widths=table_widths, spacing=spacing))
+            write(msg.row(["-" * width for width in table_widths], spacing=spacing))
         progress = None
 
         def log_step(info: Optional[Dict[str, Any]]) -> None:
@@ -57,12 +93,15 @@ def console_logger(progress_bar: bool = False):
                 if progress is not None:
                     progress.update(1)
                 return
-            losses = [
-                "{0:.2f}".format(float(info["losses"][pipe_name]))
-                for pipe_name in logged_pipes
-            ]
+
+            losses = []
+            log_losses = {}
+            for pipe_name in logged_pipes:
+                losses.append("{0:.2f}".format(float(info["losses"][pipe_name])))
+                log_losses[pipe_name] = float(info["losses"][pipe_name])
 
             scores = []
+            log_scores = {}
             for col in score_cols:
                 score = info["other_scores"].get(col, 0.0)
                 try:
@@ -73,6 +112,7 @@ def console_logger(progress_bar: bool = False):
                 if col != "speed":
                     score *= 100
                 scores.append("{0:.2f}".format(score))
+                log_scores[str(col)] = score
 
             data = (
                 [info["epoch"], info["step"]]
@@ -80,20 +120,36 @@ def console_logger(progress_bar: bool = False):
                 + scores
                 + ["{0:.2f}".format(float(info["score"]))]
             )
+
+            if output_stream:
+                # Write to log file per log_step
+                log_data = {
+                    "epoch": info["epoch"],
+                    "step": info["step"],
+                    "losses": log_losses,
+                    "scores": log_scores,
+                    "score": float(info["score"]),
+                }
+                output_stream.write(srsly.json_dumps(log_data) + "\n")
+
             if progress is not None:
                 progress.close()
-            write(
-                msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
-            )
-            if progress_bar:
-                # Set disable=None, so that it disables on non-TTY
-                progress = tqdm.tqdm(
-                    total=eval_frequency, disable=None, leave=False, file=stderr
+            if console_output:
+                write(
+                    msg.row(
+                        data, widths=table_widths, aligns=table_aligns, spacing=spacing
+                    )
                 )
-                progress.set_description(f"Epoch {info['epoch']+1}")
+                if progress_bar:
+                    # Set disable=None, so that it disables on non-TTY
+                    progress = tqdm.tqdm(
+                        total=eval_frequency, disable=None, leave=False, file=stderr
+                    )
+                    progress.set_description(f"Epoch {info['epoch']+1}")
 
         def finalize() -> None:
-            pass
+            if output_stream:
+                output_stream.close()
 
         return log_step, finalize
 
diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md
index 31d178b67..d9167c76f 100644
--- a/website/docs/api/legacy.md
+++ b/website/docs/api/legacy.md
@@ -248,6 +248,59 @@ added to an existing vectors table. See more details in
 
 ## Loggers {#loggers}
 
+These functions are available from `@spacy.registry.loggers`.
+
+### spacy.ConsoleLogger.v1 {#ConsoleLogger_v1}
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.ConsoleLogger.v1"
+> progress_bar = true
+> ```
+
+Writes the results of a training step to the console in a tabular format.
+
+<Accordion title="Example console output" spaced>
+
+```cli
+$ python -m spacy train config.cfg
+```
+
+```
+ℹ Using CPU
+ℹ Loading config and nlp from: config.cfg
+ℹ Pipeline: ['tok2vec', 'tagger']
+ℹ Start training
+ℹ Training. Initial learn rate: 0.0
+
+E     #        LOSS TOK2VEC   LOSS TAGGER   TAG_ACC   SCORE
+---   ------   ------------   -----------   -------   ------
+  0        0           0.00         86.20      0.22     0.00
+  0      200           3.08      18968.78     34.00     0.34
+  0      400          31.81      22539.06     33.64     0.34
+  0      600          92.13      22794.91     43.80     0.44
+  0      800         183.62      21541.39     56.05     0.56
+  0     1000         352.49      25461.82     65.15     0.65
+  0     1200         422.87      23708.82     71.84     0.72
+  0     1400         601.92      24994.79     76.57     0.77
+  0     1600         662.57      22268.02     80.20     0.80
+  0     1800        1101.50      28413.77     82.56     0.83
+  0     2000        1253.43      28736.36     85.00     0.85
+  0     2200        1411.02      28237.53     87.42     0.87
+  0     2400        1605.35      28439.95     88.70     0.89
+```
+
+Note that the cumulative loss keeps increasing within one epoch, but should
+start decreasing across epochs.
+
+ </Accordion>
+
+| Name           | Description                                               |
+| -------------- | --------------------------------------------------------- |
+| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
+
 Logging utilities for spaCy are implemented in the
 [`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the
 functions are typically available from `@spacy.registry.loggers`.
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 1e1925442..c3dc42f1a 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -275,8 +275,8 @@ Render a dependency parse tree or named entity visualization.
 
 ### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}
 
-Generate dependency parse in `{'words': [], 'arcs': []}` format.
-For use with the `manual=True` argument in `displacy.render`.
+Generate dependency parse in `{'words': [], 'arcs': []}` format. For use with
+the `manual=True` argument in `displacy.render`.
 
 > #### Example
 >
@@ -297,8 +297,8 @@ For use with the `manual=True` argument in `displacy.render`.
 
 ### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}
 
-Generate named entities in `[{start: i, end: i, label: 'label'}]` format.
-For use with the `manual=True` argument in `displacy.render`.
+Generate named entities in `[{start: i, end: i, label: 'label'}]` format. For
+use with the `manual=True` argument in `displacy.render`.
 
 > #### Example
 >
@@ -319,8 +319,8 @@ For use with the `manual=True` argument in `displacy.render`.
 
 ### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}
 
-Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format.
-For use with the `manual=True` argument in `displacy.render`.
+Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. For
+use with the `manual=True` argument in `displacy.render`.
 
 > #### Example
 >
@@ -505,7 +505,7 @@ finished. To log each training step, a
 and the accuracy scores on the development set.
 
 The built-in, default logger is the ConsoleLogger, which prints results to the
-console in tabular format. The
+console in tabular format and can also save them to a `jsonl` file. The
 [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
 a dependency of spaCy, enables other loggers, such as one that sends results to
 a [Weights & Biases](https://www.wandb.com/) dashboard.
@@ -513,16 +513,20 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
 Instead of using one of the built-in loggers, you can
 [implement your own](/usage/training#custom-logging).
 
-#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
+#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
 
 > #### Example config
 >
 > ```ini
 > [training.logger]
-> @loggers = "spacy.ConsoleLogger.v1"
+> @loggers = "spacy.ConsoleLogger.v2"
+> progress_bar = true
+> console_output = true
+> output_file = "training_log.jsonl"
 > ```
 
-Writes the results of a training step to the console in a tabular format.
+Writes the results of a training step to the console in a tabular format and
+optionally saves them to a `jsonl` file if `output_file` is set.
 
 <Accordion title="Example console output" spaced>
 
@@ -536,22 +540,23 @@ $ python -m spacy train config.cfg
 ℹ Pipeline: ['tok2vec', 'tagger']
 ℹ Start training
 ℹ Training. Initial learn rate: 0.0
+ℹ Saving results to training_log.jsonl
 
 E     #        LOSS TOK2VEC   LOSS TAGGER   TAG_ACC   SCORE
 ---   ------   ------------   -----------   -------   ------
-  1        0           0.00         86.20      0.22     0.00
-  1      200           3.08      18968.78     34.00     0.34
-  1      400          31.81      22539.06     33.64     0.34
-  1      600          92.13      22794.91     43.80     0.44
-  1      800         183.62      21541.39     56.05     0.56
-  1     1000         352.49      25461.82     65.15     0.65
-  1     1200         422.87      23708.82     71.84     0.72
-  1     1400         601.92      24994.79     76.57     0.77
-  1     1600         662.57      22268.02     80.20     0.80
-  1     1800        1101.50      28413.77     82.56     0.83
-  1     2000        1253.43      28736.36     85.00     0.85
-  1     2200        1411.02      28237.53     87.42     0.87
-  1     2400        1605.35      28439.95     88.70     0.89
+  0        0           0.00         86.20      0.22     0.00
+  0      200           3.08      18968.78     34.00     0.34
+  0      400          31.81      22539.06     33.64     0.34
+  0      600          92.13      22794.91     43.80     0.44
+  0      800         183.62      21541.39     56.05     0.56
+  0     1000         352.49      25461.82     65.15     0.65
+  0     1200         422.87      23708.82     71.84     0.72
+  0     1400         601.92      24994.79     76.57     0.77
+  0     1600         662.57      22268.02     80.20     0.80
+  0     1800        1101.50      28413.77     82.56     0.83
+  0     2000        1253.43      28736.36     85.00     0.85
+  0     2200        1411.02      28237.53     87.42     0.87
+  0     2400        1605.35      28439.95     88.70     0.89
 ```
 
 Note that the cumulative loss keeps increasing within one epoch, but should
@@ -559,6 +564,12 @@ start decreasing across epochs.
 
  </Accordion>
 
+| Name             | Description                                                           |
+| ---------------- | --------------------------------------------------------------------- |
+| `progress_bar`   | Whether the logger should print the progress bar. ~~bool~~            |
+| `console_output` | Whether the logger should print the logs on the console. ~~bool~~     |
+| `output_file`    | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
+
 ## Readers {#readers}
 
 ### File readers {#file-readers source="github.com/explosion/srsly" new="3"}