From 0094cb0d0472b08f92915e948907b237eea020e3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 28 Jul 2020 11:22:24 +0200
Subject: [PATCH] Remove scores list from config and document

---
 spacy/cli/train.py                          |  3 +-
 spacy/default_config.cfg                    |  1 -
 spacy/language.py                           | 18 +++++--
 spacy/schemas.py                            |  3 +-
 spacy/tests/pipeline/test_pipe_factories.py |  7 +--
 website/docs/api/language.md                | 54 ++++++++++++---------
 6 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index d52762525..44597c73e 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -445,9 +445,8 @@ def setup_printer(
 def update_meta(
     training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 ) -> None:
-    score_cols = training["scores"]
     nlp.meta["performance"] = {}
-    for metric in score_cols:
+    for metric in training["score_weights"]:
         nlp.meta["performance"][metric] = info["other_scores"][metric]
     for pipe_name in nlp.pipe_names:
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 50654acce..fead996ba 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -34,7 +34,6 @@ seed = 0
 accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
-scores = ["token_acc", "speed"]
 score_weights = {}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
diff --git a/spacy/language.py b/spacy/language.py
index fa9cfb80f..a0b65fd9e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -224,19 +224,15 @@ class Language:
         # We're storing the filled config for each pipeline component and so
         # we can populate the config again later
         pipeline = {}
-        scores = self._config["training"].get("scores", [])
         score_weights = []
         for pipe_name in self.pipe_names:
             pipe_meta = self.get_pipe_meta(pipe_name)
             pipe_config = self.get_pipe_config(pipe_name)
             pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
-            scores.extend(pipe_meta.scores)
             if pipe_meta.default_score_weights:
                 score_weights.append(pipe_meta.default_score_weights)
         self._config["nlp"]["pipeline"] = self.pipe_names
         self._config["components"] = pipeline
-        self._config["training"]["scores"] = sorted(set(scores))
-        combined_score_weights = combine_score_weights(score_weights)
         self._config["training"]["score_weights"] = combine_score_weights(score_weights)
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
@@ -376,6 +372,12 @@ class Language:
             e.g. "token.ent_id". Used for pipeline analyis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
+        scores (Iterable[str]): All scores set by the component if it's trainable,
+            e.g. ["ents_f", "ents_r", "ents_p"].
+        default_score_weights (Dict[str, float]): The scores to report during
+            training, and their default weight towards the final score used to
+            select the best model. Weights should sum to 1.0 per component and
+            will be combined and normalized for the whole pipeline.
         func (Optional[Callable]): Factory function if not used as a decorator.
         """
         if not isinstance(name, str):
@@ -448,6 +450,12 @@ class Language:
             e.g. "token.ent_id". Used for pipeline analyis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
+        scores (Iterable[str]): All scores set by the component if it's trainable,
+            e.g. ["ents_f", "ents_r", "ents_p"].
+        default_score_weights (Dict[str, float]): The scores to report during
+            training, and their default weight towards the final score used to
+            select the best model. Weights should sum to 1.0 per component and
+            will be combined and normalized for the whole pipeline.
         func (Optional[Callable]): Factory function if not used as a decorator.
         """
         if name is not None and not isinstance(name, str):
@@ -1505,7 +1513,7 @@ class FactoryMeta:
     requires: Iterable[str] = tuple()
     retokenizes: bool = False
     scores: Iterable[str] = tuple()
-    default_score_weights: Dict[str, float] = None
+    default_score_weights: Optional[Dict[str, float]] = None  # noqa: E704
 
 
 def _get_config_overrides(
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 478755cf8..3f3c01f22 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -207,8 +207,7 @@ class ConfigSchemaTraining(BaseModel):
     seed: Optional[StrictInt] = Field(..., title="Random seed")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
-    scores: List[StrictStr] = Field(..., title="Score types to be printed in overview")
-    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model")
+    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size")
     batch_by: StrictStr = Field(..., title="Batch examples by type")
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 3f3cb8984..64c6c2d6f 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -343,10 +343,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@@ -371,11 +368,9 @@ def test_language_factories_scores():
     meta2 = Language.get_factory_meta(f"{name}2")
     assert meta2.default_score_weights == weights2
     nlp = Language()
-    nlp._config["training"]["scores"] = ["speed"]
     nlp._config["training"]["score_weights"] = {}
     nlp.add_pipe(f"{name}1")
     nlp.add_pipe(f"{name}2")
     cfg = nlp.config["training"]
-    assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())])
     expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
     assert cfg["score_weights"] == expected_weights
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 1df7bcb48..a61249dcb 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -42,14 +42,16 @@ decorator. For more details and examples, see the
 > Language.component("my_component2", func=my_component)
 > ```
 
-| Name           | Type                 | Description                                                                                                                                   |
-| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`         | str                  | The name of the component factory.                                                                                                            |
-| _keyword-only_ |                      |                                                                                                                                               |
-| `assigns`      | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires`     | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes`  | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                      |
-| `func`         | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                  |
+| Name                    | Type                 | Description                                                                                                                                                                                                                 |
+| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
+| _keyword-only_          |                      |                                                                                                                                                                                                                             |
+| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
+| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
+| `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
+| `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |
 
 ## Language.factory {#factory tag="classmethod"}
 
@@ -87,15 +89,17 @@ examples, see the
 > )
 > ```
 
-| Name             | Type                 | Description                                                                                                                                   |
-| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`           | str                  | The name of the component factory.                                                                                                            |
-| _keyword-only_   |                      |                                                                                                                                               |
-| `default_config` | `Dict[str, any]`     | The default config, describing the default values of the factory arguments.                                                                   |
-| `assigns`        | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires`       | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes`    | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                      |
-| `func`           | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                  |
+| Name                    | Type                 | Description                                                                                                                                                                                                                 |
+| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
+| _keyword-only_          |                      |                                                                                                                                                                                                                             |
+| `default_config`        | `Dict[str, Any]`     | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
+| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
+| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
+| `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
+| `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |
 
 ## Language.\_\_init\_\_ {#init tag="method"}
 
@@ -767,10 +771,12 @@ provided by the [`@Language.component`](/api/language#component) or
 component is added to the pipeline and stored on the `Language` class for each
 component instance and factory instance.
 
-| Name             | Type             | Description                                                                                                                                    |
-| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `factory`        | str              | The name of the registered component factory.                                                                                                  |
-| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments.                                                                    |
-| `assigns`        | `Iterable[str]`  | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
-| `requires`       | `Iterable[str]`  | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
-| `retokenizes`    | bool             | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                       |
+| Name                    | Type               | Description                                                                                                                                                                                                                 |
+| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `factory`               | str                | The name of the registered component factory.                                                                                                                                                                               |
+| `default_config`        | `Dict[str, Any]`   | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
+| `assigns`               | `Iterable[str]`    | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `requires`              | `Iterable[str]`    | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `retokenizes`           | bool               | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
+| `scores`                | `Iterable[str]`    | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
+| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |