Remove scores list from config and document

2020-07-28 11:22:24 +02:00 · 2020-07-28 11:22:24 +02:00 · 0094cb0d04
parent 9b704c3db3
commit 0094cb0d04
6 changed files with 46 additions and 40 deletions
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -445,9 +445,8 @@ def setup_printer(
 def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 ) -> None:
-    score_cols = training["scores"]
    nlp.meta["performance"] = {}
-    for metric in score_cols:
+    for metric in training["score_weights"]:  # reported metrics are the keys of the [training] score_weights setting
        nlp.meta["performance"][metric] = info["other_scores"][metric]
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -34,7 +34,6 @@ seed = 0
 accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
-scores = ["token_acc", "speed"]
 score_weights = {}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
--- a/spacy/language.py
+++ b/spacy/language.py
@ -224,19 +224,15 @@ class Language:
        # We're storing the filled config for each pipeline component and so
        # we can populate the config again later
        pipeline = {}
-        scores = self._config["training"].get("scores", [])
        score_weights = []
        for pipe_name in self.pipe_names:
            pipe_meta = self.get_pipe_meta(pipe_name)
            pipe_config = self.get_pipe_config(pipe_name)
            pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
-            scores.extend(pipe_meta.scores)
            if pipe_meta.default_score_weights:
                score_weights.append(pipe_meta.default_score_weights)
        self._config["nlp"]["pipeline"] = self.pipe_names
        self._config["components"] = pipeline
-        self._config["training"]["scores"] = sorted(set(scores))
-        combined_score_weights = combine_score_weights(score_weights)
        self._config["training"]["score_weights"] = combine_score_weights(score_weights)
        if not srsly.is_json_serializable(self._config):
            raise ValueError(Errors.E961.format(config=self._config))
@ -376,6 +372,12 @@ class Language:
            e.g. "token.ent_id". Used for pipeline analyis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
+        scores (Iterable[str]): All scores set by the component if it's trainable,
+            e.g. ["ents_f", "ents_r", "ents_p"].
+        default_score_weights (Dict[str, float]): The scores to report during
+            training, and their default weight towards the final score used to
+            select the best model. Weights should sum to 1.0 per component and
+            will be combined and normalized for the whole pipeline.
        func (Optional[Callable]): Factory function if not used as a decorator.
        """
        if not isinstance(name, str):
@ -448,6 +450,12 @@ class Language:
            e.g. "token.ent_id". Used for pipeline analyis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
+        scores (Iterable[str]): All scores set by the component if it's trainable,
+            e.g. ["ents_f", "ents_r", "ents_p"].
+        default_score_weights (Dict[str, float]): The scores to report during
+            training, and their default weight towards the final score used to
+            select the best model. Weights should sum to 1.0 per component and
+            will be combined and normalized for the whole pipeline.
        func (Optional[Callable]): Factory function if not used as a decorator.
        """
        if name is not None and not isinstance(name, str):
@ -1505,7 +1513,7 @@ class FactoryMeta:
    requires: Iterable[str] = tuple()
    retokenizes: bool = False
    scores: Iterable[str] = tuple()
-    default_score_weights: Dict[str, float] = None
+    default_score_weights: Optional[Dict[str, float]] = None  # noqa: E704


 def _get_config_overrides(
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -207,8 +207,7 @@ class ConfigSchemaTraining(BaseModel):
    seed: Optional[StrictInt] = Field(..., title="Random seed")
    accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
    use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
-    scores: List[StrictStr] = Field(..., title="Score types to be printed in overview")
-    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model")
+    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size")
    batch_by: StrictStr = Field(..., title="Batch examples by type")
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@ -343,10 +343,7 @@ def test_language_factories_invalid():
            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
        ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
    ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@ -371,11 +368,9 @@ def test_language_factories_scores():
    meta2 = Language.get_factory_meta(f"{name}2")
    assert meta2.default_score_weights == weights2
    nlp = Language()
-    nlp._config["training"]["scores"] = ["speed"]
    nlp._config["training"]["score_weights"] = {}
    nlp.add_pipe(f"{name}1")
    nlp.add_pipe(f"{name}2")
    cfg = nlp.config["training"]
-    assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())])
    expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
    assert cfg["score_weights"] == expected_weights
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -42,14 +42,16 @@ decorator. For more details and examples, see the
 > Language.component("my_component2", func=my_component)
 > ```

-| Name           | Type                 | Description                                                                                                                                   |
-| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`         | str                  | The name of the component factory.                                                                                                            |
-| _keyword-only_ |                      |                                                                                                                                               |
-| `assigns`      | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires`     | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes`  | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                      |
-| `func`         | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                  |
+| Name                    | Type                 | Description                                                                                                                                                                                                                 |
+| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
+| _keyword-only_          |                      |                                                                                                                                                                                                                             |
+| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
+| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
+| `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
+| `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |

 ## Language.factory {#factory tag="classmethod"}

@ -87,15 +89,17 @@ examples, see the
 > )
 > ```

-| Name             | Type                 | Description                                                                                                                                   |
-| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`           | str                  | The name of the component factory.                                                                                                            |
-| _keyword-only_   |                      |                                                                                                                                               |
-| `default_config` | `Dict[str, any]`     | The default config, describing the default values of the factory arguments.                                                                   |
-| `assigns`        | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires`       | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes`    | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                      |
-| `func`           | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                  |
+| Name                    | Type                 | Description                                                                                                                                                                                                                 |
+| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
+| _keyword-only_          |                      |                                                                                                                                                                                                                             |
+| `default_config`        | `Dict[str, any]`     | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
+| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
+| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
+| `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
+| `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |

 ## Language.\_\_init\_\_ {#init tag="method"}

@ -767,10 +771,12 @@ provided by the [`@Language.component`](/api/language#component) or
 component is added to the pipeline and stored on the `Language` class for each
 component instance and factory instance.

-| Name             | Type             | Description                                                                                                                                    |
-| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `factory`        | str              | The name of the registered component factory.                                                                                                  |
-| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments.                                                                    |
-| `assigns`        | `Iterable[str]`  | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
-| `requires`       | `Iterable[str]`  | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
-| `retokenizes`    | bool             | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                       |
+| Name                    | Type               | Description                                                                                                                                                                                                                 |
+| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `factory`               | str                | The name of the registered component factory.                                                                                                                                                                               |
+| `default_config`        | `Dict[str, Any]`   | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
+| `assigns`               | `Iterable[str]`    | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `requires`              | `Iterable[str]`    | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
+| `retokenizes`           | bool               | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
+| `scores`                | `Iterable[str]`    | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
+| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |