From 0094cb0d0472b08f92915e948907b237eea020e3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 28 Jul 2020 11:22:24 +0200 Subject: [PATCH] Remove scores list from config and document --- spacy/cli/train.py | 3 +- spacy/default_config.cfg | 1 - spacy/language.py | 18 +++++-- spacy/schemas.py | 3 +- spacy/tests/pipeline/test_pipe_factories.py | 7 +-- website/docs/api/language.md | 54 ++++++++++++--------- 6 files changed, 46 insertions(+), 40 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d52762525..44597c73e 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -445,9 +445,8 @@ def setup_printer( def update_meta( training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] ) -> None: - score_cols = training["scores"] nlp.meta["performance"] = {} - for metric in score_cols: + for metric in training["score_weights"]: nlp.meta["performance"][metric] = info["other_scores"][metric] for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 50654acce..fead996ba 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -34,7 +34,6 @@ seed = 0 accumulate_gradient = 1 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. -scores = ["token_acc", "speed"] score_weights = {} # These settings are invalid for the transformer models. 
init_tok2vec = null diff --git a/spacy/language.py b/spacy/language.py index fa9cfb80f..a0b65fd9e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -224,19 +224,15 @@ class Language: # We're storing the filled config for each pipeline component and so # we can populate the config again later pipeline = {} - scores = self._config["training"].get("scores", []) score_weights = [] for pipe_name in self.pipe_names: pipe_meta = self.get_pipe_meta(pipe_name) pipe_config = self.get_pipe_config(pipe_name) pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config} - scores.extend(pipe_meta.scores) if pipe_meta.default_score_weights: score_weights.append(pipe_meta.default_score_weights) self._config["nlp"]["pipeline"] = self.pipe_names self._config["components"] = pipeline - self._config["training"]["scores"] = sorted(set(scores)) - combined_score_weights = combine_score_weights(score_weights) self._config["training"]["score_weights"] = combine_score_weights(score_weights) if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) @@ -376,6 +372,12 @@ class Language: e.g. "token.ent_id". Used for pipeline analyis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. + scores (Iterable[str]): All scores set by the component if it's trainable, + e.g. ["ents_f", "ents_r", "ents_p"]. + default_score_weights (Dict[str, float]): The scores to report during + training, and their default weight towards the final score used to + select the best model. Weights should sum to 1.0 per component and + will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. """ if not isinstance(name, str): @@ -448,6 +450,12 @@ class Language: e.g. "token.ent_id". Used for pipeline analyis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. 
+ scores (Iterable[str]): All scores set by the component if it's trainable, + e.g. ["ents_f", "ents_r", "ents_p"]. + default_score_weights (Dict[str, float]): The scores to report during + training, and their default weight towards the final score used to + select the best model. Weights should sum to 1.0 per component and + will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. """ if name is not None and not isinstance(name, str): @@ -1505,7 +1513,7 @@ class FactoryMeta: requires: Iterable[str] = tuple() retokenizes: bool = False scores: Iterable[str] = tuple() - default_score_weights: Dict[str, float] = None + default_score_weights: Optional[Dict[str, float]] = None # noqa: E704 def _get_config_overrides( diff --git a/spacy/schemas.py b/spacy/schemas.py index 478755cf8..3f3c01f22 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -207,8 +207,7 @@ class ConfigSchemaTraining(BaseModel): seed: Optional[StrictInt] = Field(..., title="Random seed") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch") - scores: List[StrictStr] = Field(..., title="Score types to be printed in overview") - score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model") + score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size") batch_by: StrictStr = Field(..., title="Batch examples by type") diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 3f3cb8984..64c6c2d6f 100644 --- 
a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -343,10 +343,7 @@ def test_language_factories_invalid(): [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), - ( - [{"a": 0.5, "b": 0.5}, {"b": 1.0}], - {"a": 0.25, "b": 0.75}, - ), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},), ], ) def test_language_factories_combine_score_weights(weights, expected): @@ -371,11 +368,9 @@ def test_language_factories_scores(): meta2 = Language.get_factory_meta(f"{name}2") assert meta2.default_score_weights == weights2 nlp = Language() - nlp._config["training"]["scores"] = ["speed"] nlp._config["training"]["score_weights"] = {} nlp.add_pipe(f"{name}1") nlp.add_pipe(f"{name}2") cfg = nlp.config["training"] - assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())]) expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} assert cfg["score_weights"] == expected_weights diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 1df7bcb48..a61249dcb 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -42,14 +42,16 @@ decorator. For more details and examples, see the > Language.component("my_component2", func=my_component) > ``` -| Name | Type | Description | -| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | The name of the component factory. | -| _keyword-only_ | | | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `retokenizes` | bool | Whether the component changes tokenization. 
Used for pipeline analysis. | -| `func` | `Optional[Callable]` | Optional function if not used a a decorator. | +| Name | Type | Description | +| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | The name of the component factory. | +| _keyword-only_ | | | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | +| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | +| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | +| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | ## Language.factory {#factory tag="classmethod"} @@ -87,15 +89,17 @@ examples, see the > ) > ``` -| Name | Type | Description | -| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | The name of the component factory. | -| _keyword-only_ | | | -| `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. 
Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | -| `func` | `Optional[Callable]` | Optional function if not used a a decorator. | +| Name | Type | Description | +| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | The name of the component factory. | +| _keyword-only_ | | | +| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | +| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | +| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | +| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | ## Language.\_\_init\_\_ {#init tag="method"} @@ -767,10 +771,12 @@ provided by the [`@Language.component`](/api/language#component) or component is added to the pipeline and stored on the `Language` class for each component instance and factory instance. 
-| Name | Type | Description | -| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `factory` | str | The name of the registered component factory. | -| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.   | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.   | +| Name | Type | Description | +| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory` | str | The name of the registered component factory. | +| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.   | +| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.   | +| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | +| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. 
Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |