From b40f44419b03010d7eb14d255f9bfc99c3cad637 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 1 Aug 2020 13:40:06 +0200
Subject: [PATCH] Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output
---
 spacy/errors.py                            |   2 -
 spacy/language.py                          |  19 +-
 spacy/pipe_analysis.py                     | 167 ++++++++-------------
 spacy/tests/pipeline/test_analysis.py      |  33 +----
 website/docs/api/language.md               |  76 ++++++---
 website/docs/usage/processing-pipelines.md |  62 +++++--
 6 files changed, 180 insertions(+), 179 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 3fe53d6db..124572b0b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -63,8 +63,6 @@ class Warnings:
             "have the spacy-lookups-data package installed.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
-            "previous components in the pipeline declare that they assign it.")
     W026 = ("Unable to set all sentence boundaries from dependency parses.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
diff --git a/spacy/language.py b/spacy/language.py
index 6230913b4..d1b180cef 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,7 +18,7 @@ from timeit import default_timer as timer
 
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, print_summary
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .gold import Example
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
@@ -524,19 +524,20 @@ class Language:
         self,
         *,
         keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
-        pretty: bool = True,
-        no_print: bool = False,
+        pretty: bool = False,
     ) -> Optional[Dict[str, Any]]:
-        """Analyze the current pipeline components, print a summary of what
-        they assign or require and check that all requirements are met.
+        """Analyze the current pipeline components and check that all
+        requirements are met. Prints a summary table if pretty is True.
 
         keys (List[str]): The meta values to display in the table. Corresponds
             to values in FactoryMeta, defined by @Language.factory decorator.
-        pretty (bool): Pretty-print the results with colors and icons.
-        no_print (bool): Don't print anything and return structured dict instead.
-        RETURNS (dict): The data, if no_print is set to True.
+        pretty (bool): Pretty-print the results as a table. Defaults to False.
+        RETURNS (dict): The analysis, keyed by "summary", "problems" and "attrs".
         """
-        return print_summary(self, keys=keys, pretty=pretty, no_print=no_print)
+        analysis = analyze_pipes(self, keys=keys)
+        if pretty:
+            print_pipe_analysis(analysis, keys=keys)
+        return analysis
 
     def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
        """Get a pipeline component for a given component name.
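With this change, `analyze_pipes` always returns the structured analysis and only prints when asked to. A minimal sketch of the new call pattern (assuming a blank English pipeline and the built-in `tagger` and `entity_linker` factories, mirroring the docs example further down):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")

# Structured data is now the default return value; nothing is printed.
analysis = nlp.analyze_pipes()
for name, unmet in analysis["problems"].items():
    if unmet:
        print(f"'{name}' is missing: {', '.join(unmet)}")

# Opt in to the formatted table and problem summary.
nlp.analyze_pipes(pretty=True)
```

The new `"attrs"` entry in the result mirrors `get_attr_info`, so callers no longer need the removed `get_assigns_for_attr` / `get_requires_for_attr` helpers.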
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index 71f99daef..008ac3384 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -1,9 +1,8 @@
-from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
-from wasabi import Printer
-import warnings
+from typing import List, Dict, Iterable, Union, TYPE_CHECKING
+from wasabi import msg
 
 from .tokens import Doc, Token, Span
-from .errors import Errors, Warnings
+from .errors import Errors
 from .util import dot_to_dict
 
 if TYPE_CHECKING:
@@ -11,35 +10,7 @@
     from .language import Language  # noqa: F401
 
 
-def analyze_pipes(
-    nlp: "Language", name: str, index: int, warn: bool = True
-) -> List[str]:
-    """Analyze a pipeline component with respect to its position in the current
-    pipeline and the other components. Will check whether requirements are
-    fulfilled (e.g. if previous components assign the attributes).
-
-    nlp (Language): The current nlp object.
-    name (str): The name of the pipeline component to analyze.
-    index (int): The index of the component in the pipeline.
-    warn (bool): Show user warning if problem is found.
-    RETURNS (List[str]): The problems found for the given pipeline component.
-    """
-    assert nlp.pipeline[index][0] == name
-    prev_pipes = nlp.pipeline[:index]
-    meta = nlp.get_pipe_meta(name)
-    requires = {annot: False for annot in meta.requires}
-    if requires:
-        for prev_name, prev_pipe in prev_pipes:
-            prev_meta = nlp.get_pipe_meta(prev_name)
-            for annot in prev_meta.assigns:
-                requires[annot] = True
-    problems = []
-    for annot, fulfilled in requires.items():
-        if not fulfilled:
-            problems.append(annot)
-            if warn:
-                warnings.warn(Warnings.W025.format(name=name, attr=annot))
-    return problems
+DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
 
 
 def validate_attrs(values: Iterable[str]) -> Iterable[str]:
@@ -88,97 +59,77 @@
     return values
 
 
-def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
-    assert feature in ["assigns", "requires"]
-    result = []
+def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
+    """Check which components in the pipeline assign or require an attribute.
+
+    nlp (Language): The current nlp object.
+    attr (str): The attribute, e.g. "doc.tensor".
+    RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
+        mapped to a list of component names.
+    """
+    result = {"assigns": [], "requires": []}
     for pipe_name in nlp.pipe_names:
         meta = nlp.get_pipe_meta(pipe_name)
-        pipe_assigns = getattr(meta, feature, [])
-        if attr in pipe_assigns:
-            result.append(pipe_name)
+        if attr in meta.assigns:
+            result["assigns"].append(pipe_name)
+        if attr in meta.requires:
+            result["requires"].append(pipe_name)
     return result
 
 
-def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
-    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
-
-    pipeline (Language): The current nlp object.
-    attr (str): The attribute to check.
-    RETURNS (List[str]): Names of components that require the attr.
-    """
-    return _get_feature_for_attr(nlp, attr, "assigns")
-
-
-def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
-    """Get all pipeline components that require an attr, e.g. "doc.tensor".
-
-    pipeline (Language): The current nlp object.
-    attr (str): The attribute to check.
-    RETURNS (List[str]): Names of components that require the attr.
- """ - return _get_feature_for_attr(nlp, attr, "requires") - - -def print_summary( - nlp: "Language", - *, - keys: List[str] = ["requires", "assigns", "scores", "retokenizes"], - pretty: bool = True, - no_print: bool = False, -) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: +def analyze_pipes( + nlp: "Language", *, keys: List[str] = DEFAULT_KEYS, +) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Print a formatted summary for the current nlp object's pipeline. Shows a table with the pipeline components and why they assign and require, as well as any problems if available. nlp (Language): The nlp object. keys (List[str]): The meta keys to show in the table. - pretty (bool): Pretty-print the results (color etc). - no_print (bool): Don't print anything, just return the data. - RETURNS (dict): A dict with "overview" and "problems". + RETURNS (dict): A dict with "summary" and "problems". """ - msg = Printer(pretty=pretty, no_print=no_print) - overview = {} - problems = {} + result = {"summary": {}, "problems": {}} + all_attrs = set() for i, name in enumerate(nlp.pipe_names): meta = nlp.get_pipe_meta(name) - overview[name] = {"i": i, "name": name} - for key in keys: - overview[name][key] = getattr(meta, key, None) - problems[name] = analyze_pipes(nlp, name, i, warn=False) + all_attrs.update(meta.assigns) + all_attrs.update(meta.requires) + result["summary"][name] = {key: getattr(meta, key, None) for key in keys} + prev_pipes = nlp.pipeline[:i] + requires = {annot: False for annot in meta.requires} + if requires: + for prev_name, prev_pipe in prev_pipes: + prev_meta = nlp.get_pipe_meta(prev_name) + for annot in prev_meta.assigns: + requires[annot] = True + result["problems"][name] = [] + for annot, fulfilled in requires.items(): + if not fulfilled: + result["problems"][name].append(annot) + result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs} + return result + + +def print_pipe_analysis( + analysis: Dict[str, Union[List[str], Dict[str, List[str]]]], + *, + keys: List[str] = DEFAULT_KEYS, +) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: + """Print a formatted version of the pipe analysis produced by analyze_pipes. + + analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis. + keys (List[str]): The meta keys to show in the table. + """ msg.divider("Pipeline Overview") header = ["#", "Component", *[key.capitalize() for key in keys]] - body = [[info for info in entry.values()] for entry in overview.values()] + summary = analysis["summary"].items() + body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)] msg.table(body, header=header, divider=True, multiline=True) - n_problems = sum(len(p) for p in problems.values()) - if any(p for p in problems.values()): + n_problems = sum(len(p) for p in analysis["problems"].values()) + if any(p for p in analysis["problems"].values()): msg.divider(f"Problems ({n_problems})") - for name, problem in problems.items(): + for name, problem in analysis["problems"].items(): if problem: msg.warn(f"'{name}' requirements not met: {', '.join(problem)}") else: msg.good("No problems found.") - if no_print: - return {"overview": overview, "problems": problems} - - -def count_pipeline_interdependencies(nlp: "Language") -> List[int]: - """Count how many subsequent components require an annotation set by each - component in the pipeline. - - nlp (Language): The current nlp object. - RETURNS (List[int]): The interdependency counts. 
- """ - pipe_assigns = [] - pipe_requires = [] - for name in nlp.pipe_names: - meta = nlp.get_pipe_meta(name) - pipe_assigns.append(set(meta.assigns)) - pipe_requires.append(set(meta.requires)) - counts = [] - for i, assigns in enumerate(pipe_assigns): - count = 0 - for requires in pipe_requires[i + 1 :]: - if assigns.intersection(requires): - count += 1 - counts.append(count) - return counts diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 80987c838..df3d7dff5 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,6 +1,5 @@ from spacy.language import Language -from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.pipe_analysis import validate_attrs, count_pipeline_interdependencies +from spacy.pipe_analysis import get_attr_info, validate_attrs from mock import Mock import pytest @@ -29,10 +28,10 @@ def test_component_decorator_assigns(): nlp = Language() nlp.add_pipe("c1") nlp.add_pipe("c2") - problems = nlp.analyze_pipes(no_print=True)["problems"] + problems = nlp.analyze_pipes()["problems"] assert problems["c2"] == ["token.pos"] nlp.add_pipe("c3") - assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"] + assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"] nlp.add_pipe("c1", name="c4") test_component4_meta = nlp.get_pipe_meta("c1") assert test_component4_meta.factory == "c1" @@ -40,8 +39,8 @@ def test_component_decorator_assigns(): assert not Language.has_factory("c4") assert nlp.pipe_factories["c1"] == "c1" assert nlp.pipe_factories["c4"] == "c1" - assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"] - assert get_requires_for_attr(nlp, "token.pos") == ["c2"] + assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"] + assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"] assert nlp("hello world") @@ -108,26 +107,8 @@ def test_analysis_validate_attrs_remove_pipe(): nlp = Language() nlp.add_pipe("pipe_analysis_c6") nlp.add_pipe("pipe_analysis_c7") - problems = nlp.analyze_pipes(no_print=True)["problems"] + problems = nlp.analyze_pipes()["problems"] assert problems["pipe_analysis_c7"] == ["token.pos"] nlp.remove_pipe("pipe_analysis_c7") - problems = nlp.analyze_pipes(no_print=True)["problems"] + problems = nlp.analyze_pipes()["problems"] assert all(p == [] for p in problems.values()) - - -def test_pipe_interdependencies(): - prefix = "test_pipe_interdependencies" - - @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",)) - def fancifier(doc): - return doc - - @Language.component(f"{prefix}.needer", requires=("doc._.fancy",)) - def needer(doc): - return doc - - nlp = Language() - nlp.add_pipe(f"{prefix}.fancifier") - nlp.add_pipe(f"{prefix}.needer") - counts = count_pipeline_interdependencies(nlp) - assert counts == [1, 0] diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 608442122..ba62d0b13 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -98,10 +98,10 @@ decorator. For more details and examples, see the | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `name` | str | The name of the component factory. 
| _keyword-only_ | | |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
-| `func` | `Optional[Callable]` | Optional function if not used a a decorator. |
+| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |

@@ -146,10 +146,10 @@ examples, see the
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
 | `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
-| `func` | `Optional[Callable]` | Optional function if not used a a decorator. |
+| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |

@@ -622,12 +622,45 @@
 doesn't, the pipeline analysis won't catch that.
> nlp = spacy.blank("en") > nlp.add_pipe("tagger") > nlp.add_pipe("entity_linker") -> nlp.analyze_pipes() +> analysis = nlp.analyze_pipes() > ``` +```json +### Structured +{ + "summary": { + "tagger": { + "assigns": ["token.tag"], + "requires": [], + "scores": ["tag_acc", "pos_acc", "lemma_acc"], + "retokenizes": false + }, + "entity_linker": { + "assigns": ["token.ent_kb_id"], + "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + "scores": [], + "retokenizes": false + } + }, + "problems": { + "tagger": [], + "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"] + }, + "attrs": { + "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] }, + "doc.ents": { "assigns": [], "requires": ["entity_linker"] }, + "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] }, + "doc.sents": { "assigns": [], "requires": ["entity_linker"] }, + "token.tag": { "assigns": ["tagger"], "requires": [] }, + "token.ent_type": { "assigns": [], "requires": ["entity_linker"] } + } +} ``` + +``` +### Pretty ============================= Pipeline Overview ============================= # Component Assigns Requires Scores Retokenizes @@ -649,13 +682,12 @@ token.ent_iob, token.ent_type -| Name | Type | Description | -| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | -| `pretty` | bool | Pretty-print the results with colors and icons. Defaults to `True`. | -| `no_print` | bool | Don't print anything and return a structured dict instead. Defaults to `False`. | -| **RETURNS** | dict | Optional dict, if `no_print` is set to `True`. | +| Name | Type | Description | +| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | +| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. | +| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). | ## Language.meta {#meta tag="property"} @@ -892,8 +924,8 @@ instance and factory instance. | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory` | str | The name of the registered component factory. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. 
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index deca96840..6388529f6 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -319,17 +319,61 @@
 attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
 they retokenize the `Doc` and which scores they produce during training. It
 will also show warnings if components require values that aren't set by previous
-component – for instance, if the entity linker is used but no component that
-runs before it sets named entities.
+components – for instance, if the entity linker is used but no component that
+runs before it sets named entities. Setting `pretty=True` will pretty-print a
+table instead of only returning the structured data.
+
+> #### ✏️ Things to try
+>
+> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker.
+>    The analysis should now show no problems, because requirements are met (see the sketch below).
```python +### {executable="true"} +import spacy + nlp = spacy.blank("en") nlp.add_pipe("tagger") -nlp.add_pipe("entity_linker") # this is a problem, because it needs entities -nlp.analyze_pipes() +# This is a problem because it needs entities and sentence boundaries +nlp.add_pipe("entity_linker") +analysis = nlp.analyze_pipes(pretty=True) +``` + + + +```json +### Structured +{ + "summary": { + "tagger": { + "assigns": ["token.tag"], + "requires": [], + "scores": ["tag_acc", "pos_acc", "lemma_acc"], + "retokenizes": false + }, + "entity_linker": { + "assigns": ["token.ent_kb_id"], + "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + "scores": [], + "retokenizes": false + } + }, + "problems": { + "tagger": [], + "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"] + }, + "attrs": { + "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] }, + "doc.ents": { "assigns": [], "requires": ["entity_linker"] }, + "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] }, + "doc.sents": { "assigns": [], "requires": ["entity_linker"] }, + "token.tag": { "assigns": ["tagger"], "requires": [] }, + "token.ent_type": { "assigns": [], "requires": ["entity_linker"] } + } +} ``` ``` -### Example output +### Pretty ============================= Pipeline Overview ============================= # Component Assigns Requires Scores Retokenizes @@ -349,13 +393,7 @@ nlp.analyze_pipes() token.ent_iob, token.ent_type ``` -If you prefer a structured dictionary containing the component information and -the problems, you can set `no_print=True`. This will return the data instead of -printing it. - -``` -result = nlp.analyze_pipes(no_print=True) -``` +
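Following the "Things to try" suggestion above, a minimal sketch of a pipeline that satisfies the entity linker's requirements (assuming the built-in `ner` and `sentencizer` factories):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
# "ner" assigns doc.ents, token.ent_iob and token.ent_type, and
# "sentencizer" assigns doc.sents, so all of the entity linker's
# requirements are declared before it runs.
nlp.add_pipe("ner")
nlp.add_pipe("sentencizer")
nlp.add_pipe("entity_linker")

analysis = nlp.analyze_pipes(pretty=True)
assert analysis["problems"]["entity_linker"] == []
```

Because the analysis is static and only compares the declared `assigns` and `requires` meta, this check passes without training or running any of the components.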