Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output
Ines Montani 2020-08-01 13:40:06 +02:00
parent 98c6a85c8b
commit b40f44419b
6 changed files with 171 additions and 170 deletions
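In effect, `analyze_pipes` now returns the structured analysis by default and only prints when `pretty=True` is passed. A minimal sketch of the new call pattern, assuming a blank English pipeline built from spaCy v3's built-in factories:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")

# Before this commit: printed a table by default, and getting the data
# required no_print=True. Now the data is returned and nothing is printed.
analysis = nlp.analyze_pipes()
assert analysis["problems"]["entity_linker"]  # unmet requirements reported

# Opt in to the formatted table; the data is still returned.
analysis = nlp.analyze_pipes(pretty=True)
```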

spacy/errors.py

@@ -63,8 +63,6 @@ class Warnings:
             "have the spacy-lookups-data package installed.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
-            "previous components in the pipeline declare that they assign it.")
     W026 = ("Unable to set all sentence boundaries from dependency parses.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "

spacy/language.py

@@ -18,7 +18,7 @@ from timeit import default_timer as timer
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, print_summary
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .gold import Example
 from .scorer import Scorer
 from .util import create_default_optimizer, registry

@@ -524,19 +524,20 @@ class Language:
         self,
         *,
         keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
-        pretty: bool = True,
-        no_print: bool = False,
+        pretty: bool = False,
     ) -> Optional[Dict[str, Any]]:
         """Analyze the current pipeline components, print a summary of what
         they assign or require and check that all requirements are met.

         keys (List[str]): The meta values to display in the table. Corresponds
             to values in FactoryMeta, defined by @Language.factory decorator.
-        pretty (bool): Pretty-print the results with colors and icons.
-        no_print (bool): Don't print anything and return structured dict instead.
-        RETURNS (dict): The data, if no_print is set to True.
+        pretty (bool): Pretty-print the results.
+        RETURNS (dict): The data.
         """
-        return print_summary(self, keys=keys, pretty=pretty, no_print=no_print)
+        analysis = analyze_pipes(self, keys=keys)
+        if pretty:
+            print_pipe_analysis(analysis, keys=keys)
+        return analysis

     def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
         """Get a pipeline component for a given component name.

spacy/pipe_analysis.py

@@ -1,9 +1,8 @@
 from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
-from wasabi import Printer
-import warnings
+from wasabi import msg

 from .tokens import Doc, Token, Span
-from .errors import Errors, Warnings
+from .errors import Errors
 from .util import dot_to_dict

 if TYPE_CHECKING:
@@ -11,35 +10,7 @@ if TYPE_CHECKING:
     from .language import Language  # noqa: F401


-def analyze_pipes(
-    nlp: "Language", name: str, index: int, warn: bool = True
-) -> List[str]:
-    """Analyze a pipeline component with respect to its position in the current
-    pipeline and the other components. Will check whether requirements are
-    fulfilled (e.g. if previous components assign the attributes).
-
-    nlp (Language): The current nlp object.
-    name (str): The name of the pipeline component to analyze.
-    index (int): The index of the component in the pipeline.
-    warn (bool): Show user warning if problem is found.
-    RETURNS (List[str]): The problems found for the given pipeline component.
-    """
-    assert nlp.pipeline[index][0] == name
-    prev_pipes = nlp.pipeline[:index]
-    meta = nlp.get_pipe_meta(name)
-    requires = {annot: False for annot in meta.requires}
-    if requires:
-        for prev_name, prev_pipe in prev_pipes:
-            prev_meta = nlp.get_pipe_meta(prev_name)
-            for annot in prev_meta.assigns:
-                requires[annot] = True
-    problems = []
-    for annot, fulfilled in requires.items():
-        if not fulfilled:
-            problems.append(annot)
-            if warn:
-                warnings.warn(Warnings.W025.format(name=name, attr=annot))
-    return problems
+DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]


 def validate_attrs(values: Iterable[str]) -> Iterable[str]:
@@ -88,97 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     return values


-def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
-    assert feature in ["assigns", "requires"]
-    result = []
+def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
+    """Check which components in the pipeline assign or require an attribute.
+
+    nlp (Language): The current nlp object.
+    attr (str): The attribute, e.g. "doc.tensor".
+    RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
+        mapped to a list of component names.
+    """
+    result = {"assigns": [], "requires": []}
     for pipe_name in nlp.pipe_names:
         meta = nlp.get_pipe_meta(pipe_name)
-        pipe_assigns = getattr(meta, feature, [])
-        if attr in pipe_assigns:
-            result.append(pipe_name)
+        if attr in meta.assigns:
+            result["assigns"].append(pipe_name)
+        if attr in meta.requires:
+            result["requires"].append(pipe_name)
     return result


-def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
-    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
-
-    nlp (Language): The current nlp object.
-    attr (str): The attribute to check.
-    RETURNS (List[str]): Names of components that assign the attr.
-    """
-    return _get_feature_for_attr(nlp, attr, "assigns")
-
-
-def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
-    """Get all pipeline components that require an attr, e.g. "doc.tensor".
-
-    nlp (Language): The current nlp object.
-    attr (str): The attribute to check.
-    RETURNS (List[str]): Names of components that require the attr.
-    """
-    return _get_feature_for_attr(nlp, attr, "requires")
-
-
-def print_summary(
-    nlp: "Language",
-    *,
-    keys: List[str] = ["requires", "assigns", "scores", "retokenizes"],
-    pretty: bool = True,
-    no_print: bool = False,
-) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
-    """Print a formatted summary for the current nlp object's pipeline. Shows
-    a table with the pipeline components and what they assign and require, as
-    well as any problems if available.
-
-    nlp (Language): The nlp object.
-    keys (List[str]): The meta keys to show in the table.
-    pretty (bool): Pretty-print the results (color etc).
-    no_print (bool): Don't print anything, just return the data.
-    RETURNS (dict): A dict with "overview" and "problems".
-    """
-    msg = Printer(pretty=pretty, no_print=no_print)
-    overview = {}
-    problems = {}
+def analyze_pipes(
+    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
+) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
+    """Analyze the current pipeline components and their interdependencies:
+    what they assign and require, and whether all requirements are met.
+
+    nlp (Language): The nlp object.
+    keys (List[str]): The meta keys to include in the summary.
+    RETURNS (dict): A dict with "summary", "problems" and "attrs".
+    """
+    result = {"summary": {}, "problems": {}}
     all_attrs = set()
     for i, name in enumerate(nlp.pipe_names):
         meta = nlp.get_pipe_meta(name)
-        overview[name] = {"i": i, "name": name}
-        for key in keys:
-            overview[name][key] = getattr(meta, key, None)
-        problems[name] = analyze_pipes(nlp, name, i, warn=False)
         all_attrs.update(meta.assigns)
         all_attrs.update(meta.requires)
+        result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
+        prev_pipes = nlp.pipeline[:i]
+        requires = {annot: False for annot in meta.requires}
+        if requires:
+            for prev_name, prev_pipe in prev_pipes:
+                prev_meta = nlp.get_pipe_meta(prev_name)
+                for annot in prev_meta.assigns:
+                    requires[annot] = True
+        result["problems"][name] = []
+        for annot, fulfilled in requires.items():
+            if not fulfilled:
+                result["problems"][name].append(annot)
+    result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
+    return result
+
+
+def print_pipe_analysis(
+    analysis: Dict[str, Union[List[str], Dict[str, List[str]]]],
+    *,
+    keys: List[str] = DEFAULT_KEYS,
+) -> None:
+    """Print a formatted version of the pipe analysis produced by analyze_pipes.
+
+    analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
+    keys (List[str]): The meta keys to show in the table.
+    """
     msg.divider("Pipeline Overview")
     header = ["#", "Component", *[key.capitalize() for key in keys]]
-    body = [[info for info in entry.values()] for entry in overview.values()]
+    summary = analysis["summary"].items()
+    body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
     msg.table(body, header=header, divider=True, multiline=True)
-    n_problems = sum(len(p) for p in problems.values())
-    if any(p for p in problems.values()):
+    n_problems = sum(len(p) for p in analysis["problems"].values())
+    if any(p for p in analysis["problems"].values()):
         msg.divider(f"Problems ({n_problems})")
-        for name, problem in problems.items():
+        for name, problem in analysis["problems"].items():
             if problem:
                 msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
     else:
         msg.good("No problems found.")
-    if no_print:
-        return {"overview": overview, "problems": problems}
-
-
-def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
-    """Count how many subsequent components require an annotation set by each
-    component in the pipeline.
-
-    nlp (Language): The current nlp object.
-    RETURNS (List[int]): The interdependency counts.
-    """
-    pipe_assigns = []
-    pipe_requires = []
-    for name in nlp.pipe_names:
-        meta = nlp.get_pipe_meta(name)
-        pipe_assigns.append(set(meta.assigns))
-        pipe_requires.append(set(meta.requires))
-    counts = []
-    for i, assigns in enumerate(pipe_assigns):
-        count = 0
-        for requires in pipe_requires[i + 1 :]:
-            if assigns.intersection(requires):
-                count += 1
-        counts.append(count)
-    return counts
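For callers of the module-level helpers, `get_attr_info` folds the two removed single-purpose functions into one call. A sketch of the migration, assuming a pipeline where some component declares `doc.tensor` (here the built-in `tok2vec`, for illustration):

```python
import spacy
from spacy.pipe_analysis import get_attr_info

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")  # declares assigns=["doc.tensor"]

# Removed in this commit:
#   get_assigns_for_attr(nlp, "doc.tensor")
#   get_requires_for_attr(nlp, "doc.tensor")

# Replacement: one call covers both directions.
info = get_attr_info(nlp, "doc.tensor")
assert info["assigns"] == ["tok2vec"]
assert info["requires"] == []
```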

spacy/tests/pipeline/test_analysis.py

@@ -1,6 +1,5 @@
 from spacy.language import Language
-from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
-from spacy.pipe_analysis import validate_attrs, count_pipeline_interdependencies
+from spacy.pipe_analysis import get_attr_info, validate_attrs
 from mock import Mock
 import pytest

@@ -29,10 +28,10 @@ def test_component_decorator_assigns():
     nlp = Language()
     nlp.add_pipe("c1")
     nlp.add_pipe("c2")
-    problems = nlp.analyze_pipes(no_print=True)["problems"]
+    problems = nlp.analyze_pipes()["problems"]
     assert problems["c2"] == ["token.pos"]
     nlp.add_pipe("c3")
-    assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"]
+    assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"]
     nlp.add_pipe("c1", name="c4")
     test_component4_meta = nlp.get_pipe_meta("c1")
     assert test_component4_meta.factory == "c1"

@@ -40,8 +39,8 @@ def test_component_decorator_assigns():
     assert not Language.has_factory("c4")
     assert nlp.pipe_factories["c1"] == "c1"
     assert nlp.pipe_factories["c4"] == "c1"
-    assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"]
-    assert get_requires_for_attr(nlp, "token.pos") == ["c2"]
+    assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"]
+    assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"]
     assert nlp("hello world")

@@ -108,26 +107,8 @@ def test_analysis_validate_attrs_remove_pipe():
     nlp = Language()
     nlp.add_pipe("pipe_analysis_c6")
     nlp.add_pipe("pipe_analysis_c7")
-    problems = nlp.analyze_pipes(no_print=True)["problems"]
+    problems = nlp.analyze_pipes()["problems"]
     assert problems["pipe_analysis_c7"] == ["token.pos"]
     nlp.remove_pipe("pipe_analysis_c7")
-    problems = nlp.analyze_pipes(no_print=True)["problems"]
+    problems = nlp.analyze_pipes()["problems"]
     assert all(p == [] for p in problems.values())
-
-
-def test_pipe_interdependencies():
-    prefix = "test_pipe_interdependencies"
-
-    @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",))
-    def fancifier(doc):
-        return doc
-
-    @Language.component(f"{prefix}.needer", requires=("doc._.fancy",))
-    def needer(doc):
-        return doc
-
-    nlp = Language()
-    nlp.add_pipe(f"{prefix}.fancifier")
-    nlp.add_pipe(f"{prefix}.needer")
-    counts = count_pipeline_interdependencies(nlp)
-    assert counts == [1, 0]
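`count_pipeline_interdependencies` is removed without a direct replacement; if those counts are still needed, they can be recomputed from the analysis output. A sketch, under the assumption that `"assigns"` and `"requires"` are among the requested keys (the helper name here is hypothetical, not part of the commit):

```python
from typing import List


def count_interdependencies(analysis: dict) -> List[int]:
    """For each component, count how many later components require
    an attribute it assigns (mirrors the removed helper)."""
    summary = analysis["summary"]
    names = list(summary)  # insertion order matches pipeline order
    counts = []
    for i, name in enumerate(names):
        assigns = set(summary[name]["assigns"])
        later = names[i + 1 :]
        counts.append(sum(1 for n in later if assigns & set(summary[n]["requires"])))
    return counts


# e.g. counts = count_interdependencies(nlp.analyze_pipes())
```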

website/docs/api/language.md

@@ -98,10 +98,10 @@ decorator. For more details and examples, see the
 | ----------------------- | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |

@@ -146,10 +146,10 @@ examples, see the
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
 | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
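These meta fields are exactly what the analysis reads. A minimal sketch of declaring them on a custom component so `nlp.analyze_pipes()` can reason about it (toy component, not part of the commit):

```python
from spacy.language import Language


@Language.component(
    "toy_sentencizer",
    assigns=["token.is_sent_start", "doc.sents"],
)
def toy_sentencizer(doc):
    # Toy rule: treat the whole doc as one sentence
    for i, token in enumerate(doc):
        token.is_sent_start = i == 0
    return doc
```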
@@ -622,12 +622,45 @@ doesn't, the pipeline analysis won't catch that.
 > nlp = spacy.blank("en")
 > nlp.add_pipe("tagger")
 > nlp.add_pipe("entity_linker")
-> nlp.analyze_pipes()
+> analysis = nlp.analyze_pipes()
 > ```

 <Accordion title="Example output" spaced>

+```json
+### Structured
+{
+  "summary": {
+    "tagger": {
+      "assigns": ["token.tag"],
+      "requires": [],
+      "scores": ["tag_acc", "pos_acc", "lemma_acc"],
+      "retokenizes": false
+    },
+    "entity_linker": {
+      "assigns": ["token.ent_kb_id"],
+      "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
+      "scores": [],
+      "retokenizes": false
+    }
+  },
+  "problems": {
+    "tagger": [],
+    "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
+  },
+  "attrs": {
+    "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
+    "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
+    "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
+    "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
+    "token.tag": { "assigns": ["tagger"], "requires": [] },
+    "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
+  }
+}
+```
+
 ```
+### Pretty
+
 ============================= Pipeline Overview =============================

    #   Component       Assigns           Requires              Scores           Retokenizes

@@ -649,13 +682,12 @@ token.ent_iob, token.ent_type

 </Accordion>

-| Name           | Type        | Description |
-| -------------- | ----------- | ----------- |
-| _keyword-only_ |             |             |
-| `keys`         | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
-| `pretty`       | bool        | Pretty-print the results with colors and icons. Defaults to `True`. |
-| `no_print`     | bool        | Don't print anything and return a structured dict instead. Defaults to `False`. |
-| **RETURNS**    | dict        | Optional dict, if `no_print` is set to `True`. |
+| Name           | Type        | Description |
+| -------------- | ----------- | ----------- |
+| _keyword-only_ |             |             |
+| `keys`         | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
+| `pretty`       | bool        | Pretty-print the results as a table. Defaults to `False`. |
+| **RETURNS**    | dict        | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). |
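A short sketch of consuming the new return shape (field names as documented above; `nlp` is any pipeline with components added):

```python
analysis = nlp.analyze_pipes()

# Per-component unmet requirements
for name, unmet in analysis["problems"].items():
    if unmet:
        print(f"'{name}' is missing: {', '.join(unmet)}")

# Which pipes assign and which require a given attribute
info = analysis["attrs"].get("doc.sents", {"assigns": [], "requires": []})
print("assigned by:", info["assigns"], "- required by:", info["requires"])
```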
## Language.meta {#meta tag="property"}
@@ -892,8 +924,8 @@ instance and factory instance.
 | ----------------------- | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `factory` | str | The name of the registered component factory. |
 | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |

website/docs/usage/processing-pipelines.md

@@ -319,17 +319,61 @@ attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
 they retokenize the `Doc` and which scores they produce during training. It will
 also show warnings if components require values that aren't set by previous
 components, for instance if the entity linker is used but no component that
-runs before it sets named entities.
+runs before it sets named entities. Setting `pretty=True` will pretty-print a
+table instead of only returning the structured data.

 > #### ✏️ Things to try
 >
 > 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker.
>    The analysis should now show no problems, because requirements are met
>    (see the sketch after the example output below).

 ```python
 ### {executable="true"}
 import spacy

 nlp = spacy.blank("en")
 nlp.add_pipe("tagger")
-nlp.add_pipe("entity_linker")  # this is a problem, because it needs entities
-nlp.analyze_pipes()
+# This is a problem because it needs entities and sentence boundaries
+nlp.add_pipe("entity_linker")
+analysis = nlp.analyze_pipes(pretty=True)
 ```

 <Accordion title="Example output">

+```json
+### Structured
+{
+  "summary": {
+    "tagger": {
+      "assigns": ["token.tag"],
+      "requires": [],
+      "scores": ["tag_acc", "pos_acc", "lemma_acc"],
+      "retokenizes": false
+    },
+    "entity_linker": {
+      "assigns": ["token.ent_kb_id"],
+      "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
+      "scores": [],
+      "retokenizes": false
+    }
+  },
+  "problems": {
+    "tagger": [],
+    "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
+  },
+  "attrs": {
+    "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
+    "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
+    "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
+    "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
+    "token.tag": { "assigns": ["tagger"], "requires": [] },
+    "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
+  }
+}
+```
+
 ```
-### Example output
+### Pretty
+
 ============================= Pipeline Overview =============================

    #   Component       Assigns           Requires              Scores           Retokenizes

@@ -349,13 +393,7 @@ nlp.analyze_pipes()
 token.ent_iob, token.ent_type
 ```

-If you prefer a structured dictionary containing the component information and
-the problems, you can set `no_print=True`. This will return the data instead of
-printing it.
-
-```
-result = nlp.analyze_pipes(no_print=True)
-```
</Accordion>
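As the "Things to try" note above suggests, the reported problems go away once the required attributes are assigned earlier in the pipeline. A sketch, assuming spaCy v3's built-in factories:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("ner")          # assigns doc.ents, token.ent_iob, token.ent_type
nlp.add_pipe("sentencizer")  # assigns token.is_sent_start, doc.sents
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")

analysis = nlp.analyze_pipes(pretty=True)
assert all(not problems for problems in analysis["problems"].values())
```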
<Infobox variant="warning" title="Important note">