From 69aca7d8391e0bbc551fe588e1f3b06f1d68a3f2 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 7 Aug 2019 00:40:53 +0200
Subject: [PATCH] Add validate option to EntityRuler (#4089)

* Add validate option to EntityRuler

* Add validate to EntityRuler, passed to Matcher and PhraseMatcher

* Add validate to usage and API docs

* Update website/docs/usage/rule-based-matching.md

Co-Authored-By: Ines Montani

* Update website/docs/usage/rule-based-matching.md

Co-Authored-By: Ines Montani
---
 spacy/pipeline/entityruler.py             | 10 ++++---
 spacy/tests/pipeline/test_entity_ruler.py | 19 ++++++++++++
 website/docs/api/entityruler.md           |  1 +
 website/docs/usage/rule-based-matching.md | 35 +++++++++++++++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 35b465ceb..23c8c91ba 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -26,7 +26,7 @@ class EntityRuler(object):
 
     name = "entity_ruler"
 
-    def __init__(self, nlp, phrase_matcher_attr=None, **cfg):
+    def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
         """Initialize the entitiy ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
         key. A pattern can either be a token pattern (list) or a phrase pattern
@@ -36,6 +36,8 @@ class EntityRuler(object):
             and process phrase patterns.
         phrase_matcher_attr (int / unicode): Token attribute to match on, passed
             to the internal PhraseMatcher as `attr`
+        validate (bool): Whether patterns should be validated, passed to
+            Matcher and PhraseMatcher as `validate`
         patterns (iterable): Optional patterns to load in.
         overwrite_ents (bool): If existing entities are present, e.g. entities
             added by the model, overwrite them by matches if necessary.
@@ -50,15 +52,15 @@ class EntityRuler(object):
         self.overwrite = cfg.get("overwrite_ents", False)
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
-        self.matcher = Matcher(nlp.vocab)
+        self.matcher = Matcher(nlp.vocab, validate=validate)
         if phrase_matcher_attr is not None:
             self.phrase_matcher_attr = phrase_matcher_attr
             self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr
+                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
             )
         else:
             self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab)
+            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
         self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         patterns = cfg.get("patterns")
         if patterns is not None:
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 5ab1a3af0..57e980ec3 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -5,6 +5,7 @@ import pytest
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
+from spacy.errors import MatchPatternError
 
 
 @pytest.fixture
@@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
     assert new_ruler.phrase_matcher_attr == "LOWER"
+
+
+def test_entity_ruler_validate(nlp):
+    ruler = EntityRuler(nlp)
+    validated_ruler = EntityRuler(nlp, validate=True)
+
+    valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
+    invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
+
+    # invalid pattern is added without errors without validate
+    ruler.add_patterns([invalid_pattern])
+
+    # valid pattern is added without errors with validate
+    validated_ruler.add_patterns([valid_pattern])
+
+    # invalid pattern raises error with validate
+    with pytest.raises(MatchPatternError):
+        validated_ruler.add_patterns([invalid_pattern])
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 4424bd254..46dbb3d1d 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `nlp`                 | `Language`    | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
 | `patterns`            | iterable      | Optional patterns to load in. |
 | `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None`. |
+| `validate`            | bool          | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
 | `overwrite_ents`      | bool          | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
 | `**cfg`               | -             | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
 | **RETURNS**           | `EntityRuler` | The newly constructed object. |
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 16db191d1..80125d933 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -326,6 +326,31 @@ character, but no whitespace – so you'll know it will be handled as one token.
[{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}] ``` +#### Validating and debugging patterns {#pattern-validation new="2.1"} + +The `Matcher` can validate patterns against a JSON schema with the option +`validate=True`. This is useful for debugging patterns during development, in +particular for catching unsupported attributes. + +```python +### {executable="true"} +import spacy +from spacy.matcher import Matcher + +nlp = spacy.load("en_core_web_sm") +matcher = Matcher(nlp.vocab, validate=True) +# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE +pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] +matcher.add("HelloWorld", None, pattern) + +# Raises an error: +# +# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' +# Pattern 0: +# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2] + +``` + ### Adding on_match rules {#on_match} To move on to a more realistic example, let's say you're working with a large @@ -901,6 +926,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` +#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation} + +The `EntityRuler` can validate patterns against a JSON schema with the option +`validate=True`. See details under [Validating and debugging +patterns](#pattern-validation). + +```python +ruler = EntityRuler(nlp, validate=True) +``` + ### Using pattern files {#entityruler-files} The [`to_disk`](/api/entityruler#to_disk) and