Add validate option to EntityRuler (#4089)

* Add validate option to EntityRuler

* Add validate to EntityRuler, passed to Matcher and PhraseMatcher

* Add validate to usage and API docs

* Update website/docs/usage/rule-based-matching.md

Co-Authored-By: Ines Montani <ines@ines.io>

* Update website/docs/usage/rule-based-matching.md

Co-Authored-By: Ines Montani <ines@ines.io>
This commit is contained in:
adrianeboyd 2019-08-07 00:40:53 +02:00 committed by Ines Montani
parent 4ae320e5c2
commit 69aca7d839
4 changed files with 61 additions and 4 deletions

View File

@ -26,7 +26,7 @@ class EntityRuler(object):
name = "entity_ruler"
def __init__(self, nlp, phrase_matcher_attr=None, **cfg):
def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
"""Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
key. A pattern can either be a token pattern (list) or a phrase pattern
@ -36,6 +36,8 @@ class EntityRuler(object):
and process phrase patterns.
phrase_matcher_attr (int / unicode): Token attribute to match on, passed
to the internal PhraseMatcher as `attr`
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`
patterns (iterable): Optional patterns to load in.
overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary.
@ -50,15 +52,15 @@ class EntityRuler(object):
self.overwrite = cfg.get("overwrite_ents", False)
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self.matcher = Matcher(nlp.vocab)
self.matcher = Matcher(nlp.vocab, validate=validate)
if phrase_matcher_attr is not None:
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
)
else:
self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab)
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
patterns = cfg.get("patterns")
if patterns is not None:

View File

@ -5,6 +5,7 @@ import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy.errors import MatchPatternError
@pytest.fixture
@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
assert len(new_ruler) == len(patterns)
assert len(new_ruler.labels) == 4
assert new_ruler.phrase_matcher_attr == "LOWER"
def test_entity_ruler_validate(nlp):
    """Check that `validate=True` makes the EntityRuler reject bad patterns.

    Two rulers are built from the same `nlp` fixture: one with default
    settings and one with `validate=True`. The same malformed token pattern
    (using the key "ASDF") is fed to both.
    """
    lenient_ruler = EntityRuler(nlp)
    strict_ruler = EntityRuler(nlp, validate=True)

    good_entry = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
    bad_entry = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}

    # Default ruler: the malformed pattern is accepted without complaint.
    lenient_ruler.add_patterns([bad_entry])

    # Validating ruler: a well-formed pattern is accepted as usual ...
    strict_ruler.add_patterns([good_entry])

    # ... whereas the malformed one must raise MatchPatternError.
    with pytest.raises(MatchPatternError):
        strict_ruler.add_patterns([bad_entry])

View File

@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
| `patterns` | iterable | Optional patterns to load in. |
| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None`. |
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
| **RETURNS** | `EntityRuler` | The newly constructed object. |

View File

@ -326,6 +326,31 @@ character, but no whitespace so you'll know it will be handled as one token.
[{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}]
```
#### Validating and debugging patterns {#pattern-validation new="2.1"}
The `Matcher` can validate patterns against a JSON schema with the option
`validate=True`. This is useful for debugging patterns during development, in
particular for catching unsupported attributes.
```python
### {executable="true"}
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True)
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
matcher.add("HelloWorld", None, pattern)
# Raises an error:
#
# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0:
# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2]
```
### Adding on_match rules {#on_match}
To move on to a more realistic example, let's say you're working with a large
@ -901,6 +926,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation}
The `EntityRuler` can validate patterns against a JSON schema with the option
`validate=True`. See details under [Validating and debugging
patterns](#pattern-validation).
```python
ruler = EntityRuler(nlp, validate=True)
```
### Using pattern files {#entityruler-files}
The [`to_disk`](/api/entityruler#to_disk) and