mirror of https://github.com/explosion/spaCy.git
Add validate option to EntityRuler (#4089)
* Add validate option to EntityRuler * Add validate to EntityRuler, passed to Matcher and PhraseMatcher * Add validate to usage and API docs * Update website/docs/usage/rule-based-matching.md Co-Authored-By: Ines Montani <ines@ines.io> * Update website/docs/usage/rule-based-matching.md Co-Authored-By: Ines Montani <ines@ines.io>
This commit is contained in:
parent
4ae320e5c2
commit
69aca7d839
|
@ -26,7 +26,7 @@ class EntityRuler(object):
|
|||
|
||||
name = "entity_ruler"
|
||||
|
||||
def __init__(self, nlp, phrase_matcher_attr=None, **cfg):
|
||||
def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
|
||||
"""Initialize the entitiy ruler. If patterns are supplied here, they
|
||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||
key. A pattern can either be a token pattern (list) or a phrase pattern
|
||||
|
@ -36,6 +36,8 @@ class EntityRuler(object):
|
|||
and process phrase patterns.
|
||||
phrase_matcher_attr (int / unicode): Token attribute to match on, passed
|
||||
to the internal PhraseMatcher as `attr`
|
||||
validate (bool): Whether patterns should be validated, passed to
|
||||
Matcher and PhraseMatcher as `validate`
|
||||
patterns (iterable): Optional patterns to load in.
|
||||
overwrite_ents (bool): If existing entities are present, e.g. entities
|
||||
added by the model, overwrite them by matches if necessary.
|
||||
|
@ -50,15 +52,15 @@ class EntityRuler(object):
|
|||
self.overwrite = cfg.get("overwrite_ents", False)
|
||||
self.token_patterns = defaultdict(list)
|
||||
self.phrase_patterns = defaultdict(list)
|
||||
self.matcher = Matcher(nlp.vocab)
|
||||
self.matcher = Matcher(nlp.vocab, validate=validate)
|
||||
if phrase_matcher_attr is not None:
|
||||
self.phrase_matcher_attr = phrase_matcher_attr
|
||||
self.phrase_matcher = PhraseMatcher(
|
||||
nlp.vocab, attr=self.phrase_matcher_attr
|
||||
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
|
||||
)
|
||||
else:
|
||||
self.phrase_matcher_attr = None
|
||||
self.phrase_matcher = PhraseMatcher(nlp.vocab)
|
||||
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
|
||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||
patterns = cfg.get("patterns")
|
||||
if patterns is not None:
|
||||
|
|
|
@ -5,6 +5,7 @@ import pytest
|
|||
from spacy.tokens import Span
|
||||
from spacy.language import Language
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.errors import MatchPatternError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
|
|||
assert len(new_ruler) == len(patterns)
|
||||
assert len(new_ruler.labels) == 4
|
||||
assert new_ruler.phrase_matcher_attr == "LOWER"
|
||||
|
||||
|
||||
def test_entity_ruler_validate(nlp):
    """Check how the `validate` flag affects `EntityRuler.add_patterns`.

    Two rulers are built from the same `nlp` fixture, one with default
    settings and one with `validate=True`. A pattern using the unsupported
    token key "ASDF" must be accepted by the default ruler and must raise
    `MatchPatternError` on the validating ruler, while a pattern using the
    "LOWER" key must be accepted by the validating ruler.
    """
    lenient_ruler = EntityRuler(nlp)
    strict_ruler = EntityRuler(nlp, validate=True)

    good_entry = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
    bad_entry = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}

    # Without validation, the pattern with the unknown key goes in silently.
    lenient_ruler.add_patterns([bad_entry])

    # With validation on, a well-formed pattern is still accepted.
    strict_ruler.add_patterns([good_entry])

    # With validation on, the pattern with the unknown key is rejected.
    with pytest.raises(MatchPatternError):
        strict_ruler.add_patterns([bad_entry])
|
||||
|
|
|
@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
|
|||
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
|
||||
| `patterns` | iterable | Optional patterns to load in. |
|
||||
| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None`. |
|
||||
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
|
||||
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
|
||||
| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
|
||||
| **RETURNS** | `EntityRuler` | The newly constructed object. |
|
||||
|
|
|
@ -326,6 +326,31 @@ character, but no whitespace – so you'll know it will be handled as one token.
|
|||
[{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}]
|
||||
```
|
||||
|
||||
#### Validating and debugging patterns {#pattern-validation new="2.1"}
|
||||
|
||||
The `Matcher` can validate patterns against a JSON schema with the option
|
||||
`validate=True`. This is useful for debugging patterns during development, in
|
||||
particular for catching unsupported attributes.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
matcher = Matcher(nlp.vocab, validate=True)
|
||||
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
|
||||
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
|
||||
matcher.add("HelloWorld", None, pattern)
|
||||
|
||||
# Raises an error:
|
||||
#
|
||||
# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
|
||||
# Pattern 0:
|
||||
# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2]
|
||||
|
||||
```
|
||||
|
||||
### Adding on_match rules {#on_match}
|
||||
|
||||
To move on to a more realistic example, let's say you're working with a large
|
||||
|
@ -901,6 +926,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.")
|
|||
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||
```
|
||||
|
||||
#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation}
|
||||
|
||||
The `EntityRuler` can validate patterns against a JSON schema with the option
|
||||
`validate=True`. See details under [Validating and debugging
|
||||
patterns](#pattern-validation).
|
||||
|
||||
```python
|
||||
ruler = EntityRuler(nlp, validate=True)
|
||||
```
|
||||
|
||||
### Using pattern files {#entityruler-files}
|
||||
|
||||
The [`to_disk`](/api/entityruler#to_disk) and
|
||||
|
|
Loading…
Reference in New Issue