Add validate option to EntityRuler (#4089)

* Add validate option to EntityRuler
* Add validate to EntityRuler, passed to Matcher and PhraseMatcher
* Add validate to usage and API docs
* Update website/docs/usage/rule-based-matching.md
  Co-Authored-By: Ines Montani <ines@ines.io>
* Update website/docs/usage/rule-based-matching.md
  Co-Authored-By: Ines Montani <ines@ines.io>
2019-08-07 00:40:53 +02:00 · 2019-08-07 00:40:53 +02:00 · 69aca7d839
parent 4ae320e5c2
commit 69aca7d839
4 changed files with 61 additions and 4 deletions
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -26,7 +26,7 @@ class EntityRuler(object):

    name = "entity_ruler"

-    def __init__(self, nlp, phrase_matcher_attr=None, **cfg):
+    def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
        """Initialize the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
@ -36,6 +36,8 @@ class EntityRuler(object):
            and process phrase patterns.
        phrase_matcher_attr (int / unicode): Token attribute to match on, passed
            to the internal PhraseMatcher as `attr`
+        validate (bool): Whether patterns should be validated, passed to
+            Matcher and PhraseMatcher as `validate`
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
@ -50,15 +52,15 @@ class EntityRuler(object):
        self.overwrite = cfg.get("overwrite_ents", False)
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
-        self.matcher = Matcher(nlp.vocab)
+        self.matcher = Matcher(nlp.vocab, validate=validate)
        if phrase_matcher_attr is not None:
            self.phrase_matcher_attr = phrase_matcher_attr
            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr
+                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
            )
        else:
            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab)
+            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
        self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        patterns = cfg.get("patterns")
        if patterns is not None:
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@ -5,6 +5,7 @@ import pytest
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
+from spacy.errors import MatchPatternError


@pytest.fixture
@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
    assert len(new_ruler) == len(patterns)
    assert len(new_ruler.labels) == 4
    assert new_ruler.phrase_matcher_attr == "LOWER"
+
+
+def test_entity_ruler_validate(nlp):
+    ruler = EntityRuler(nlp)
+    validated_ruler = EntityRuler(nlp, validate=True)
+
+    valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
+    invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
+
+    # invalid pattern is added without errors without validate
+    ruler.add_patterns([invalid_pattern])
+
+    # valid pattern is added without errors with validate
+    validated_ruler.add_patterns([valid_pattern])
+
+    # invalid pattern raises error with validate
+    with pytest.raises(MatchPatternError):
+        validated_ruler.add_patterns([invalid_pattern])
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `nlp`                 | `Language`    | The shared nlp object to pass the vocab to the matchers and process phrase patterns.                                                                  |
 | `patterns`            | iterable      | Optional patterns to load in.                                                                                                                         |
 | `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None`.                                                      |
+| `validate`            | bool          | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`.                                         |
 | `overwrite_ents`      | bool          | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`.                      |
 | `**cfg`               | -             | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
 | **RETURNS**           | `EntityRuler` | The newly constructed object.                                                                                                                         |
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -326,6 +326,31 @@ character, but no whitespace – so you'll know it will be handled as one token.
 [{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}]
 ```

+#### Validating and debugging patterns {#pattern-validation new="2.1"}
+
+The `Matcher` can validate patterns against a JSON schema with the option
+`validate=True`. This is useful for debugging patterns during development, in
+particular for catching unsupported attributes.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.load("en_core_web_sm")
+matcher = Matcher(nlp.vocab, validate=True)
+# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
+pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
+matcher.add("HelloWorld", None, pattern)
+
+# Raises an error:
+#
+# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
+# Pattern 0:
+# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2]
+
+```
+
 ### Adding on_match rules {#on_match}

 To move on to a more realistic example, let's say you're working with a large
@ -901,6 +926,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```

+#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation}
+
+The `EntityRuler` can validate patterns against a JSON schema with the option
+`validate=True`. See details under [Validating and debugging
+patterns](#pattern-validation).
+
+```python
+ruler = EntityRuler(nlp, validate=True)
+```
+
 ### Using pattern files {#entityruler-files}

 The [`to_disk`](/api/entityruler#to_disk) and