From 69aca7d8391e0bbc551fe588e1f3b06f1d68a3f2 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 7 Aug 2019 00:40:53 +0200
Subject: [PATCH] Add validate option to EntityRuler (#4089)

* Add validate option to EntityRuler

* Add validate to EntityRuler, passed to Matcher and PhraseMatcher

* Add validate to usage and API docs

* Update website/docs/usage/rule-based-matching.md

Co-Authored-By: Ines Montani

* Update website/docs/usage/rule-based-matching.md

Co-Authored-By: Ines Montani
---
 spacy/pipeline/entityruler.py             | 10 ++++---
 spacy/tests/pipeline/test_entity_ruler.py | 19 ++++++++++++
 website/docs/api/entityruler.md           |  1 +
 website/docs/usage/rule-based-matching.md | 35 +++++++++++++++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 35b465ceb..23c8c91ba 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -26,7 +26,7 @@ class EntityRuler(object):
 
     name = "entity_ruler"
 
-    def __init__(self, nlp, phrase_matcher_attr=None, **cfg):
+    def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
         """Initialize the entitiy ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
         key. A pattern can either be a token pattern (list) or a phrase pattern
@@ -36,6 +36,8 @@ class EntityRuler(object):
             and process phrase patterns.
         phrase_matcher_attr (int / unicode): Token attribute to match on, passed
             to the internal PhraseMatcher as `attr`
+        validate (bool): Whether patterns should be validated, passed to
+            Matcher and PhraseMatcher as `validate`
         patterns (iterable): Optional patterns to load in.
         overwrite_ents (bool): If existing entities are present, e.g. entities
             added by the model, overwrite them by matches if necessary.
@@ -50,15 +52,15 @@ class EntityRuler(object):
         self.overwrite = cfg.get("overwrite_ents", False)
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
-        self.matcher = Matcher(nlp.vocab)
+        self.matcher = Matcher(nlp.vocab, validate=validate)
         if phrase_matcher_attr is not None:
             self.phrase_matcher_attr = phrase_matcher_attr
             self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr
+                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
             )
         else:
             self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab)
+            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
         self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         patterns = cfg.get("patterns")
         if patterns is not None:
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 5ab1a3af0..57e980ec3 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -5,6 +5,7 @@ import pytest
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
+from spacy.errors import MatchPatternError
 
 
 @pytest.fixture
@@ -127,3 +128,21 @@ def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
     assert new_ruler.phrase_matcher_attr == "LOWER"
+
+
+def test_entity_ruler_validate(nlp):
+    ruler = EntityRuler(nlp)
+    validated_ruler = EntityRuler(nlp, validate=True)
+
+    valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
+    invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
+
+    # invalid pattern is added without errors without validate
+    ruler.add_patterns([invalid_pattern])
+
+    # valid pattern is added without errors with validate
+    validated_ruler.add_patterns([valid_pattern])
+
+    # invalid pattern raises error with validate
+    with pytest.raises(MatchPatternError):
+        validated_ruler.add_patterns([invalid_pattern])
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 4424bd254..46dbb3d1d 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -35,6 +35,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `nlp`                 | `Language`    | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
 | `patterns`            | iterable      | Optional patterns to load in. |
 | `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). Defaults to `None`. |
+| `validate`            | bool          | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
 | `overwrite_ents`      | bool          | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
 | `**cfg`               | -             | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. |
 | **RETURNS**           | `EntityRuler` | The newly constructed object. |
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 16db191d1..80125d933 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -326,6 +326,31 @@ character, but no whitespace – so you'll know it will be handled as one token.
[{"ORTH": "User"}, {"ORTH": "name"}, {"ORTH": ":"}, {}] ``` +#### Validating and debugging patterns {#pattern-validation new="2.1"} + +The `Matcher` can validate patterns against a JSON schema with the option +`validate=True`. This is useful for debugging patterns during development, in +particular for catching unsupported attributes. + +```python +### {executable="true"} +import spacy +from spacy.matcher import Matcher + +nlp = spacy.load("en_core_web_sm") +matcher = Matcher(nlp.vocab, validate=True) +# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE +pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] +matcher.add("HelloWorld", None, pattern) + +# Raises an error: +# +# spacy.errors.MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' +# Pattern 0: +# - Additional properties are not allowed ('CASEINSENSITIVE' was unexpected) [2] + +``` + ### Adding on_match rules {#on_match} To move on to a more realistic example, let's say you're working with a large @@ -901,6 +926,16 @@ doc = nlp(u"MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` +#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation} + +The `EntityRuler` can validate patterns against a JSON schema with the option +`validate=True`. See details under [Validating and debugging +patterns](#pattern-validation). + +```python +ruler = EntityRuler(nlp, validate=True) +``` + ### Using pattern files {#entityruler-files} The [`to_disk`](/api/entityruler#to_disk) and