From 5feb342f5e67603e4f1ea94f8eb8d010b251f5e9 Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Thu, 29 Aug 2019 12:02:26 +0200
Subject: [PATCH] Add more token attributes to token pattern schema (#4210)

Add token attributes with tests to token pattern schema.
---
 spacy/matcher/_schemas.py               | 29 +++++++++++++++++++++++-
 spacy/tests/matcher/test_matcher_api.py | 30 +++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py
index 4f4992781..3c2127c31 100644
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@@ -89,7 +89,10 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Fine-grained part-of-speech tag",
                 "$ref": "#/definitions/string_value",
             },
-            "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
+            "DEP": {
+                "title": "Dependency label",
+                "$ref": "#/definitions/string_value"
+            },
             "LEMMA": {
                 "title": "Lemma (base form)",
                 "$ref": "#/definitions/string_value",
@@ -142,10 +145,34 @@ TOKEN_PATTERN_SCHEMA = {
                 "title": "Token is whitespace",
                 "$ref": "#/definitions/boolean_value",
             },
+            "IS_BRACKET": {
+                "title": "Token is a bracket",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_QUOTE": {
+                "title": "Token is a quotation mark",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_LEFT_PUNCT": {
+                "title": "Token is a left punctuation mark",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_RIGHT_PUNCT": {
+                "title": "Token is a right punctuation mark",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_CURRENCY": {
+                "title": "Token is a currency symbol",
+                "$ref": "#/definitions/boolean_value",
+            },
             "IS_STOP": {
                 "title": "Token is stop word",
                 "$ref": "#/definitions/boolean_value",
             },
+            "IS_SENT_START": {
+                "title": "Token is the first in a sentence",
+                "$ref": "#/definitions/boolean_value",
+            },
             "LIKE_NUM": {
                 "title": "Token resembles a number",
                 "$ref": "#/definitions/boolean_value",
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 401b7c928..ccbc7c57e 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
     matcher(doc1)
     matcher(doc2)
     matcher(doc3)
+
+
+@pytest.mark.parametrize(
+    "pattern,text",  # each case: a one-token pattern using a single attribute, and a text expected to match it
+    [
+        ([{"IS_ALPHA": True}], "a"),
+        ([{"IS_ASCII": True}], "a"),
+        ([{"IS_DIGIT": True}], "1"),
+        ([{"IS_LOWER": True}], "a"),
+        ([{"IS_UPPER": True}], "A"),
+        ([{"IS_TITLE": True}], "Aaaa"),
+        ([{"IS_PUNCT": True}], "."),
+        ([{"IS_SPACE": True}], "\n"),  # "\n".split(' ') yields ["\n"], so this is still a single word
+        ([{"IS_BRACKET": True}], "["),
+        ([{"IS_QUOTE": True}], "\""),
+        ([{"IS_LEFT_PUNCT": True}], "``"),
+        ([{"IS_RIGHT_PUNCT": True}], "''"),
+        ([{"IS_STOP": True}], "the"),
+        ([{"LIKE_NUM": True}], "1"),
+        ([{"LIKE_URL": True}], "http://example.com"),
+        ([{"LIKE_EMAIL": True}], "mail@example.com"),
+    ],  # NOTE(review): no cases here for IS_CURRENCY or IS_SENT_START, which this patch also adds to the schema
+)
+def test_matcher_schema_token_attributes(en_vocab, pattern, text):  # checks each pattern is accepted and matches its text once
+    matcher = Matcher(en_vocab)
+    doc = Doc(en_vocab, words=text.split(' '))  # none of the sample texts contains a space, so one word is passed per case
+    matcher.add("Rule", None, pattern)  # None sits in the second positional slot; presumably "no on_match callback" (confirm against Matcher.add)
+    assert len(matcher) == 1  # only the single "Rule" entry was added above
+    matches = matcher(doc)
+    assert len(matches) == 1  # the one-token pattern is expected to match the doc exactly once