Add more token attributes to token pattern schema (#4210)

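Add further boolean token attributes (IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_SENT_START) to the token pattern schema, together with matcher tests for boolean token attributes.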
2019-08-29 12:02:26 +02:00 · 2019-08-29 12:02:26 +02:00 · 5feb342f5e
parent b91425f803
commit 5feb342f5e
2 changed files with 58 additions and 1 deletion
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@@ -89,7 +89,10 @@ TOKEN_PATTERN_SCHEMA = {
                "title": "Fine-grained part-of-speech tag",
                "$ref": "#/definitions/string_value",
            },
-            "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
+            "DEP": {
+                "title": "Dependency label",
+                "$ref": "#/definitions/string_value"
+            },
            "LEMMA": {
                "title": "Lemma (base form)",
                "$ref": "#/definitions/string_value",
@@ -142,10 +145,34 @@ TOKEN_PATTERN_SCHEMA = {
                "title": "Token is whitespace",
                "$ref": "#/definitions/boolean_value",
            },
+            "IS_BRACKET": {
+                "title": "Token is a bracket",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_QUOTE": {
+                "title": "Token is a quotation mark",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_LEFT_PUNCT": {
+                "title": "Token is a left punctuation mark",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_RIGHT_PUNCT": {
+                "title": "Token is a right punctuation mark",
+                "$ref": "#/definitions/boolean_value",
+            },
+            "IS_CURRENCY": {
+                "title": "Token is a currency symbol",
+                "$ref": "#/definitions/boolean_value",
+            },
            "IS_STOP": {
                "title": "Token is stop word",
                "$ref": "#/definitions/boolean_value",
            },
+            "IS_SENT_START": {
+                "title": "Token is the first in a sentence",
+                "$ref": "#/definitions/boolean_value",
+            },
            "LIKE_NUM": {
                "title": "Token resembles a number",
                "$ref": "#/definitions/boolean_value",
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)
+
+
+@pytest.mark.parametrize(  # one case per boolean token attribute: (single-token pattern, sample text)
+    "pattern,text",
+    [  # NOTE(review): IS_CURRENCY and IS_SENT_START, also added to the schema in this change, have no case in this list
+        ([{"IS_ALPHA": True}], "a"),
+        ([{"IS_ASCII": True}], "a"),
+        ([{"IS_DIGIT": True}], "1"),
+        ([{"IS_LOWER": True}], "a"),
+        ([{"IS_UPPER": True}], "A"),
+        ([{"IS_TITLE": True}], "Aaaa"),
+        ([{"IS_PUNCT": True}], "."),
+        ([{"IS_SPACE": True}], "\n"),  # splitting on ' ' keeps "\n" as a single word
+        ([{"IS_BRACKET": True}], "["),
+        ([{"IS_QUOTE": True}], "\""),
+        ([{"IS_LEFT_PUNCT": True}], "``"),
+        ([{"IS_RIGHT_PUNCT": True}], "''"),
+        ([{"IS_STOP": True}], "the"),
+        ([{"LIKE_NUM": True}], "1"),
+        ([{"LIKE_URL": True}], "http://example.com"),
+        ([{"LIKE_EMAIL": True}], "mail@example.com"),
+    ],
+)
+def test_matcher_schema_token_attributes(en_vocab, pattern, text):  # each case must register one rule and yield one match
+    matcher = Matcher(en_vocab)  # NOTE(review): built with the vocab only -- confirm the pattern schema is actually exercised
+    doc = Doc(en_vocab, words=text.split(' '))  # no sample text contains a space, so this is a one-word Doc
+    matcher.add("Rule", None, pattern)
+    assert len(matcher) == 1  # exactly one rule after the single add() call
+    matches = matcher(doc)
+    assert len(matches) == 1  # exactly one match returned for the one-word Doc