From 5feb342f5e67603e4f1ea94f8eb8d010b251f5e9 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 29 Aug 2019 12:02:26 +0200 Subject: [PATCH] Add more token attributes to token pattern schema (#4210) Add token attributes with tests to token pattern schema. --- spacy/matcher/_schemas.py | 29 +++++++++++++++++++++++- spacy/tests/matcher/test_matcher_api.py | 30 +++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 4f4992781..3c2127c31 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -89,7 +89,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Fine-grained part-of-speech tag", "$ref": "#/definitions/string_value", }, - "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, + "DEP": { + "title": "Dependency label", + "$ref": "#/definitions/string_value" + }, "LEMMA": { "title": "Lemma (base form)", "$ref": "#/definitions/string_value", @@ -142,10 +145,34 @@ TOKEN_PATTERN_SCHEMA = { "title": "Token is whitespace", "$ref": "#/definitions/boolean_value", }, + "IS_BRACKET": { + "title": "Token is a bracket", + "$ref": "#/definitions/boolean_value", + }, + "IS_QUOTE": { + "title": "Token is a quotation mark", + "$ref": "#/definitions/boolean_value", + }, + "IS_LEFT_PUNCT": { + "title": "Token is a left punctuation mark", + "$ref": "#/definitions/boolean_value", + }, + "IS_RIGHT_PUNCT": { + "title": "Token is a right punctuation mark", + "$ref": "#/definitions/boolean_value", + }, + "IS_CURRENCY": { + "title": "Token is a currency symbol", + "$ref": "#/definitions/boolean_value", + }, "IS_STOP": { "title": "Token is stop word", "$ref": "#/definitions/boolean_value", }, + "IS_SENT_START": { + "title": "Token is the first in a sentence", + "$ref": "#/definitions/boolean_value", + }, "LIKE_NUM": { "title": "Token resembles a number", "$ref": "#/definitions/boolean_value", diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 401b7c928..ccbc7c57e 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc1) matcher(doc2) matcher(doc3) + + +@pytest.mark.parametrize( + "pattern,text", + [ + ([{"IS_ALPHA": True}], "a"), + ([{"IS_ASCII": True}], "a"), + ([{"IS_DIGIT": True}], "1"), + ([{"IS_LOWER": True}], "a"), + ([{"IS_UPPER": True}], "A"), + ([{"IS_TITLE": True}], "Aaaa"), + ([{"IS_PUNCT": True}], "."), + ([{"IS_SPACE": True}], "\n"), + ([{"IS_BRACKET": True}], "["), + ([{"IS_QUOTE": True}], "\""), + ([{"IS_LEFT_PUNCT": True}], "``"), + ([{"IS_RIGHT_PUNCT": True}], "''"), + ([{"IS_STOP": True}], "the"), + ([{"LIKE_NUM": True}], "1"), + ([{"LIKE_URL": True}], "http://example.com"), + ([{"LIKE_EMAIL": True}], "mail@example.com"), + ], +) +def test_matcher_schema_token_attributes(en_vocab, pattern, text): + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=text.split(' ')) + matcher.add("Rule", None, pattern) + assert len(matcher) == 1 + matches = matcher(doc) + assert len(matches) == 1