Add more token attributes to token pattern schema (#4210)

Add token attributes with tests to token pattern schema.
This commit is contained in:
adrianeboyd 2019-08-29 12:02:26 +02:00 committed by Ines Montani
parent b91425f803
commit 5feb342f5e
2 changed files with 58 additions and 1 deletions

View File

@ -89,7 +89,10 @@ TOKEN_PATTERN_SCHEMA = {
"title": "Fine-grained part-of-speech tag",
"$ref": "#/definitions/string_value",
},
"DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
"DEP": {
"title": "Dependency label",
"$ref": "#/definitions/string_value"
},
"LEMMA": {
"title": "Lemma (base form)",
"$ref": "#/definitions/string_value",
@ -142,10 +145,34 @@ TOKEN_PATTERN_SCHEMA = {
"title": "Token is whitespace",
"$ref": "#/definitions/boolean_value",
},
"IS_BRACKET": {
"title": "Token is a bracket",
"$ref": "#/definitions/boolean_value",
},
"IS_QUOTE": {
"title": "Token is a quotation mark",
"$ref": "#/definitions/boolean_value",
},
"IS_LEFT_PUNCT": {
"title": "Token is a left punctuation mark",
"$ref": "#/definitions/boolean_value",
},
"IS_RIGHT_PUNCT": {
"title": "Token is a right punctuation mark",
"$ref": "#/definitions/boolean_value",
},
"IS_CURRENCY": {
"title": "Token is a currency symbol",
"$ref": "#/definitions/boolean_value",
},
"IS_STOP": {
"title": "Token is stop word",
"$ref": "#/definitions/boolean_value",
},
"IS_SENT_START": {
"title": "Token is the first in a sentence",
"$ref": "#/definitions/boolean_value",
},
"LIKE_NUM": {
"title": "Token resembles a number",
"$ref": "#/definitions/boolean_value",

View File

@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc1)
matcher(doc2)
matcher(doc3)
@pytest.mark.parametrize(
    "pattern,text",
    [
        # Each case pairs a one-token pattern with a text that the pattern
        # is expected to match exactly once.
        # NOTE(review): IS_CURRENCY and IS_SENT_START are not exercised by
        # any of the cases below.
        ([{"IS_ALPHA": True}], "a"),
        ([{"IS_ASCII": True}], "a"),
        ([{"IS_DIGIT": True}], "1"),
        ([{"IS_LOWER": True}], "a"),
        ([{"IS_UPPER": True}], "A"),
        ([{"IS_TITLE": True}], "Aaaa"),
        ([{"IS_PUNCT": True}], "."),
        ([{"IS_SPACE": True}], "\n"),
        ([{"IS_BRACKET": True}], "["),
        ([{"IS_QUOTE": True}], "\""),
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
    ],
)
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
    """Check that a single-attribute token pattern yields exactly one match.

    The text is split on single spaces to form the words of a ``Doc``; the
    pattern is registered under the key ``"Rule"`` and the matcher is run
    over that doc.
    """
    # Every parametrized text contains no space, so this yields one word.
    words = text.split(" ")
    doc = Doc(en_vocab, words=words)

    matcher = Matcher(en_vocab)
    matcher.add("Rule", None, pattern)
    # Exactly one rule must be registered after the add.
    assert len(matcher) == 1

    # The pattern must match once in the doc.
    assert len(matcher(doc)) == 1