mirror of https://github.com/explosion/spaCy.git
Add more token attributes to token pattern schema (#4210)
Add token attributes with tests to token pattern schema.
This commit is contained in:
parent
b91425f803
commit
5feb342f5e
|
@ -89,7 +89,10 @@ TOKEN_PATTERN_SCHEMA = {
|
|||
"title": "Fine-grained part-of-speech tag",
|
||||
"$ref": "#/definitions/string_value",
|
||||
},
|
||||
"DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
|
||||
"DEP": {
|
||||
"title": "Dependency label",
|
||||
"$ref": "#/definitions/string_value"
|
||||
},
|
||||
"LEMMA": {
|
||||
"title": "Lemma (base form)",
|
||||
"$ref": "#/definitions/string_value",
|
||||
|
@ -142,10 +145,34 @@ TOKEN_PATTERN_SCHEMA = {
|
|||
"title": "Token is whitespace",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_BRACKET": {
|
||||
"title": "Token is a bracket",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_QUOTE": {
|
||||
"title": "Token is a quotation mark",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_LEFT_PUNCT": {
|
||||
"title": "Token is a left punctuation mark",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_RIGHT_PUNCT": {
|
||||
"title": "Token is a right punctuation mark",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_CURRENCY": {
|
||||
"title": "Token is a currency symbol",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_STOP": {
|
||||
"title": "Token is stop word",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_SENT_START": {
|
||||
"title": "Token is the first in a sentence",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"LIKE_NUM": {
|
||||
"title": "Token resembles a number",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
|
|
|
@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
matcher(doc1)
|
||||
matcher(doc2)
|
||||
matcher(doc3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pattern,text",
    [
        ([{"IS_ALPHA": True}], "a"),
        ([{"IS_ASCII": True}], "a"),
        ([{"IS_DIGIT": True}], "1"),
        ([{"IS_LOWER": True}], "a"),
        ([{"IS_UPPER": True}], "A"),
        ([{"IS_TITLE": True}], "Aaaa"),
        ([{"IS_PUNCT": True}], "."),
        ([{"IS_SPACE": True}], "\n"),
        ([{"IS_BRACKET": True}], "["),
        ([{"IS_QUOTE": True}], "\""),
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
    ],
)
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
    """Each boolean token attribute accepted by the pattern schema should
    produce exactly one match on its example text."""
    # Split on a literal space so that whitespace-only examples (e.g. "\n")
    # are kept as a single token; a bare str.split() would discard them.
    words = text.split(' ')
    doc = Doc(en_vocab, words=words)
    matcher = Matcher(en_vocab)
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matched = matcher(doc)
    assert len(matched) == 1
|
||||
|
|
Loading…
Reference in New Issue