From 175847f92cb89f2515f848c5eda8872af247954f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 2 Aug 2021 19:39:26 +0200 Subject: [PATCH] Support list values and INTERSECTS in Matcher (#8784) * Support list values and IS_INTERSECT in Matcher * Support list values as token attributes for set operators, not just as pattern values. * Add `IS_INTERSECT` operator. * Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs. * Rename IS_INTERSECT to INTERSECTS --- spacy/matcher/matcher.pyx | 15 +++-- spacy/schemas.py | 6 +- spacy/tests/matcher/test_matcher_api.py | 76 +++++++++++++++++++++++ website/docs/api/matcher.md | 15 ++--- website/docs/usage/rule-based-matching.md | 15 ++--- 5 files changed, 106 insertions(+), 21 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 7b1cfb633..555766f62 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -845,7 +845,7 @@ class _RegexPredicate: class _SetPredicate: - operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET") + operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): self.i = i @@ -868,14 +868,16 @@ class _SetPredicate: else: value = get_token_attr_for_matcher(token.c, self.attr) - if self.predicate in ("IS_SUBSET", "IS_SUPERSET"): + if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"): if self.attr == MORPH: # break up MORPH into individual Feat=Val values value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value)) else: - # IS_SUBSET for other attrs will be equivalent to "IN" - # IS_SUPERSET will only match for other attrs with 0 or 1 values - value = set([value]) + # treat a single value as a list + if isinstance(value, (str, int)): + value = set([get_string_id(value)]) + else: + value = set(get_string_id(v) for v in value) if self.predicate == "IN": return value in self.value elif self.predicate == "NOT_IN": @@ -884,6 +886,8 @@ 
class _SetPredicate: return value <= self.value elif self.predicate == "IS_SUPERSET": return value >= self.value + elif self.predicate == "INTERSECTS": + return bool(value & self.value) def __repr__(self): return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate)) @@ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): "NOT_IN": _SetPredicate, "IS_SUBSET": _SetPredicate, "IS_SUPERSET": _SetPredicate, + "INTERSECTS": _SetPredicate, "==": _ComparisonPredicate, "!=": _ComparisonPredicate, ">=": _ComparisonPredicate, diff --git a/spacy/schemas.py b/spacy/schemas.py index 992e17d70..83623b104 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -159,6 +159,7 @@ class TokenPatternString(BaseModel): NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") + INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") class Config: extra = "forbid" @@ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") - ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset") - ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset") + IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset") + IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset") + INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects") EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=") GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e0f655bbe..a42735eae 100644 --- 
a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab): doc[0].tag_ = "A" assert len(matcher(doc)) == 0 + # IS_SUBSET with a list value + Token.set_extension("ext", default=[]) + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A"] + doc[1]._.ext = ["C", "D"] + assert len(matcher(doc)) == 2 + def test_matcher_superset_value_operator(en_vocab): matcher = Matcher(en_vocab) @@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab): doc[0].tag_ = "A" assert len(matcher(doc)) == 3 + # IS_SUPERSET with a list value + Token.set_extension("ext", default=[]) + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A", "B"] + assert len(matcher(doc)) == 1 + + +def test_matcher_intersect_value_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc)) == 0 + doc[0].set_morph("Feat=Val") + assert len(matcher(doc)) == 1 + doc[0].set_morph("Feat=Val|Feat2=Val2") + assert len(matcher(doc)) == 1 + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") + assert len(matcher(doc)) == 1 + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") + assert len(matcher(doc)) == 1 + + # INTERSECTS with a single value is the same as IN + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 1 + + # INTERSECTS with an empty pattern list matches nothing + matcher = Matcher(en_vocab) + pattern = [{"TAG": 
{"INTERSECTS": []}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 0 + + # INTERSECTS with a list value + Token.set_extension("ext", default=[]) + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A", "B"] + assert len(matcher(doc)) == 1 + + # INTERSECTS with an empty pattern list matches nothing + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"INTERSECTS": []}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = ["A", "B"] + assert len(matcher(doc)) == 0 + + # INTERSECTS with an empty value matches nothing + matcher = Matcher(en_vocab) + pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0]._.ext = [] + assert len(matcher(doc)) == 0 + def test_matcher_morph_handling(en_vocab): # order of features in pattern doesn't matter diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 9c15f8797..c34560dec 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -77,13 +77,14 @@ it compares to another value. > ] > ``` -| Attribute | Description | -| -------------------------- | ------------------------------------------------------------------------------------------------------- | -| `IN` | Attribute value is member of a list. ~~Any~~ | -| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | -| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ | -| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ | -| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. 
~~Union[int, float]~~ | +| Attribute | Description | +| -------------------------- | -------------------------------------------------------------------------------------------------------- | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ | +| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | ## Matcher.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index b718ef2b2..81c838584 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -240,13 +240,14 @@ following rich comparison attributes are available: > # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset > ``` -| Attribute | Description | -| -------------------------- | ------------------------------------------------------------------------------------------------------- | -| `IN` | Attribute value is member of a list. ~~Any~~ | -| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | -| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ | -| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ | -| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. 
~~Union[int, float]~~ | +| Attribute | Description | +| -------------------------- | --------------------------------------------------------------------------------------------------------- | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ | +| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | #### Regular expressions {#regex new="2.1"}