From 175847f92cb89f2515f848c5eda8872af247954f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 2 Aug 2021 19:39:26 +0200
Subject: [PATCH] Support list values and INTERSECTS in Matcher (#8784)

* Support list values and IS_INTERSECT in Matcher

* Support list values as token attributes for set operators, not just as
pattern values.

* Add `IS_INTERSECT` operator.

* Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs.

* Rename IS_INTERSECT to INTERSECTS
---
 spacy/matcher/matcher.pyx                 | 15 +++--
 spacy/schemas.py                          |  6 +-
 spacy/tests/matcher/test_matcher_api.py   | 76 +++++++++++++++++++++++
 website/docs/api/matcher.md               | 15 ++---
 website/docs/usage/rule-based-matching.md | 15 ++---
 5 files changed, 106 insertions(+), 21 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 7b1cfb633..555766f62 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -845,7 +845,7 @@ class _RegexPredicate:
 
 
 class _SetPredicate:
-    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
@@ -868,14 +868,16 @@ class _SetPredicate:
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)
 
-        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
             if self.attr == MORPH:
                 # break up MORPH into individual Feat=Val values
                 value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
             else:
-                # IS_SUBSET for other attrs will be equivalent to "IN"
-                # IS_SUPERSET will only match for other attrs with 0 or 1 values
-                value = set([value])
+                # treat a single value as a list
+                if isinstance(value, (str, int)):
+                    value = set([get_string_id(value)])
+                else:
+                    value = set(get_string_id(v) for v in value)
         if self.predicate == "IN":
             return value in self.value
         elif self.predicate == "NOT_IN":
@@ -884,6 +886,8 @@ class _SetPredicate:
             return value <= self.value
         elif self.predicate == "IS_SUPERSET":
             return value >= self.value
+        elif self.predicate == "INTERSECTS":
+            return bool(value & self.value)
 
     def __repr__(self):
         return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
@@ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
         "NOT_IN": _SetPredicate,
         "IS_SUBSET": _SetPredicate,
         "IS_SUPERSET": _SetPredicate,
+        "INTERSECTS": _SetPredicate,
         "==": _ComparisonPredicate,
         "!=": _ComparisonPredicate,
         ">=": _ComparisonPredicate,
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 992e17d70..83623b104 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -159,6 +159,7 @@ class TokenPatternString(BaseModel):
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
+    INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
 
     class Config:
         extra = "forbid"
@@ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
-    ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
-    ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
+    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
+    INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
     EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
     NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
     GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index e0f655bbe..a42735eae 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab):
     doc[0].tag_ = "A"
     assert len(matcher(doc)) == 0
 
+    # IS_SUBSET with a list value
+    Token.set_extension("ext", default=[])
+    matcher = Matcher(en_vocab)
+    pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0]._.ext = ["A"]
+    doc[1]._.ext = ["C", "D"]
+    assert len(matcher(doc)) == 2
+
 
 def test_matcher_superset_value_operator(en_vocab):
     matcher = Matcher(en_vocab)
@@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab):
     doc[0].tag_ = "A"
     assert len(matcher(doc)) == 3
 
+    # IS_SUPERSET with a list value
+    Token.set_extension("ext", default=[])
+    matcher = Matcher(en_vocab)
+    pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0]._.ext = ["A", "B"]
+    assert len(matcher(doc)) == 1
+
+
+def test_matcher_intersect_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+    doc[0].set_morph("Feat=Val")
+    assert len(matcher(doc)) == 1
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
+    assert len(matcher(doc)) == 1
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
+    assert len(matcher(doc)) == 1
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
+    assert len(matcher(doc)) == 1
+
+    # INTERSECTS with a single value is the same as IN
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # INTERSECTS with an empty pattern list matches nothing (single-valued attr)
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"INTERSECTS": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+    # INTERSECTS with a list value (force=True: "ext" may already be registered)
+    Token.set_extension("ext", default=[], force=True)
+    matcher = Matcher(en_vocab)
+    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0]._.ext = ["A", "B"]
+    assert len(matcher(doc)) == 1
+
+    # INTERSECTS with an empty pattern list matches nothing (list-valued extension)
+    matcher = Matcher(en_vocab)
+    pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0]._.ext = ["A", "B"]
+    assert len(matcher(doc)) == 0
+
+    # INTERSECTS with an empty token value matches nothing
+    matcher = Matcher(en_vocab)
+    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0]._.ext = []
+    assert len(matcher(doc)) == 0
+
 
 def test_matcher_morph_handling(en_vocab):
     # order of features in pattern doesn't matter
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 9c15f8797..c34560dec 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -77,13 +77,14 @@ it compares to another value.
 > ]
 > ```
 
-| Attribute                  | Description                                                                                             |
-| -------------------------- | ------------------------------------------------------------------------------------------------------- |
-| `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
-| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
-| `ISSUBSET`                 | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
-| `ISSUPERSET`               | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
-| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
+| Attribute                  | Description                                                                                               |
+| -------------------------- | --------------------------------------------------------------------------------------------------------- |
+| `IN`                       | Attribute value is member of a list. ~~Any~~                                                              |
+| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                        |
+| `IS_SUBSET`                | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~                    |
+| `IS_SUPERSET`              | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~                  |
+| `INTERSECTS`               | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
+| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~   |
 
 ## Matcher.\_\_init\_\_ {#init tag="method"}
 
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index b718ef2b2..81c838584 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -240,13 +240,14 @@ following rich comparison attributes are available:
 > # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
 > ```
 
-| Attribute                  | Description                                                                                             |
-| -------------------------- | ------------------------------------------------------------------------------------------------------- |
-| `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
-| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
-| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
-| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
-| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
+| Attribute                  | Description                                                                                               |
+| -------------------------- | --------------------------------------------------------------------------------------------------------- |
+| `IN`                       | Attribute value is member of a list. ~~Any~~                                                              |
+| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                        |
+| `IS_SUBSET`                | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~                    |
+| `IS_SUPERSET`              | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~                  |
+| `INTERSECTS`               | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
+| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~   |
 
 #### Regular expressions {#regex new="2.1"}