mirror of https://github.com/explosion/spaCy.git
Support list values and INTERSECTS in Matcher (#8784)
* Support list values and IS_INTERSECT in Matcher * Support list values as token attributes for set operators, not just as pattern values. * Add `IS_INTERSECT` operator. * Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs. * Rename IS_INTERSECT to INTERSECTS
This commit is contained in:
parent
fbbbda1954
commit
175847f92c
|
@ -845,7 +845,7 @@ class _RegexPredicate:
|
|||
|
||||
|
||||
class _SetPredicate:
|
||||
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
|
||||
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
||||
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||
self.i = i
|
||||
|
@ -868,14 +868,16 @@ class _SetPredicate:
|
|||
else:
|
||||
value = get_token_attr_for_matcher(token.c, self.attr)
|
||||
|
||||
if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
|
||||
if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
|
||||
if self.attr == MORPH:
|
||||
# break up MORPH into individual Feat=Val values
|
||||
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
|
||||
else:
|
||||
# IS_SUBSET for other attrs will be equivalent to "IN"
|
||||
# IS_SUPERSET will only match for other attrs with 0 or 1 values
|
||||
value = set([value])
|
||||
# treat a single value as a list
|
||||
if isinstance(value, (str, int)):
|
||||
value = set([get_string_id(value)])
|
||||
else:
|
||||
value = set(get_string_id(v) for v in value)
|
||||
if self.predicate == "IN":
|
||||
return value in self.value
|
||||
elif self.predicate == "NOT_IN":
|
||||
|
@ -884,6 +886,8 @@ class _SetPredicate:
|
|||
return value <= self.value
|
||||
elif self.predicate == "IS_SUPERSET":
|
||||
return value >= self.value
|
||||
elif self.predicate == "INTERSECTS":
|
||||
return bool(value & self.value)
|
||||
|
||||
def __repr__(self):
|
||||
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
|
||||
|
@ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
|||
"NOT_IN": _SetPredicate,
|
||||
"IS_SUBSET": _SetPredicate,
|
||||
"IS_SUPERSET": _SetPredicate,
|
||||
"INTERSECTS": _SetPredicate,
|
||||
"==": _ComparisonPredicate,
|
||||
"!=": _ComparisonPredicate,
|
||||
">=": _ComparisonPredicate,
|
||||
|
|
|
@ -159,6 +159,7 @@ class TokenPatternString(BaseModel):
|
|||
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
||||
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
||||
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
||||
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
@ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel):
|
|||
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||
IN: Optional[List[StrictInt]] = Field(None, alias="in")
|
||||
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
|
||||
ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
|
||||
ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
|
||||
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
|
||||
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
|
||||
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
|
||||
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
||||
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
||||
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
||||
|
|
|
@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab):
|
|||
doc[0].tag_ = "A"
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
# IS_SUBSET with a list value
|
||||
Token.set_extension("ext", default=[])
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = ["A"]
|
||||
doc[1]._.ext = ["C", "D"]
|
||||
assert len(matcher(doc)) == 2
|
||||
|
||||
|
||||
def test_matcher_superset_value_operator(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
|
@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab):
|
|||
doc[0].tag_ = "A"
|
||||
assert len(matcher(doc)) == 3
|
||||
|
||||
# IS_SUPERSET with a list value
|
||||
Token.set_extension("ext", default=[])
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = ["A", "B"]
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
|
||||
def test_matcher_intersect_value_operator(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc)) == 0
|
||||
doc[0].set_morph("Feat=Val")
|
||||
assert len(matcher(doc)) == 1
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||
assert len(matcher(doc)) == 1
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||
assert len(matcher(doc)) == 1
|
||||
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
# INTERSECTS with a single value is the same as IN
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0].tag_ = "A"
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
# INTERSECTS with an empty pattern list matches nothing
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"TAG": {"INTERSECTS": []}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0].tag_ = "A"
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
# INTERSECTS with a list value
|
||||
Token.set_extension("ext", default=[])
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = ["A", "B"]
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
# INTERSECTS with an empty pattern list matches nothing
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = ["A", "B"]
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
# INTERSECTS with an empty value matches nothing
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = []
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
|
||||
def test_matcher_morph_handling(en_vocab):
|
||||
# order of features in pattern doesn't matter
|
||||
|
|
|
@ -77,13 +77,14 @@ it compares to another value.
|
|||
> ]
|
||||
> ```
|
||||
|
||||
| Attribute | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
|
||||
| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||
| Attribute | Description |
|
||||
| -------------------------- | -------------------------------------------------------------------------------------------------------- |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
|
||||
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
|
||||
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||
|
||||
## Matcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
|
|
@ -240,13 +240,14 @@ following rich comparison attributes are available:
|
|||
> # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
|
||||
> ```
|
||||
|
||||
| Attribute | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
|
||||
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||
| Attribute | Description |
|
||||
| -------------------------- | --------------------------------------------------------------------------------------------------------- |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
|
||||
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
|
||||
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||
|
||||
#### Regular expressions {#regex new="2.1"}
|
||||
|
||||
|
|
Loading…
Reference in New Issue