diff --git a/spacy/schemas.py b/spacy/schemas.py index b3ea11d8b..cf58688ef 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,6 +222,8 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_id: Optional[StringValue] = None + ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None length: Optional[NumberValue] = None spacy: Optional[StrictBool] = None diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 4d21aea81..74feb7c5d 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -22,6 +22,8 @@ TEST_PATTERNS = [ ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), + ([{"ENT_ID": -1}], 1, 0), + ([{"ENT_KB_ID": -1}], 1, 0), # Good patterns ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), @@ -33,6 +35,8 @@ TEST_PATTERNS = [ ([{"orth": "foo"}], 0, 0), # prev: xfail ([{"IS_SENT_START": True}], 0, 0), ([{"SENT_START": True}], 0, 0), + ([{"ENT_ID": "STRING"}], 0, 0), + ([{"ENT_KB_ID": "STRING"}], 0, 0), ] diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c34560dec..803105ba2 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,8 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). 
~~str~~ | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |