From fdf8c77630b13758c1d49b335897084435e89e89 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 21 Jan 2021 09:59:17 +0100 Subject: [PATCH] support IS_SENT_START in PhraseMatcher (#6771) * support IS_SENT_START in PhraseMatcher * add unit test and friendlier error * use IDS.get instead --- spacy/matcher/phrasematcher.pyx | 5 ++++- spacy/tests/matcher/test_phrase_matcher.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 00c3357f5..c1883869e 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -8,6 +8,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings +from ..attrs import IDS from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token @@ -58,9 +59,11 @@ cdef class PhraseMatcher: attr = attr.upper() if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) - self.attr = self.vocab.strings[attr] + self.attr = IDS.get(attr) def __len__(self): """Get the number of match IDs added to the matcher. diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 60aa584ef..b523ee157 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -290,3 +290,8 @@ def test_phrase_matcher_pickle(en_vocab): # clunky way to vaguely check that callback is unpickled (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] assert isinstance(callbacks.get("TEST2"), Mock) + + +@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) +def test_phrase_matcher_sent_start(en_vocab, attr): + matcher = PhraseMatcher(en_vocab, attr=attr)