diff --git a/spacy/tests/regression/test_issue1971.py b/spacy/tests/regression/test_issue1971.py
new file mode 100644
index 000000000..93bfc7410
--- /dev/null
+++ b/spacy/tests/regression/test_issue1971.py
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.matcher import Matcher
+from spacy.tokens import Token, Doc
+
+
+def test_issue1971(en_vocab):
+    # Possibly related to #2675 and #2671?
+    matcher = Matcher(en_vocab)
+    pattern = [
+        {"ORTH": "Doe"},
+        {"ORTH": "!", "OP": "?"},
+        {"_": {"optional": True}, "OP": "?"},
+        {"ORTH": "!", "OP": "?"},
+    ]
+    Token.set_extension("optional", default=False)
+    matcher.add("TEST", None, pattern)
+    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
+    # We could also assert length 1 here, but this is more conclusive, because
+    # the real problem here is that it returns a duplicate match for a match_id
+    # that's not actually in the vocab!
+    assert all(match_id in en_vocab.strings for match_id, start, end in matcher(doc))
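For context outside the patch itself: a minimal standalone sketch (assuming the spaCy 2.x Matcher API used in the test above) of the match_id / StringStore round-trip that the final assertion depends on. The key passed to matcher.add() is interned in the shared vocab, and every match_id the matcher returns should resolve back to that key.

    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    matcher = Matcher(vocab)
    # Adding a pattern interns the key "TEST" in vocab.strings.
    matcher.add("TEST", None, [{"ORTH": "Doe"}, {"ORTH": "!", "OP": "?"}])

    doc = Doc(vocab, words=["Hello", "John", "Doe", "!"])
    for match_id, start, end in matcher(doc):
        # match_id is the hash of the key "TEST"; the buggy behaviour reported
        # in #1971 produced an extra match whose id could not be resolved here.
        assert match_id in vocab.strings
        print(vocab.strings[match_id], doc[start:end].text)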