From 718f1c50fb8a2cd6fdc0376f8c6af1142c54a1e8 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 3 Nov 2017 21:11:20 +0100
Subject: [PATCH] Add regression test for #1491

---
 spacy/tests/regression/test_issue1491.py | 28 ++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1491.py

diff --git a/spacy/tests/regression/test_issue1491.py b/spacy/tests/regression/test_issue1491.py
new file mode 100644
index 000000000..ef8c639a6
--- /dev/null
+++ b/spacy/tests/regression/test_issue1491.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import regex as re
+
+from ...lang.en import English
+from ...tokenizer import Tokenizer
+
+
+@pytest.mark.xfail
+def test_issue1491():
+    """Test possible off-by-one error in tokenizer prefix/suffix/infix rules."""
+    prefix_re = re.compile(r'''[\[\("']''')
+    suffix_re = re.compile(r'''[\]\)"']''')
+    infix_re = re.compile(r'''[-~]''')
+
+    def my_tokenizer(nlp):
+        return Tokenizer(nlp.vocab, {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
+
+    nlp = English()
+    nlp.tokenizer = my_tokenizer(nlp)
+    doc = nlp("single quote 'goodbye end.")
+    tokens = [token.text for token in doc]
+    assert tokens == ['single', 'quote', "'", 'goodbye', 'end', '.']