diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py
index 351394021..1b7cbaa7b 100644
--- a/spacy/tests/tokenizer/test_infix.py
+++ b/spacy/tests/tokenizer/test_infix.py
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
     tokens = en_tokenizer('best...known')
     assert len(tokens) == 3
 
 
+def test_big_ellipsis(en_tokenizer):
+    '''Test regression identified in Issue #360'''
+    tokens = en_tokenizer(u'$45...............Asking')
+    assert len(tokens) > 2
+
+
 def test_email(en_tokenizer):
     tokens = en_tokenizer('hello@example.com')
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 229e70793..0a2df1bcb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -227,6 +227,8 @@ cdef class Tokenizer:
             for match in matches:
                 infix_start = match.start()
                 infix_end = match.end()
+                if infix_start == start:
+                    continue
                 span = string[start:infix_start]
                 tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
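
Below is a minimal, pure-Python sketch of the patched infix loop, not spaCy's actual implementation: `INFIX_RE` and `split_on_infixes` are hypothetical stand-ins, and `r'\.\.\.'` is only an assumed ellipsis-like infix pattern (the real tokenizer iterates over a compiled infix regex and pushes lexemes via `vocab.get`). The point is the new `infix_start == start` guard: when consecutive infix matches sit flush against each other, as in the long dotted run from Issue #360, `string[start:infix_start]` is empty, and the guard skips that degenerate span instead of pushing it.

```python
# Hypothetical sketch of the patched infix-splitting loop (not spaCy's code).
import re

INFIX_RE = re.compile(r'\.\.\.')  # assumed stand-in for an ellipsis infix pattern

def split_on_infixes(string):
    tokens = []
    start = 0
    for match in INFIX_RE.finditer(string):
        infix_start = match.start()
        infix_end = match.end()
        if infix_start == start:
            # The fix: a match flush against `start` would otherwise
            # yield an empty string[start:infix_start] span.
            continue
        tokens.append(string[start:infix_start])      # text before the infix
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    tokens.append(string[start:])                     # remainder after the last infix
    return tokens

print(split_on_infixes(u'$45...............Asking'))
# ['$45', '...', '...', '...', '...', '...', 'Asking']
```

On the Issue #360 input the guard fires on every second match of the dotted run, and the result still has more than two tokens, which is exactly what the new `test_big_ellipsis` regression test asserts. Skipping the whole iteration (rather than pushing an empty span) is the same choice the patch makes in `tokenizer.pyx`.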