* Fix Issue #360: The tokenizer failed when the infix regex matched at the start of the string while splitting a token on multiple infixes.

Matthew Honnibal 2016-05-09 13:23:47 +02:00
parent eab2376547
commit cc8bf62208
2 changed files with 8 additions and 0 deletions


@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
     tokens = en_tokenizer('best...known')
     assert len(tokens) == 3
 
+
+def test_big_ellipsis(en_tokenizer):
+    '''Test regression identified in Issue #360'''
+    tokens = en_tokenizer(u'$45...............Asking')
+    assert len(tokens) > 2
+
 def test_email(en_tokenizer):
     tokens = en_tokenizer('hello@example.com')


@@ -227,6 +227,8 @@ cdef class Tokenizer:
                     for match in matches:
                         infix_start = match.start()
                         infix_end = match.end()
+                        if infix_start == start:
+                            continue
                         span = string[start:infix_start]
                         tokens.push_back(self.vocab.get(tokens.mem, span), False)
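
For context, a minimal sketch of the failure mode and the fix, in plain Python rather than the actual Cython tokenizer (the helper name and the infix pattern below are hypothetical stand-ins, not spaCy's real API): when an infix match begins exactly at the current offset, `string[start:match.start()]` is an empty span, which is what the guard added in this commit skips.

```python
import re

# Assumed stand-in for spaCy's infix rules; the real rules are more complex.
INFIX_RE = re.compile(r'\.\.+')

def split_infixes(string):
    """Simplified sketch of the infix-splitting loop patched here."""
    tokens = []
    start = 0
    for match in INFIX_RE.finditer(string):
        if match.start() == start:
            # The fix: skip matches flush against the current offset,
            # so an empty span is never pushed as a token.
            continue
        tokens.append(string[start:match.start()])  # text before the infix
        tokens.append(match.group())                # the infix itself
        start = match.end()
    if start < len(string):
        tokens.append(string[start:])               # trailing remainder
    return tokens

print(split_infixes(u'$45...............Asking'))
# ['$45', '...............', 'Asking'] -- more than 2 tokens, as the test asserts
```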