Allow zero-width 'infix' token

This commit is contained in:
Raphaël Bournhonesque 2017-01-23 18:28:01 +01:00
parent 01c2daf0c9
commit dce8f5515e
1 changed file with 9 additions and 12 deletions

View File

@@ -289,21 +289,18 @@ cdef class Tokenizer:
     infix_end = match.end()
     if infix_start == start:
         continue
-    if infix_start == infix_end:
-        msg = ("Tokenizer found a zero-width 'infix' token.\n"
-               "If you're using a built-in tokenizer, please\n"
-               "report this bug. If you're using a tokenizer\n"
-               "you developed, check your TOKENIZER_INFIXES\n"
-               "tuple.\n"
-               "String being matched: {string}\n"
-               "Language: {lang}")
-        raise ValueError(msg.format(string=string, lang=self.vocab.lang))
     span = string[start:infix_start]
     tokens.push_back(self.vocab.get(tokens.mem, span), False)
-    infix_span = string[infix_start:infix_end]
-    tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+    if infix_start != infix_end:
+        # If infix_start != infix_end, it means the infix
+        # token is non-empty. Empty infix tokens are useful
+        # for tokenization in some languages (see
+        # https://github.com/explosion/spaCy/issues/768)
+        infix_span = string[infix_start:infix_end]
+        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
     start = infix_end
 span = string[start:]
 tokens.push_back(self.vocab.get(tokens.mem, span), False)