mirror of https://github.com/explosion/spaCy.git
Allow zero-width 'infix' token
This commit is contained in:
parent
01c2daf0c9
commit
dce8f5515e
|
@@ -289,21 +289,18 @@ cdef class Tokenizer:
|
||||||
infix_end = match.end()
|
infix_end = match.end()
|
||||||
if infix_start == start:
|
if infix_start == start:
|
||||||
continue
|
continue
|
||||||
if infix_start == infix_end:
|
|
||||||
msg = ("Tokenizer found a zero-width 'infix' token.\n"
|
|
||||||
"If you're using a built-in tokenizer, please\n"
|
|
||||||
"report this bug. If you're using a tokenizer\n"
|
|
||||||
"you developed, check your TOKENIZER_INFIXES\n"
|
|
||||||
"tuple.\n"
|
|
||||||
"String being matched: {string}\n"
|
|
||||||
"Language: {lang}")
|
|
||||||
raise ValueError(msg.format(string=string, lang=self.vocab.lang))
|
|
||||||
|
|
||||||
span = string[start:infix_start]
|
span = string[start:infix_start]
|
||||||
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
||||||
|
|
||||||
|
if infix_start != infix_end:
|
||||||
|
# If infix_start != infix_end, it means the infix
|
||||||
|
# token is non-empty. Empty infix tokens are useful
|
||||||
|
# for tokenization in some languages (see
|
||||||
|
# https://github.com/explosion/spaCy/issues/768)
|
||||||
infix_span = string[infix_start:infix_end]
|
infix_span = string[infix_start:infix_end]
|
||||||
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
|
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
|
||||||
|
|
||||||
start = infix_end
|
start = infix_end
|
||||||
span = string[start:]
|
span = string[start:]
|
||||||
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
||||||
|
|
Loading…
Reference in New Issue