From dce8f5515ee4232af8161abcb9b06ebc5526d9cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?=
Date: Mon, 23 Jan 2017 18:28:01 +0100
Subject: [PATCH] Allow zero-width 'infix' token

---
 spacy/tokenizer.pyx | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 0e83c4a75..8f2f111e7 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -289,21 +289,18 @@ cdef class Tokenizer:
                         infix_end = match.end()
                         if infix_start == start:
                             continue
 
-                        if infix_start == infix_end:
-                            msg = ("Tokenizer found a zero-width 'infix' token.\n"
-                                   "If you're using a built-in tokenizer, please\n"
-                                   "report this bug. If you're using a tokenizer\n"
-                                   "you developed, check your TOKENIZER_INFIXES\n"
-                                   "tuple.\n"
-                                   "String being matched: {string}\n"
-                                   "Language: {lang}")
-                            raise ValueError(msg.format(string=string, lang=self.vocab.lang))
                         span = string[start:infix_start]
                         tokens.push_back(self.vocab.get(tokens.mem, span), False)
-
-                        infix_span = string[infix_start:infix_end]
-                        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+
+                        if infix_start != infix_end:
+                            # If infix_start != infix_end, it means the infix
+                            # token is non-empty. Empty infix tokens are useful
+                            # for tokenization in some languages (see
+                            # https://github.com/explosion/spaCy/issues/768)
+                            infix_span = string[infix_start:infix_end]
+                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+
                         start = infix_end
                     span = string[start:]
                     tokens.push_back(self.vocab.get(tokens.mem, span), False)
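
Note: to illustrate why the zero-width guard is removed, here is a minimal pure-Python sketch of the patched loop. The camelCase-splitting lookahead pattern, the sample string, and the `pieces` list are hypothetical, not part of the patch; a zero-width infix match like this one would previously have raised the ValueError deleted above, and now simply marks a split point.

import re

# Hypothetical infix pattern: a zero-width lookbehind/lookahead that
# matches *between* a lowercase and an uppercase letter, so that
# match.start() == match.end().
infix_re = re.compile(r"(?<=[a-z])(?=[A-Z])")

string = "dynoMite"
start = 0
pieces = []
for match in infix_re.finditer(string):
    infix_start, infix_end = match.start(), match.end()
    if infix_start == start:
        continue
    # The text before the infix becomes a token.
    pieces.append(string[start:infix_start])
    if infix_start != infix_end:
        # Only a non-empty infix becomes a token of its own;
        # a zero-width match just splits without consuming characters.
        pieces.append(string[infix_start:infix_end])
    start = infix_end
pieces.append(string[start:])
print(pieces)  # ['dyno', 'Mite']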