From 33b0f86de3a0c1f45ae4c2e7abdad4ac4ef316d1 Mon Sep 17 00:00:00 2001 From: Felix Sonntag Date: Sun, 19 Nov 2017 15:14:40 +0100 Subject: [PATCH] Changed tokenizer to add infix when infix_start is offset --- spacy/tokenizer.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 095fbf4ad..600c81fff 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -241,11 +241,10 @@ cdef class Tokenizer: for match in matches: infix_start = match.start() infix_end = match.end() - if infix_start == start: - continue - span = string[start:infix_start] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + if infix_start != start: + span = string[start:infix_start] + tokens.push_back(self.vocab.get(tokens.mem, span), False) if infix_start != infix_end: # If infix_start != infix_end, it means the infix