From 3b5baa660fa4176c4fd603de2f6ccd966973439e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 14 Jul 2015 00:10:51 +0200
Subject: [PATCH] * Fix tokenizer

---
 spacy/tokenizer.pyx | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index df10c35df..d174ca71a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -110,16 +110,12 @@ cdef class Tokenizer:
         if cached == NULL:
             return False
         cdef int i
-        cdef int less_one = cached.length-1
         if cached.is_lex:
-            for i in range(less_one):
-                # There's a space at the end of the chunk.
+            for i in range(cached.length):
                 tokens.push_back(cached.data.lexemes[i], False)
-            tokens.push_back(cached.data.lexemes[less_one], False)
         else:
-            for i in range(less_one):
+            for i in range(cached.length):
                 tokens.push_back(&cached.data.tokens[i], False)
-            tokens.push_back(&cached.data.tokens[less_one], False)
         return True
 
     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
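
Note: for readers outside the Cython source, here is a minimal pure-Python
sketch of the cache-lookup behaviour after this patch. The names (Cached,
Doc.push_back, try_cache) only loosely mirror the C-level structs and are
illustrative assumptions, not spaCy's API. The point of the fix: every
cached entry is now pushed back by one uniform loop, instead of iterating
to length - 1 and pushing the final entry in a separate, special-cased call.

    from dataclasses import dataclass, field

    @dataclass
    class Cached:
        """Loose stand-in for the C-level _Cached struct."""
        entries: list  # pre-tokenized pieces for one chunk of text

    @dataclass
    class Doc:
        """Loose stand-in: collects (token, trailing_space) pairs."""
        tokens: list = field(default_factory=list)

        def push_back(self, entry, has_space):
            self.tokens.append((entry, has_space))

    def try_cache(cache, key, doc):
        # After the patch: one uniform loop over all cached entries.
        # The old code looped to length - 1 and pushed the last entry
        # in a separate call with the same arguments, which this
        # commit removes.
        cached = cache.get(key)
        if cached is None:
            return False
        for entry in cached.entries:
            doc.push_back(entry, False)  # False: no trailing space recorded
        return True

    # Usage sketch:
    cache = {"don't": Cached(entries=["do", "n't"])}
    doc = Doc()
    assert try_cache(cache, "don't", doc)
    assert doc.tokens == [("do", False), ("n't", False)]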