* Fix tokenizer

Matthew Honnibal 2015-07-14 00:10:51 +02:00
parent 2ae0b439b2
commit 3b5baa660f
1 changed file with 2 additions and 6 deletions


@@ -110,16 +110,12 @@ cdef class Tokenizer:
         if cached == NULL:
             return False
         cdef int i
-        cdef int less_one = cached.length-1
         if cached.is_lex:
-            for i in range(less_one):
-                # There's a space at the end of the chunk.
+            for i in range(cached.length):
                 tokens.push_back(cached.data.lexemes[i], False)
-            tokens.push_back(cached.data.lexemes[less_one], False)
         else:
-            for i in range(less_one):
+            for i in range(cached.length):
                 tokens.push_back(&cached.data.tokens[i], False)
-            tokens.push_back(&cached.data.tokens[less_one], False)
         return True
 
     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
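
For context, a minimal pure-Python sketch of the cache-replay behaviour this hunk touches. The Cached, Doc, and push_back names below are simplified stand-ins for the Cython structs in spacy/tokenizer.pyx, not the real implementation. As far as this hunk shows, the old split loop (range(less_one) plus one extra push_back) and the new single loop push every cached entry with the same space flag, so the change removes the redundant special-casing:

# Simplified pure-Python model of replaying a cached analysis into a Doc.
# Cached, Doc, and push_back are illustrative stand-ins, not spaCy internals.
from dataclasses import dataclass, field

@dataclass
class Cached:
    lexemes: list            # cached tokens for one chunk of text

@dataclass
class Doc:
    tokens: list = field(default_factory=list)

    def push_back(self, lexeme, has_space):
        self.tokens.append((lexeme, has_space))

def replay_old(cached, doc):
    # Pre-fix shape: the last entry is special-cased, but still pushed with False.
    less_one = len(cached.lexemes) - 1
    for i in range(less_one):
        doc.push_back(cached.lexemes[i], False)
    doc.push_back(cached.lexemes[less_one], False)

def replay_new(cached, doc):
    # Post-fix shape: one loop over every cached entry.
    for i in range(len(cached.lexemes)):
        doc.push_back(cached.lexemes[i], False)

cached = Cached(lexemes=["do", "n't"])
old_doc, new_doc = Doc(), Doc()
replay_old(cached, old_doc)
replay_new(cached, new_doc)
assert old_doc.tokens == new_doc.tokens   # same output, simpler code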