* Fix Issue #12: Incorrect token.idx calculations for some punctuation, in the presence of token cache

Matthew Honnibal 2015-01-30 12:33:38 +11:00
parent b38093237e
commit 0a7fcebdf7
1 changed file with 6 additions and 2 deletions


@@ -95,7 +95,6 @@ cdef class Tokenizer:
         return tokens

     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        #cached = <Cached*>self._specials.get(key)
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -176,7 +175,12 @@ cdef class Tokenizer:
         if string.n != 0:
             cache_hit = self._try_cache(idx, string.key, tokens)
             if cache_hit:
-                idx = tokens.data[tokens.length - 1].idx + 1
+                # Get last idx
+                idx = tokens.data[tokens.length - 1].idx
+                # Increment by last length
+                idx += tokens.data[tokens.length - 1].lex.length
+                # Add 1 for space
+                idx += 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
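
The change above re-derives the next character offset from the last cached token's start position and length, rather than assuming that token was a single character wide. Below is a minimal Python sketch of that arithmetic, not part of the commit; the names next_idx_old, next_idx_new, last_idx, and last_length are illustrative only.

    def next_idx_old(last_idx):
        # Pre-fix behaviour: assumes the previous token was exactly one
        # character long, which breaks after multi-character punctuation.
        return last_idx + 1

    def next_idx_new(last_idx, last_length):
        # Post-fix behaviour: step past the whole previous token, then
        # add 1 for the separating space.
        return last_idx + last_length + 1

    # Example: a cached "..." token starting at character offset 10
    # spans offsets 10-12, so the next token begins at 14.
    assert next_idx_old(10) == 11        # lands inside the "..." token
    assert next_idx_new(10, 3) == 14     # points past the token and the space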