mirror of https://github.com/explosion/spaCy.git
* Fix Issue #12: Incorrect token.idx calculations for some punctuation in the presence of the token cache
This commit is contained in:
parent
b38093237e
commit
0a7fcebdf7
|
@ -95,7 +95,6 @@ cdef class Tokenizer:
|
|||
return tokens
|
||||
|
||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
|
||||
#cached = <Cached*>self._specials.get(key)
|
||||
cached = <_Cached*>self._cache.get(key)
|
||||
if cached == NULL:
|
||||
return False
|
||||
|
@ -176,7 +175,12 @@ cdef class Tokenizer:
|
|||
if string.n != 0:
|
||||
cache_hit = self._try_cache(idx, string.key, tokens)
|
||||
if cache_hit:
|
||||
idx = tokens.data[tokens.length - 1].idx + 1
|
||||
# Get last idx
|
||||
idx = tokens.data[tokens.length - 1].idx
|
||||
# Increment by last length
|
||||
idx += tokens.data[tokens.length - 1].lex.length
|
||||
# Add 1 for space
|
||||
idx += 1
|
||||
else:
|
||||
split = self._find_infix(string.chars, string.n)
|
||||
if split == 0 or split == -1:
|
||||
|
|
Loading…
Reference in New Issue