mirror of https://github.com/explosion/spaCy.git
* Fix Issue #12: Incorrect token.idx calculation for some punctuation when the tokenizer gets a token-cache hit
This commit is contained in:
parent
b38093237e
commit
0a7fcebdf7
|
@ -95,7 +95,6 @@ cdef class Tokenizer:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
|
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
|
||||||
#cached = <Cached*>self._specials.get(key)
|
|
||||||
cached = <_Cached*>self._cache.get(key)
|
cached = <_Cached*>self._cache.get(key)
|
||||||
if cached == NULL:
|
if cached == NULL:
|
||||||
return False
|
return False
|
||||||
|
@ -176,7 +175,12 @@ cdef class Tokenizer:
|
||||||
if string.n != 0:
|
if string.n != 0:
|
||||||
cache_hit = self._try_cache(idx, string.key, tokens)
|
cache_hit = self._try_cache(idx, string.key, tokens)
|
||||||
if cache_hit:
|
if cache_hit:
|
||||||
idx = tokens.data[tokens.length - 1].idx + 1
|
# Get last idx
|
||||||
|
idx = tokens.data[tokens.length - 1].idx
|
||||||
|
# Increment by last length
|
||||||
|
idx += tokens.data[tokens.length - 1].lex.length
|
||||||
|
# Add 1 for space
|
||||||
|
idx += 1
|
||||||
else:
|
else:
|
||||||
split = self._find_infix(string.chars, string.n)
|
split = self._find_infix(string.chars, string.n)
|
||||||
if split == 0 or split == -1:
|
if split == 0 or split == -1:
|
||||||
|
|
Loading…
Reference in New Issue