* Fix Issue #12: Incorrect token.idx calculations for some punctuation, in the presence of token cache

Matthew Honnibal 2015-01-30 12:33:38 +11:00
parent b38093237e
commit 0a7fcebdf7
1 changed file with 6 additions and 2 deletions


@@ -95,7 +95,6 @@ cdef class Tokenizer:
         return tokens

     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        #cached = <Cached*>self._specials.get(key)
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -176,7 +175,12 @@ cdef class Tokenizer:
         if string.n != 0:
             cache_hit = self._try_cache(idx, string.key, tokens)
             if cache_hit:
-                idx = tokens.data[tokens.length - 1].idx + 1
+                # Get last idx
+                idx = tokens.data[tokens.length - 1].idx
+                # Increment by last length
+                idx += tokens.data[tokens.length - 1].lex.length
+                # Add 1 for space
+                idx += 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
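
The change above re-derives the next character offset from the last cached token's start position and length, rather than assuming that token was a single character wide. Below is a minimal Python sketch of that arithmetic, not part of the commit; the names next_idx_old, next_idx_new, last_idx, and last_length are illustrative only.

    def next_idx_old(last_idx):
        # Pre-fix behaviour: assumes the previous token was exactly one
        # character long, which breaks after multi-character punctuation.
        return last_idx + 1

    def next_idx_new(last_idx, last_length):
        # Post-fix behaviour: step past the whole previous token, then
        # add 1 for the separating space.
        return last_idx + last_length + 1

    # Example: a cached "..." token starting at character offset 10
    # spans offsets 10-12, so the next token begins at 14.
    assert next_idx_old(10) == 11        # lands inside the "..." token
    assert next_idx_new(10, 3) == 14     # points past the token and the space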