Fix token.idx for special cases with affixes (#6035)

2020-09-13 14:05:36 +02:00 · 2020-09-13 14:05:36 +02:00 · c7bd631b5f
parent 54c40223a1
commit c7bd631b5f
2 changed files with 11 additions and 2 deletions
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
    tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
    doc = tokenizer(text)
    assert [token.text for token in doc] == ["_SPECIAL_", "."]
+
+
+def test_tokenizer_special_cases_idx(tokenizer):
+    text = "the _ID'X_"
+    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
+    doc = tokenizer(text)
+    assert doc[1].idx == 4
+    assert doc[2].idx == 7
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -343,8 +343,9 @@ cdef class Tokenizer:
                    for j in range(cached.length):
                        tokens[i + offset + j] = cached.data.tokens[j]
                        tokens[i + offset + j].idx = orig_idx + idx_offset
-                        idx_offset += cached.data.tokens[j].lex.length + \
-                                1 if cached.data.tokens[j].spacy else 0
+                        idx_offset += cached.data.tokens[j].lex.length
+                        if cached.data.tokens[j].spacy:
+                            idx_offset += 1
                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
                    i += span_end - span_start
                    offset += span[3]