Fix `token.spacy` when retokenizing special cases with affixes (#6475)

Preserve `token.spacy` corresponding to the span end token in the
original doc rather than adjusting for the current offset.

* If not modifying in place, the value is read from the original
document (`doc.c` rather than `tokens`).
* If modifying in place, the document has not been modified past the
current span start position, so the value at the current span end
position is still valid.

The round-trip invariant this preserves is sketched below.
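
The invariant at stake is that tokenization is non-destructive:
`Doc.text` must reproduce the input string exactly, which requires the
trailing-whitespace flag (`token.spacy`) of the span's final token to
survive special-case retokenization. A minimal sketch of that round
trip, mirroring the new regression test (assumes a spaCy install where
`English().tokenizer` is available, as in v3):

    from spacy.lang.en import English

    tokenizer = English().tokenizer
    tokenizer.rules = {}  # start from an empty special-case table

    # "''" as a special case triggers the in-place path (a pure merge)
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    doc = tokenizer("''a'' ")

    # the text must round-trip exactly, which only works if the final
    # token's trailing-space flag is read from the right position
    assert doc.text == "''a'' "
    assert doc[-1].spacy  # the last token is followed by a space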
Adriane Boyd, 2020-12-08 07:25:56 +01:00 (committed by GitHub)
commit 29b058ebdc (parent 4448680750)
2 changed files with 18 additions and 1 deletion

spacy/tests/tokenizer/test_tokenizer.py

@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English


 def test_tokenizer_handles_no_word(tokenizer):
@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]


+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])

spacy/tokenizer.pyx

@@ -338,7 +338,7 @@ cdef class Tokenizer:
         # Copy special case tokens into doc and adjust token and
         # character offsets
         idx_offset = 0
-        orig_final_spacy = doc.c[span_end + offset - 1].spacy
+        orig_final_spacy = doc.c[span_end - 1].spacy
         orig_idx = doc.c[i].idx
         for j in range(cached.length):
             tokens[i + offset + j] = cached.data.tokens[j]
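
Why dropping `offset` is correct: `offset` measures how far the rebuilt
token array (`tokens`) has drifted from the original positions as
earlier special cases in the same pass added or removed tokens, but
`span_end` is an index into the original document, and `doc.c` is never
modified at or beyond the current span start before this read. A toy
pure-Python model of the bookkeeping (hypothetical helper, not the
Cython internals):

    # a token is modeled as (orth, spacy); `spacy` marks a trailing space
    def apply_special_case(orig, out, span_start, span_end, offset, repl):
        """Replace orig[span_start:span_end] with `repl` in `out`, whose
        contents have already drifted by `offset` tokens."""
        # the fix: read the trailing-space flag at the ORIGINAL span end;
        # `orig` is untouched from span_start onward, so no offset applies.
        # `span_end + offset - 1` would hit the wrong (or an out-of-range)
        # slot once earlier special cases have grown the document.
        orig_final_spacy = orig[span_end - 1][1]
        new = [(orth, False) for orth in repl[:-1]]
        new.append((repl[-1], orig_final_spacy))
        out[span_start + offset:span_end + offset] = new
        return offset + len(new) - (span_end - span_start)

    orig = [("ab", True), ("ab", False)]  # the text "ab ab"
    out = list(orig)
    offset = apply_special_case(orig, out, 0, 1, 0, ["a", "b"])
    offset = apply_special_case(orig, out, 1, 2, offset, ["a", "b"])
    # out == [("a", False), ("b", True), ("a", False), ("b", False)]
    # i.e. the text still reads "ab ab": both spacy flags survived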