diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 23c2d5c47..82032b2da 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English
 
 
 def test_tokenizer_handles_no_word(tokenizer):
@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]
 
 
+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 17714940d..8d8fac4fd 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -338,7 +338,7 @@ cdef class Tokenizer:
             # Copy special case tokens into doc and adjust token and
             # character offsets
             idx_offset = 0
-            orig_final_spacy = doc.c[span_end + offset - 1].spacy
+            orig_final_spacy = doc.c[span_end - 1].spacy
             orig_idx = doc.c[i].idx
             for j in range(cached.length):
                 tokens[i + offset + j] = cached.data.tokens[j]
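
Note (a reading of the change, not part of the committed patch): `span_end` indexes the matched span in the original `doc.c` array, while `offset` tracks how much the token count has already shifted from special cases applied earlier in the same pass (it is used when writing into the new buffer, e.g. `tokens[i + offset + j]`). Adding `offset` when reading the original trailing-whitespace flag therefore pointed past the span once a previous case had split or merged tokens, so the wrong `spacy` flag was restored and `doc.text` no longer reproduced the input. Dropping `offset` reads the flag from the correct original token.

A minimal sketch of the behaviour the new test locks in, using only the public API already exercised there (`English`, `tokenizer.rules`, `add_special_case`); the "ab" split rule is just the example from the test:

    from spacy.lang.en import English

    tokenizer = English().tokenizer
    tokenizer.rules = {}  # drop the built-in special cases for a clean setup

    # A special case that splits one token into two forces the "not in-place"
    # retokenization path; the trailing whitespace of each matched span must
    # still be preserved so that doc.text round-trips the input exactly.
    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
    text = "ab ab ab ''ab ab'' ab'' ''ab"
    assert tokenizer(text).text == text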