Fix `token.spacy` when retokenizing special cases with affixes (#6475)

Preserve `token.spacy` corresponding to the span end token in the
original doc rather than adjusting for the current offset.

* If not modifying in place, the value is read from the original
document (`doc.c` rather than `tokens`).
* If modifying in place, the document has not been modified past the
current span start position, so the value at the current span end
position is still valid.

The round-trip invariant this preserves is sketched below.
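
The invariant at stake is that tokenization is non-destructive:
`Doc.text` must reproduce the input string exactly, which requires the
trailing-whitespace flag (`token.spacy`) of the span's final token to
survive special-case retokenization. A minimal sketch of that round
trip, mirroring the new regression test (assumes a spaCy install where
`English().tokenizer` is available, as in v3):

    from spacy.lang.en import English

    tokenizer = English().tokenizer
    tokenizer.rules = {}  # start from an empty special-case table

    # "''" as a special case triggers the in-place path (a pure merge)
    tokenizer.add_special_case("''", [{"ORTH": "''"}])
    doc = tokenizer("''a'' ")

    # the text must round-trip exactly, which only works if the final
    # token's trailing-space flag is read from the right position
    assert doc.text == "''a'' "
    assert doc[-1].spacy  # the last token is followed by a space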
Adriane Boyd, 2020-12-08 07:25:56 +01:00 (committed by GitHub)
commit 29b058ebdc (parent 4448680750)
2 changed files with 18 additions and 1 deletion

spacy/tests/tokenizer/test_tokenizer.py

@@ -2,6 +2,7 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
+from spacy.lang.en import English


 def test_tokenizer_handles_no_word(tokenizer):
@@ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
     ]


+def test_tokenizer_special_cases_with_affixes_preserve_spacy():
+    tokenizer = English().tokenizer
+    # reset all special cases
+    tokenizer.rules = {}
+
+    # in-place modification (only merges)
+    text = "''a'' "
+    tokenizer.add_special_case("''", [{"ORTH": "''"}])
+    assert tokenizer(text).text == text
+
+    # not in-place (splits and merges)
+    tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}])
+    text = "ab ab ab ''ab ab'' ab'' ''ab"
+    assert tokenizer(text).text == text
+
+
 def test_tokenizer_special_cases_with_period(tokenizer):
     text = "_SPECIAL_."
     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])

spacy/tokenizer.pyx

@@ -338,7 +338,7 @@ cdef class Tokenizer:
         # Copy special case tokens into doc and adjust token and
         # character offsets
         idx_offset = 0
-        orig_final_spacy = doc.c[span_end + offset - 1].spacy
+        orig_final_spacy = doc.c[span_end - 1].spacy
         orig_idx = doc.c[i].idx
         for j in range(cached.length):
             tokens[i + offset + j] = cached.data.tokens[j]
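
Why dropping `offset` is correct: `offset` measures how far the rebuilt
token array (`tokens`) has drifted from the original positions as
earlier special cases in the same pass added or removed tokens, but
`span_end` is an index into the original document, and `doc.c` is never
modified at or beyond the current span start before this read. A toy
pure-Python model of the bookkeeping (hypothetical helper, not the
Cython internals):

    # a token is modeled as (orth, spacy); `spacy` marks a trailing space
    def apply_special_case(orig, out, span_start, span_end, offset, repl):
        """Replace orig[span_start:span_end] with `repl` in `out`, whose
        contents have already drifted by `offset` tokens."""
        # the fix: read the trailing-space flag at the ORIGINAL span end;
        # `orig` is untouched from span_start onward, so no offset applies.
        # `span_end + offset - 1` would hit the wrong (or an out-of-range)
        # slot once earlier special cases have grown the document.
        orig_final_spacy = orig[span_end - 1][1]
        new = [(orth, False) for orth in repl[:-1]]
        new.append((repl[-1], orig_final_spacy))
        out[span_start + offset:span_end + offset] = new
        return offset + len(new) - (span_end - span_start)

    orig = [("ab", True), ("ab", False)]  # the text "ab ab"
    out = list(orig)
    offset = apply_special_case(orig, out, 0, 1, 0, ["a", "b"])
    offset = apply_special_case(orig, out, 1, 2, offset, ["a", "b"])
    # out == [("a", False), ("b", True), ("a", False), ("b", False)]
    # i.e. the text still reads "ab ab": both spacy flags survived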