mirror of https://github.com/explosion/spaCy.git
Fix token.idx for special cases with affixes (#6035)
This commit is c7bd631b5f (parent: 54c40223a1).
|
@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
|
|||
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
|
||||
doc = tokenizer(text)
|
||||
assert [token.text for token in doc] == ["_SPECIAL_", "."]
|
||||
|
||||
|
||||
def test_tokenizer_special_cases_idx(tokenizer):
    """token.idx must reflect true character offsets in the original text
    after a special case splits one input chunk into several tokens."""
    text = "the _ID'X_"
    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
    parsed = tokenizer(text)
    # "the " covers character offsets 0-3, so the first piece "_ID" starts at 4
    assert parsed[1].idx == 4
    # "_ID" is 3 characters long, so the second piece "'X_" starts at 4 + 3 = 7
    assert parsed[2].idx == 7
|
||||
|
|
|
@ -343,8 +343,9 @@ cdef class Tokenizer:
|
|||
for j in range(cached.length):
|
||||
tokens[i + offset + j] = cached.data.tokens[j]
|
||||
tokens[i + offset + j].idx = orig_idx + idx_offset
|
||||
idx_offset += cached.data.tokens[j].lex.length + \
|
||||
1 if cached.data.tokens[j].spacy else 0
|
||||
idx_offset += cached.data.tokens[j].lex.length
|
||||
if cached.data.tokens[j].spacy:
|
||||
idx_offset += 1
|
||||
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
|
||||
i += span_end - span_start
|
||||
offset += span[3]
|
||||
|
|
Loading…
Reference in New Issue