mirror of https://github.com/explosion/spaCy.git
Update lemma and vector information after splitting a token (#4097)
* Fix vector and lemma attributes after retokenizer.split
* Fix unit test with a mock tensor
* Use xp (the array module returned by get_array_module) instead of numpy
This commit is contained in:
parent
a2ac2e873f
commit
963ea5e8d0
|
@ -0,0 +1,44 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.tokens import Doc
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def test_issue3540(en_vocab):
    """Regression test for issue #3540.

    Builds a six-token doc with a mock 6x2 tensor, splits the token
    "NewYork" into "New" and "York" via ``retokenizer.split``, and checks
    that:

    * token texts and lemmas reflect the split,
    * there is exactly one vector per token before and after the split,
    * the vectors of all untouched tokens are unchanged and shifted by one
      position after the split point.

    The vectors of the two new sub-tokens (positions 3 and 4 after the
    split) are intentionally not compared against anything here.
    """
    original_words = ["I", "live", "in", "NewYork", "right", "now"]
    mock_tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=original_words)
    doc.tensor = mock_tensor

    # State before the split: text and lemma both equal the raw words.
    expected_text_before = ["I", "live", "in", "NewYork", "right", "now"]
    assert [tok.text for tok in doc] == expected_text_before
    expected_lemma_before = ["I", "live", "in", "NewYork", "right", "now"]
    assert [tok.lemma_ for tok in doc] == expected_lemma_before

    # Snapshot the per-token vectors so they can be compared after the split.
    vectors_before = [tok.vector for tok in doc]
    assert len(vectors_before) == len(doc)

    with doc.retokenize() as retokenizer:
        split_heads = [(doc[3], 1), doc[2]]
        split_attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(
            doc[3], [u"New", u"York"], heads=split_heads, attrs=split_attrs
        )

    # State after the split: "NewYork" is replaced by "New" and "York".
    expected_text_after = ["I", "live", "in", "New", "York", "right", "now"]
    assert [tok.text for tok in doc] == expected_text_after
    expected_lemma_after = ["I", "live", "in", "New", "York", "right", "now"]
    assert [tok.lemma_ for tok in doc] == expected_lemma_after

    vectors_after = [tok.vector for tok in doc]
    assert len(vectors_after) == len(doc)

    # Tokens before the split keep their index; tokens after it move one
    # position to the right. Index 3 (the split token) is skipped.
    unchanged_pairs = [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]
    for idx_before, idx_after in unchanged_pairs:
        assert vectors_before[idx_before].tolist() == vectors_after[idx_after].tolist()
|
|
@ -404,14 +404,24 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
doc._realloc(doc.length * 2)
|
||||
# Move tokens after the split to create space for the new tokens
|
||||
doc.length = len(doc) + nb_subtokens -1
|
||||
to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
|
||||
if to_process_tensor:
|
||||
xp = get_array_module(doc.tensor)
|
||||
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
|
||||
for token_to_move in range(doc.length - 1, token_index, -1):
|
||||
doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
|
||||
if to_process_tensor:
|
||||
doc.tensor[token_to_move + nb_subtokens - 1] = doc.tensor[token_to_move]
|
||||
# Host the tokens in the newly created space
|
||||
cdef int idx_offset = 0
|
||||
for i, orth in enumerate(orths):
|
||||
token = &doc.c[token_index + i]
|
||||
lex = doc.vocab.get(doc.mem, orth)
|
||||
token.lex = lex
|
||||
token.lemma = 0 # reset lemma
|
||||
if to_process_tensor:
|
||||
# setting the tensors of the split tokens to array of zeros
|
||||
doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
|
||||
# Update the character offset of the subtokens
|
||||
if i != 0:
|
||||
token.idx = orig_token.idx + idx_offset
|
||||
|
|
Loading…
Reference in New Issue