mirror of https://github.com/explosion/spaCy.git
* Fix Issue #122: Incorrect calculation of children after Doc.merge()
This commit is contained in:
parent
454c1996d0
commit
a7e6c5ac8f
|
@@ -447,9 +447,8 @@ cdef class Doc:
|
||||||
|
|
||||||
cdef Span span = self[start:end]
|
cdef Span span = self[start:end]
|
||||||
# Get LexemeC for newly merged token
|
# Get LexemeC for newly merged token
|
||||||
new_orth = ''.join([t.string for t in span])
|
new_orth = ''.join([t.text_with_ws for t in span])
|
||||||
if span[-1].whitespace_:
|
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
||||||
new_orth = new_orth[:-1]
|
|
||||||
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
cdef TokenC* token = &self.data[start]
|
cdef TokenC* token = &self.data[start]
|
||||||
|
@@ -508,16 +507,26 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||||
cdef TokenC* head
|
cdef TokenC* head
|
||||||
cdef TokenC* child
|
cdef TokenC* child
|
||||||
cdef int i
|
cdef int i
|
||||||
|
# Set number of left/right children to 0. We'll increment it in the loops.
|
||||||
|
for i in range(length):
|
||||||
|
tokens[i].l_kids = 0
|
||||||
|
tokens[i].r_kids = 0
|
||||||
|
tokens[i].l_edge = i
|
||||||
|
tokens[i].r_edge = i
|
||||||
# Set left edges
|
# Set left edges
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
child = &tokens[i]
|
child = &tokens[i]
|
||||||
head = &tokens[i + child.head]
|
head = &tokens[i + child.head]
|
||||||
if child < head and child.l_edge < head.l_edge:
|
if child < head:
|
||||||
head.l_edge = child.l_edge
|
if child.l_edge < head.l_edge:
|
||||||
|
head.l_edge = child.l_edge
|
||||||
|
head.l_kids += 1
|
||||||
|
|
||||||
# Set right edges --- same as above, but iterate in reverse
|
# Set right edges --- same as above, but iterate in reverse
|
||||||
for i in range(length-1, -1, -1):
|
for i in range(length-1, -1, -1):
|
||||||
child = &tokens[i]
|
child = &tokens[i]
|
||||||
head = &tokens[i + child.head]
|
head = &tokens[i + child.head]
|
||||||
if child > head and child.r_edge > head.r_edge:
|
if child > head:
|
||||||
head.r_edge = child.r_edge
|
if child.r_edge > head.r_edge:
|
||||||
|
head.r_edge = child.r_edge
|
||||||
|
head.r_kids += 1
|
||||||
|
|
|
@@ -109,3 +109,30 @@ def test_set_ents(EN):
|
||||||
assert ent.label_ == 'PRODUCT'
|
assert ent.label_ == 'PRODUCT'
|
||||||
assert ent.start == 2
|
assert ent.start == 2
|
||||||
assert ent.end == 4
|
assert ent.end == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_merge(EN):
|
||||||
|
doc = EN('WKRO played songs by the beach boys all night')
|
||||||
|
|
||||||
|
assert len(doc) == 9
|
||||||
|
# merge 'The Beach Boys'
|
||||||
|
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
|
||||||
|
assert len(doc) == 7
|
||||||
|
|
||||||
|
assert doc[4].text == 'the beach boys'
|
||||||
|
assert doc[4].text_with_ws == 'the beach boys '
|
||||||
|
assert doc[4].tag_ == 'NAMED'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
|
def test_merge_children(EN):
|
||||||
|
"""Test that attachments work correctly after merging."""
|
||||||
|
doc = EN('WKRO played songs by the beach boys all night')
|
||||||
|
# merge 'The Beach Boys'
|
||||||
|
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
|
||||||
|
|
||||||
|
for word in doc:
|
||||||
|
if word.i < word.head.i:
|
||||||
|
assert word in list(word.head.lefts)
|
||||||
|
elif word.i > word.head.i:
|
||||||
|
assert word in list(word.head.rights)
|
||||||
|
|
Loading…
Reference in New Issue