* Fix Issue #122: Incorrect calculation of children after Doc.merge()

This commit is contained in:
Matthew Honnibal 2015-10-18 17:17:27 +11:00
parent 454c1996d0
commit a7e6c5ac8f
2 changed files with 44 additions and 8 deletions

View File

@ -447,9 +447,8 @@ cdef class Doc:
cdef Span span = self[start:end]
# Get LexemeC for newly merged token
new_orth = ''.join([t.string for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-1]
new_orth = ''.join([t.text_with_ws for t in span])
new_orth = new_orth[:-len(span[-1].whitespace_)]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &self.data[start]
@ -508,16 +507,26 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head
cdef TokenC* child
cdef int i
# Set number of left/right children to 0. We'll increment it in the loops.
for i in range(length):
tokens[i].l_kids = 0
tokens[i].r_kids = 0
tokens[i].l_edge = i
tokens[i].r_edge = i
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
if child < head:
if child.l_edge < head.l_edge:
head.l_edge = child.l_edge
head.l_kids += 1
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge
if child > head:
if child.r_edge > head.r_edge:
head.r_edge = child.r_edge
head.r_kids += 1

View File

@ -109,3 +109,30 @@ def test_set_ents(EN):
assert ent.label_ == 'PRODUCT'
assert ent.start == 2
assert ent.end == 4
def test_merge(EN):
doc = EN('WKRO played songs by the beach boys all night')
assert len(doc) == 9
# merge 'The Beach Boys'
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
assert doc[4].text_with_ws == 'the beach boys '
assert doc[4].tag_ == 'NAMED'
@pytest.mark.models
def test_merge_children(EN):
"""Test that attachments work correctly after merging."""
doc = EN('WKRO played songs by the beach boys all night')
# merge 'The Beach Boys'
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
for word in doc:
if word.i < word.head.i:
assert word in list(word.head.lefts)
elif word.i > word.head.i:
assert word in list(word.head.rights)