* Fix Issue #122: Incorrect calculation of children after Doc.merge()

2015-10-18 17:17:27 +11:00 · 2015-10-18 17:17:27 +11:00 · a7e6c5ac8f
parent 454c1996d0
commit a7e6c5ac8f
2 changed files with 44 additions and 8 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -447,9 +447,8 @@ cdef class Doc:
        cdef Span span = self[start:end]
        # Get LexemeC for newly merged token
-        new_orth = ''.join([t.string for t in span])
+        new_orth = ''.join([t.text_with_ws for t in span])
-        if span[-1].whitespace_:
+        new_orth = new_orth[:-len(span[-1].whitespace_)]
            new_orth = new_orth[:-1]
        cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
        # House the new merged token where it starts
        cdef TokenC* token = &self.data[start]
@ -508,16 +507,26 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set number of left/right children to 0. We'll increment it in the loops.
    for i in range(length):
        tokens[i].l_kids = 0
        tokens[i].r_kids = 0
        tokens[i].l_edge = i
        tokens[i].r_edge = i
    # Set left edges
    for i in range(length):
        child = &tokens[i]
        head = &tokens[i + child.head]
-        if child < head and child.l_edge < head.l_edge:
+        if child < head:
-            head.l_edge = child.l_edge
+            if child.l_edge < head.l_edge:
                head.l_edge = child.l_edge
            head.l_kids += 1
    # Set right edges --- same as above, but iterate in reverse
    for i in range(length-1, -1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
-        if child > head and child.r_edge > head.r_edge:
+        if child > head:
-            head.r_edge = child.r_edge
+            if child.r_edge > head.r_edge:
-
+                head.r_edge = child.r_edge
            head.r_kids += 1
--- a/tests/tokens/test_tokens_api.py
+++ b/tests/tokens/test_tokens_api.py
@ -109,3 +109,30 @@ def test_set_ents(EN):
    assert ent.label_ == 'PRODUCT'
    assert ent.start == 2
    assert ent.end == 4
 def test_merge(EN):
    doc = EN('WKRO played songs by the beach boys all night')
    assert len(doc) == 9
    # merge 'The Beach Boys'
    doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
    assert len(doc) == 7
    assert doc[4].text == 'the beach boys'
    assert doc[4].text_with_ws == 'the beach boys '
    assert doc[4].tag_ == 'NAMED'
@pytest.mark.models
 def test_merge_children(EN):
    """Test that attachments work correctly after merging."""
    doc = EN('WKRO played songs by the beach boys all night')
    # merge 'The Beach Boys'
    doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
    for word in doc:
        if word.i < word.head.i:
            assert word in list(word.head.lefts)
        elif word.i > word.head.i:
            assert word in list(word.head.rights)