From 1759abf1e5703cb568e0f2de56ed1beb0388a6f6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 19 Sep 2018 14:50:06 +0200 Subject: [PATCH] Fix bug in sentence starts for non-projective parses The set_children_from_heads function assumed parse trees were projective. However, non-projective parses may be passed in during deserialization, or after deprojectivising. This caused incorrect sentence boundaries to be set for non-projective parses. Close #2772. --- spacy/tests/regression/test_issue2772.py | 1 - spacy/tokens/doc.pyx | 41 +++++++++++++----------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/spacy/tests/regression/test_issue2772.py b/spacy/tests/regression/test_issue2772.py index 5420b20d8..c9e0cf0f2 100644 --- a/spacy/tests/regression/test_issue2772.py +++ b/spacy/tests/regression/test_issue2772.py @@ -2,7 +2,6 @@ import pytest from ..util import get_doc -@pytest.mark.xfail def test_issue2772(en_vocab): words = 'When we write or communicate virtually , we can hide our true feelings .'.split() # A tree with a non-projective (i.e. crossing) arc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d5c392750..bfebe9304 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -993,25 +993,28 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: tokens[i].r_kids = 0 tokens[i].l_edge = i tokens[i].r_edge = i - # Set left edges - for i in range(length): - child = &tokens[i] - head = &tokens[i + child.head] - if child < head: - head.l_kids += 1 - if child.l_edge < head.l_edge: - head.l_edge = child.l_edge - - # Set right edges --- same as above, but iterate in reverse - for i in range(length-1, -1, -1): - child = &tokens[i] - head = &tokens[i + child.head] - if child > head: - head.r_kids += 1 - if child.r_edge > head.r_edge: - head.r_edge = child.r_edge - - + # Twice, for non-projectivity + for _ in range(2): + # Set left edges + for i in range(length): + child = &tokens[i] + head = &tokens[i + child.head] + if child < head: + head.l_kids += 1 + if child.l_edge < head.l_edge: + head.l_edge = child.l_edge + if child.r_edge > head.r_edge: + head.r_edge = child.r_edge + # Set right edges --- same as above, but iterate in reverse + for i in range(length-1, -1, -1): + child = &tokens[i] + head = &tokens[i + child.head] + if child > head: + head.r_kids += 1 + if child.r_edge > head.r_edge: + head.r_edge = child.r_edge + if child.l_edge < head.l_edge: + head.l_edge = child.l_edge # Set sentence starts for i in range(length): if tokens[i].head == 0 and tokens[i].dep != 0: