From 7c2d2deaa76f133954e809f10b25164dc9477577 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matt@spacy.io>
Date: Mon, 25 Apr 2016 16:39:28 +0000
Subject: [PATCH 1/2] * Revise transition system so that the Break transition
 retains sole responsibility for setting sentence boundaries. Re Issue #322

---
 spacy/syntax/arc_eager.pyx | 76 ++------------------------------------
 1 file changed, 4 insertions(+), 72 deletions(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 1b76f724c..7213fbbf7 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -22,7 +22,6 @@ from ._state cimport StateC, is_space_token
 
 DEF NON_MONOTONIC = True
 DEF USE_BREAK = True
-DEF USE_ROOT_ARC_SEGMENT = True
 
 cdef weight_t MIN_SCORE = -90000
 
@@ -91,8 +90,6 @@ cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int c
 cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
     if gold.labels[child] == -1:
         return True
-    elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child):
-        return True
     elif gold.heads[child] == head:
         return True
     else:
@@ -235,25 +232,12 @@ cdef class Break:
             return False
         elif st.stack_depth() < 1:
             return False
-        # It is okay to predict a sentence boundary if the top item on the stack
-        # and the first item on the buffer are adjacent tokens. If this is not the
-        # case, it is still okay if there are only space tokens in between.
-        # This is checked by testing whether the head of a space token immediately
-        # preceding the first item in the buffer is the top item on the stack.
-        # Intervening space tokens must be attached to the previous non-space token.
-        # Therefore, if the head of a space token that immediately precedes the first
-        # item on the buffer is the top item on the stack, a sentence boundary can be
-        # predicted.
-        elif (st.S(0) + 1) != st.B(0) \
-        and not (is_space_token(st.safe_get(st.B(0)-1)) and st.H(st.B(0)-1) == st.S(0)):
-            # Must break at the token boundary
-            return False
         else:
             return True
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
-        st.set_break(st.B(0))
+        st.set_break(st.B_(0).l_edge)
         st.fast_forward()
 
     @staticmethod
@@ -295,8 +279,8 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
 cdef class ArcEager(TransitionSystem):
     @classmethod
     def get_labels(cls, gold_parses):
-        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True},
-                       LEFT: {'ROOT': True}, BREAK: {'ROOT': True}}
+        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
+                       LEFT: {}, BREAK: {'ROOT': True}}
         for raw_text, sents in gold_parses:
             for (ids, words, tags, heads, labels, iob), ctnts in sents:
                 for child, head, label in zip(ids, heads, labels):
@@ -394,59 +378,7 @@ cdef class ArcEager(TransitionSystem):
         st.fast_forward()
 
     cdef int finalize_state(self, StateC* st) nogil:
-        cdef int i
-        cdef int orig_head_id
-        cdef TokenC* orig_head
-        cdef int new_edge
-        cdef int child_i
-        cdef TokenC* head_i
-        for i in range(st.length):
-            if st._sent[i].head == 0 and st._sent[i].dep == 0:
-                st._sent[i].dep = self.root_label
-            # If we're not using the Break transition, we segment via root-labelled
-            # arcs between the root words.
-            elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
-                orig_head_id = st._sent[i].head
-                orig_head = &st._sent[orig_head_id]
-                if i < orig_head_id: # i is left dependent
-                    orig_head.l_kids -= 1
-                    if i == orig_head.l_edge: # i is left-most child
-                        # find the second left-most child and make it the new l_edge
-                        new_edge = orig_head_id
-                        child_i = i
-                        while child_i < orig_head_id:
-                            if st._sent[child_i].head == orig_head_id:
-                                new_edge = child_i
-                            child_i += 1
-                        # then walk up the path to root and update the l_edges of all ancestors
-                        # the logic here works because the tree is guaranteed to be projective
-                        head_i = &st._sent[orig_head.head]
-                        while head_i.l_edge == orig_head.l_edge:
-                            head_i.l_edge = new_edge
-                            head_i = &st._sent[head_i.head]
-                        orig_head.l_edge = new_edge
-
-                elif i > orig_head_id: # i is right dependent
-                    orig_head.r_kids -= 1
-                    if i == orig_head.r_edge:
-                        # find the second right-most child and make it the new r_edge
-                        new_edge = orig_head_id
-                        child_i = i
-                        while child_i > orig_head_id:
-                            if st._sent[child_i].head == orig_head_id:
-                                new_edge = child_i
-                            child_i -= 1
-                        # then walk up the path to root and update the l_edges of all ancestors
-                        # the logic here works because the tree is guaranteed to be projective
-                        head_i = &st._sent[orig_head.head]
-                        while head_i.r_edge == orig_head.r_edge:
-                            head_i.r_edge = new_edge
-                            head_i = &st._sent[head_i.head]
-                        orig_head.r_edge = new_edge
-
-                # note that this can create non-projective trees if there are arcs
-                # between nodes on both sides of the new root node
-                st._sent[i].head = 0
+        pass
 
     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid

From 77609588b6d39bf4e57ba5ea1fdf94c7490f0fe2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matt@spacy.io>
Date: Mon, 25 Apr 2016 19:39:19 +0000
Subject: [PATCH 2/2] * Fix assignment of root label to words left as root
 implicitly, after parsing ends.

---
 spacy/syntax/arc_eager.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 7213fbbf7..3f2bc7c65 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -378,7 +378,10 @@ cdef class ArcEager(TransitionSystem):
         st.fast_forward()
 
     cdef int finalize_state(self, StateC* st) nogil:
-        pass
+        cdef int i
+        for i in range(st.length):
+            if st._sent[i].head == 0 and st._sent[i].dep == 0:
+                st._sent[i].dep = self.root_label
 
     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid