* Merge updated/simplified Break approach

2016-04-25 19:44:42 +00:00 · 2016-04-25 19:44:42 +00:00 · 97b2bba249
parent feb65fcaa1 77609588b6
commit 97b2bba249
1 changed files with 3 additions and 76 deletions
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -22,7 +22,6 @@ from ._state cimport StateC, is_space_token

 DEF NON_MONOTONIC = True
 DEF USE_BREAK = True
-DEF USE_ROOT_ARC_SEGMENT = True

 cdef weight_t MIN_SCORE = -90000

@ -91,8 +90,6 @@ cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int c
 cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
    if gold.labels[child] == -1:
        return True
-    elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child):
-        return True
    elif gold.heads[child] == head:
        return True
    else:
@ -235,25 +232,12 @@ cdef class Break:
            return False
        elif st.stack_depth() < 1:
            return False
-        # It is okay to predict a sentence boundary if the top item on the stack
-        # and the first item on the buffer are adjacent tokens. If this is not the
-        # case, it is still okay if there are only space tokens in between.
-        # This is checked by testing whether the head of a space token immediately
-        # preceding the first item in the buffer is the top item on the stack.
-        # Intervening space tokens must be attached to the previous non-space token.
-        # Therefore, if the head of a space token that immediately precedes the first
-        # item on the buffer is the top item on the stack, a sentence boundary can be
-        # predicted.
-        elif (st.S(0) + 1) != st.B(0) \
-        and not (is_space_token(st.safe_get(st.B(0)-1)) and st.H(st.B(0)-1) == st.S(0)):
-            # Must break at the token boundary
-            return False
        else:
            return True

    @staticmethod
    cdef int transition(StateC* st, int label) nogil:
-        st.set_break(st.B(0))
+        st.set_break(st.B_(0).l_edge)
        st.fast_forward()

    @staticmethod
@ -295,8 +279,8 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
 cdef class ArcEager(TransitionSystem):
    @classmethod
    def get_labels(cls, gold_parses):
-        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True},
-                       LEFT: {'ROOT': True}, BREAK: {'ROOT': True}}
+        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
+                       LEFT: {}, BREAK: {'ROOT': True}}
        for raw_text, sents in gold_parses:
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
                for child, head, label in zip(ids, heads, labels):
@ -395,66 +379,9 @@ cdef class ArcEager(TransitionSystem):

    cdef int finalize_state(self, StateC* st) nogil:
        cdef int i
-        cdef int orig_head_id
-        cdef TokenC* orig_head
-        cdef int new_edge
-        cdef int child_i
-        cdef int head_i
        for i in range(st.length):
            if st._sent[i].head == 0 and st._sent[i].dep == 0:
                st._sent[i].dep = self.root_label
-            # If we're not using the Break transition, we segment via root-labelled
-            # arcs between the root words.
-            elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
-                orig_head_id = i + st._sent[i].head
-                orig_head = &st._sent[orig_head_id]
-                if i < orig_head_id: # i is left dependent
-                    orig_head.l_kids -= 1
-                    if i == orig_head.l_edge: # i is left-most child
-                        # find the second left-most child and make it the new l_edge
-                        new_edge = orig_head_id
-                        child_i = i+1
-                        while child_i < orig_head_id:
-                            if child_i + st._sent[child_i].head == orig_head_id:
-                                new_edge = child_i
-                                break
-                            child_i += 1
-                        # then walk up the path to root and update the l_edges of all ancestors
-                        # the logic here works because the tree is guaranteed to be projective
-                        head_i = orig_head_id + orig_head.head
-                        while st._sent[head_i].l_edge == orig_head.l_edge:
-                            st._sent[head_i].l_edge = new_edge
-                            if st._sent[head_i].head == 0:
-                                break
-                            head_i += st._sent[head_i].head
-                        orig_head.l_edge = new_edge
-
-                elif i > orig_head_id: # i is right dependent
-                    orig_head.r_kids -= 1
-                    if i == orig_head.r_edge:
-                        # find the second right-most child and make it the new r_edge
-                        new_edge = orig_head_id
-                        child_i = i-1
-                        while child_i > orig_head_id:
-                            if child_i + st._sent[child_i].head == orig_head_id:
-                                new_edge = child_i
-                                break
-                            child_i -= 1
-                        # then walk up the path to root and update the r_edges of all ancestors
-                        # the logic here works because the tree is guaranteed to be projective
-                        head_i = orig_head_id + orig_head.head
-                        while st._sent[head_i].r_edge == orig_head.r_edge:
-                            st._sent[head_i].r_edge = new_edge
-                            if st._sent[head_i].head == 0:
-                                break
-                            head_i += st._sent[head_i].head
-                        orig_head.r_edge = new_edge
-
-                # note that this may create non-projective trees if there are arcs
-                # between nodes on both sides of the new root node
-                st._sent[i].head = 0
-                st._sent[st._sent[i].l_edge].sent_start = True
-

    cdef int set_valid(self, int* output, const StateC* st) nogil:
        cdef bint[N_MOVES] is_valid