From 6ab7e4059034b62cd1735f85baa39f5b0e18c89a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Thu, 18 Dec 2014 11:33:25 +1100
Subject: [PATCH] * Add non-monotonic parsing with cost-sensitive update. 92.26
 on Y&M set

---
 spacy/syntax/_parse_features.pyx |   2 +-
 spacy/syntax/_state.pxd          |   8 +-
 spacy/syntax/_state.pyx          |  23 ++++--
 spacy/syntax/arc_eager.pxd       |  12 ++-
 spacy/syntax/arc_eager.pyx       | 135 +++++++++++++++++--------------
 spacy/syntax/parser.pyx          |  25 +++---
 6 files changed, 118 insertions(+), 87 deletions(-)

diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx
index d87db9c9c..b9aee28c5 100644
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@@ -45,7 +45,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         # the source that are set to 1.
         context[4] = token.lex.cluster & 63
         context[5] = token.lex.cluster & 15
-        context[6] = token.dep_tag
+        context[6] = token.dep_tag if has_head(token) else 0
 
 
 cdef int fill_context(atom_t* context, State* state) except -1:
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 51ba406bd..9e0426a29 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -70,10 +70,10 @@ cdef inline bint is_final(const State *s) nogil:
     return at_eol(s) # The stack will be attached to root anyway
 
 
-cdef int children_in_buffer(const State *s, const int head, int* gold) except -1
-cdef int head_in_buffer(const State *s, const int child, int* gold) except -1
-cdef int children_in_stack(const State *s, const int head, int* gold) except -1
-cdef int head_in_stack(const State *s, const int child, int* gold) except -1
+cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
+cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
+cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
+cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
 
 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
 
diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx
index b8abece8b..e00e5f6a2 100644
--- a/spacy/syntax/_state.pyx
+++ b/spacy/syntax/_state.pyx
@@ -6,6 +6,10 @@ from ..lexeme cimport EMPTY_LEXEME
 from ..tokens cimport TokenC
 
 
+DEF PADDING = 5  # compile-time constant; init_state adds it twice to sent_length to get padded_len
+DEF NON_MONOTONIC = True  # compile-time switch; arc_eager.pyx carries its own DEF of the same name -- keep both in sync
+
+
 cdef int add_dep(State *s, int head, int child, int label) except -1:
     cdef int dist = head - child
     s.sent[child].head = dist
@@ -32,9 +36,14 @@ cdef int push_stack(State *s) except -1:
     s.stack[0] = s.i
     s.stack_len += 1
     s.i += 1
+    if at_eol(s):
+        while s.stack_len != 0:
+            if not has_head(get_s0(s)):
+                get_s0(s).dep_tag = 0
+            pop_stack(s)
 
 
-cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
+cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
     # Golds holds an array of head offsets --- the head of word i is i - golds[i]
     # Iterate over the tokens of the queue, and check whether their gold head is
     # our target
@@ -46,20 +55,21 @@ cdef int children_in_buffer(const State *s, int head, int* gold) except -1:
     return n
 
 
-cdef int head_in_buffer(const State *s, const int child, int* gold) except -1:
+cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
     return gold[child] >= s.i
 
 
-cdef int children_in_stack(const State *s, const int head, int* gold) except -1:
+cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
     cdef int i
     cdef int n = 0
     for i in range(s.stack_len):
         if gold[s.stack[-i]] == head:
-            n += 1
+            if NON_MONOTONIC or not has_head(get_s0(s)):  # NOTE(review): tests s0 on every iteration, not s.stack[-i] -- confirm intended
+                n += 1
     return n
 
 
-cdef int head_in_stack(const State *s, const int child, int* gold) except -1:
+cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
     cdef int i
     for i in range(s.stack_len):
         if gold[child] == s.stack[-i]:
@@ -104,9 +114,6 @@ cdef int count_right_kids(const TokenC* head) nogil:
 
 
 
-DEF PADDING = 5
-
-
 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL:
     cdef int padded_len = sent_length + PADDING + PADDING
     cdef State* s = <State*>mem.alloc(1, sizeof(State))
diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd
index 6a316c90e..da8163e51 100644
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@@ -7,8 +7,11 @@ from ._state cimport State
 
 
 cdef struct Transition:
+    int clas  # index of this move in TransitionSystem._moves (assigned when the moves table is built)
     int move
     int label
+    int cost  # filled in by best_gold for the guessed move: its unlabelled cost plus any label mismatch
+    weight_t score  # score of the selected move, set by best_valid and by best_gold's fallback path
 
 
 cdef class TransitionSystem:
@@ -18,7 +21,8 @@ cdef class TransitionSystem:
 
     cdef const Transition* _moves
 
-    cdef int best_valid(self, const weight_t* scores, const State* s) except -1
-    cdef int best_gold(self, const weight_t* scores, const State* s,
-                       int* gold_heads, int* gold_labels) except -1
-    cdef int transition(self, State *s, const int clas) except -1
+    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
+    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
+                              const State* s,
+                              const int* gold_heads, const int* gold_labels) except *
+    cdef int transition(self, State *s, const Transition* t) except -1
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index cde21dc7f..33ec87919 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -7,6 +7,8 @@ from ._state cimport head_in_stack, children_in_stack
 
 from ..tokens cimport TokenC
 
+DEF NON_MONOTONIC = True  # compile-time switch; _state.pyx carries its own DEF of the same name -- keep both in sync
+
 
 cdef enum:
     SHIFT
@@ -25,22 +27,30 @@ cdef inline bint _can_right(const State* s) nogil:
 
 
 cdef inline bint _can_left(const State* s) nogil:
-    return s.stack_len >= 1 and not has_head(get_s0(s))
+    if NON_MONOTONIC:  # non-monotonic mode: LEFT is allowed even when s0 already has a head
+        return s.stack_len >= 1
+    else:
+        return s.stack_len >= 1 and not has_head(get_s0(s))
 
 
 cdef inline bint _can_reduce(const State* s) nogil:
-    return s.stack_len >= 2 and has_head(get_s0(s))
+    if NON_MONOTONIC:  # non-monotonic mode: REDUCE is allowed even when s0 has no head yet
+        return s.stack_len >= 2
+    else:
+        return s.stack_len >= 2 and has_head(get_s0(s))
 
 
-cdef int _shift_cost(const State* s, int* gold) except -1:
+cdef int _shift_cost(const State* s, const int* gold) except -1:
     assert not at_eol(s)
     cost = 0
     cost += head_in_stack(s, s.i, gold)
     cost += children_in_stack(s, s.i, gold)
+    if NON_MONOTONIC:
+        cost += gold[s.stack[0]] == s.i
     return cost
 
 
-cdef int _right_cost(const State* s, int* gold) except -1:
+cdef int _right_cost(const State* s, const int* gold) except -1:
     assert s.stack_len >= 1
     cost = 0
     if gold[s.i] == s.stack[0]:
@@ -48,10 +58,12 @@ cdef int _right_cost(const State* s, int* gold) except -1:
     cost += head_in_buffer(s, s.i, gold)
     cost += children_in_stack(s, s.i, gold)
     cost += head_in_stack(s, s.i, gold)
+    if NON_MONOTONIC:
+        cost += gold[s.stack[0]] == s.i
     return cost
 
 
-cdef int _left_cost(const State* s, int* gold) except -1:
+cdef int _left_cost(const State* s, const int* gold) except -1:
     assert s.stack_len >= 1
     cost = 0
     if gold[s.stack[0]] == s.i:
@@ -59,11 +71,17 @@ cdef int _left_cost(const State* s, int* gold) except -1:
 
     cost += head_in_buffer(s, s.stack[0], gold)
     cost += children_in_buffer(s, s.stack[0], gold)
+    if NON_MONOTONIC and s.stack_len >= 2:
+        cost += gold[s.stack[0]] == s.stack[-1]
     return cost
 
 
-cdef int _reduce_cost(const State* s, int* gold) except -1:
-    return children_in_buffer(s, s.stack[0], gold)
+cdef int _reduce_cost(const State* s, const int* gold) except -1:
+    cdef int cost = 0
+    cost += children_in_buffer(s, s.stack[0], gold)
+    if NON_MONOTONIC:
+        cost += head_in_buffer(s, s.stack[0], gold)
+    return cost
 
 
 cdef class TransitionSystem:
@@ -80,9 +98,11 @@ cdef class TransitionSystem:
         cdef int i = 0
         moves[i].move = SHIFT
         moves[i].label = 0
+        moves[i].clas = i
         i += 1
         moves[i].move = REDUCE
         moves[i].label = 0
+        moves[i].clas = i
         i += 1
         self.label_ids = {'ROOT': 0}
         cdef int label_id
@@ -90,17 +110,21 @@ cdef class TransitionSystem:
             label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
             moves[i].move = LEFT
             moves[i].label = label_id
+            moves[i].clas = i
             i += 1
         for label_str in right_labels:
             label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
             moves[i].move = RIGHT
             moves[i].label = label_id
+            moves[i].clas = i
             i += 1
         self._moves = moves
 
-    cdef int transition(self, State *s, const int clas) except -1:
-        cdef const Transition* t = &self._moves[clas]
+    cdef int transition(self, State *s, const Transition* t) except -1:
         if t.move == SHIFT:
+            # Set the dep label, in case we need it after we reduce
+            if NON_MONOTONIC:
+                get_s0(s).dep_tag = t.label  # NOTE(review): runs before push_stack, so this writes to the current s0 rather than the token being shifted -- confirm intended
             push_stack(s)
         elif t.move == LEFT:
             add_dep(s, s.i, s.stack[0], t.label)
@@ -109,11 +133,12 @@ cdef class TransitionSystem:
             add_dep(s, s.stack[0], s.i, t.label)
             push_stack(s)
         elif t.move == REDUCE:
+            add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag)
             pop_stack(s)
         else:
             raise StandardError(t.move)
 
-    cdef int best_valid(self, const weight_t* scores, const State* s) except -1:
+    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
         cdef bint[N_MOVES] valid
         valid[SHIFT] = _can_shift(s)
         valid[LEFT] = _can_left(s)
@@ -122,69 +147,61 @@ cdef class TransitionSystem:
 
         cdef int best = -1
         cdef weight_t score = 0
+        cdef weight_t best_r_score = -9000
+        cdef int best_r_label = -1
         cdef int i
         for i in range(self.n_moves):
             if valid[self._moves[i].move] and (best == -1 or scores[i] > score):
                 best = i
                 score = scores[i]
+            if self._moves[i].move == RIGHT and scores[i] > best_r_score:  # running arg-max over RIGHT moves
+                best_r_score, best_r_label = scores[i], self._moves[i].label
         assert best >= 0
-        return best
+        cdef Transition t = self._moves[best]
+        t.score = score
+        if t.move == SHIFT:
+            t.label = best_r_label
+        return t
 
-    cdef int best_gold(self, const weight_t* scores, const State* s,
-                       int* gold_heads, int* gold_labels) except -1:
+    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
+                              const State* s,
+                              const int* gold_heads, const int* gold_labels) except *:
+        # If we can create a gold dependency, only one action can be correct
         cdef int[N_MOVES] unl_costs
         unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1
         unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1
         unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1
         unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1
 
-        #s0_buff_head = head_in_buffer(s, get_s0(s), gold_heads)
-        #s0_stack_head = head_in_stack(s, get_s0(s), gold_heads)
-        #s0_buff_kids = children_in_buffer(s, get_s0(s), gold_heads)
-        #s0_stack_kids = children_in_stack(s, get_s0(s), gold_heads)
+        guess.cost = unl_costs[guess.move]
+        cdef Transition t
+        cdef int target_label
+        cdef int i
+        if gold_heads[s.stack[0]] == s.i:
+            target_label = gold_labels[s.stack[0]]
+            if guess.move == LEFT:
+                guess.cost += guess.label != target_label
+            for i in range(self.n_moves):
+                t = self._moves[i]
+                if t.move == LEFT and t.label == target_label:
+                    return t
+        elif gold_heads[s.i] == s.stack[0]:
+            target_label = gold_labels[s.i]
+            if guess.move == RIGHT:
+                guess.cost += guess.label != target_label
+            for i in range(self.n_moves):
+                t = self._moves[i]
+                if t.move == RIGHT and t.label == target_label:
+                    return t
 
-        #n0_buff_head = head_in_buffer(s, get_n0(s), gold_heads)
-        #n0_stack_head = head_in_stack(s, get_n0(s), gold_heads)
-        #n0_buff_kids = children_in_buffer(s, get_n0(s), gold_heads)
-        #n0_stack_kids = children_in_buffer(s, get_n0(s), gold_heads)
-
-        cdef int cost
-        cdef int move
-        cdef int label
         cdef int best = -1
         cdef weight_t score = -9000
-        cdef int i
         for i in range(self.n_moves):
-            move = self._moves[i].move
-            label = self._moves[i].label
-            if unl_costs[move] == 0: 
-                if move == SHIFT or move == REDUCE:
-                    cost = 0
-                elif move == LEFT:
-                    if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1:
-                        cost = label != gold_labels[s.stack[0]]
-                    else:
-                        cost = 0
-                elif move == RIGHT:
-                    if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1:
-                        cost = label != gold_labels[s.i]
-                    else:
-                        cost = 0
-                else:
-                    raise StandardError("Unknown Move")
-                if cost == 0 and (best == -1 or scores[i] > score):
-                    best = i
-                    score = scores[i]
- 
-        if best < 0:
-            print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT]
-            print s.stack_len
-            print has_head(get_s0(s))
-            print s.sent[s.stack[0]].head
-            print s.stack[0], s.i
-            print gold_heads[s.stack[0]], gold_heads[s.i]
-            print gold_labels[s.i]
-            print children_in_buffer(s, s.stack[0], gold_heads)
-            print head_in_buffer(s, s.stack[0], gold_heads)
-            raise StandardError 
-        return best
+            t = self._moves[i]
+            if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score):
+                best = i
+                score = scores[i]
+        assert best >= 0  # check before indexing: self._moves[-1] would read outside the moves array
+        t = self._moves[best]
+        t.score = score
+        return t
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 7c207bd0c..a0bb7485a 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -24,7 +24,7 @@ from thinc.learner cimport LinearModel
 
 from ..tokens cimport Tokens, TokenC
 
-from .arc_eager cimport TransitionSystem
+from .arc_eager cimport TransitionSystem, Transition
 
 from ._state cimport init_state, State, is_final, get_idx, get_s0, get_s1
 
@@ -70,7 +70,7 @@ cdef class GreedyParser:
         cdef:
             Feature* feats
             const weight_t* scores
-            int guess
+            Transition guess
 
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
@@ -81,13 +81,15 @@ cdef class GreedyParser:
             feats = self.extractor.get_feats(context, &n_feats)
             scores = self.model.get_scores(feats, n_feats)
             guess = self.moves.best_valid(scores, state)
-            self.moves.transition(state, guess)
+            self.moves.transition(state, &guess)
         return 0
 
     def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
         cdef:
             Feature* feats
             weight_t* scores
+            Transition guess
+            Transition gold
 
         cdef int n_feats
         cdef atom_t[CONTEXT_SIZE] context
@@ -105,17 +107,18 @@ cdef class GreedyParser:
             feats = self.extractor.get_feats(context, &n_feats)
             scores = self.model.get_scores(feats, n_feats)
             guess = self.moves.best_valid(scores, state)
-            best = self.moves.best_gold(scores, state, heads_array, labels_array)
-            counts = _get_counts(guess, best, feats, n_feats)
+            best = self.moves.best_gold(&guess, scores, state, heads_array, labels_array)
+            counts = _get_counts(guess.clas, best.clas, feats, n_feats, guess.cost)
             self.model.update(counts)
-            self.moves.transition(state, guess)
+            self.moves.transition(state, &guess)
         cdef int n_corr = 0
         for i in range(tokens.length):
             n_corr += (i + state.sent[i].head) == gold_heads[i]
         return n_corr
 
 
-cdef dict _get_counts(int guess, int best, const Feature* feats, const int n_feats):
+cdef dict _get_counts(int guess, int best, const Feature* feats, const int n_feats,
+                      int inc):
     if guess == best:
         return {}
 
@@ -125,10 +128,10 @@ cdef dict _get_counts(int guess, int best, const Feature* feats, const int n_fea
     for i in range(n_feats):
         key = (feats[i].i, feats[i].key)
         if key in gold_counts:
-            gold_counts[key] += feats[i].value
-            guess_counts[key] -= feats[i].value
+            gold_counts[key] += (feats[i].value * inc)
+            guess_counts[key] -= (feats[i].value * inc)
         else:
-            gold_counts[key] = feats[i].value
-            guess_counts[key] = -feats[i].value
+            gold_counts[key] = (feats[i].value * inc)
+            guess_counts[key] = -(feats[i].value * inc)
     return {guess: guess_counts, best: gold_counts}