From 6ab7e4059034b62cd1735f85baa39f5b0e18c89a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 18 Dec 2014 11:33:25 +1100 Subject: [PATCH] * Add non-monotonic parsing with cost-sensitive update. 92.26 on Y&M set --- spacy/syntax/_parse_features.pyx | 2 +- spacy/syntax/_state.pxd | 8 +- spacy/syntax/_state.pyx | 23 ++++-- spacy/syntax/arc_eager.pxd | 12 ++- spacy/syntax/arc_eager.pyx | 135 +++++++++++++++++-------------- spacy/syntax/parser.pyx | 25 +++--- 6 files changed, 118 insertions(+), 87 deletions(-) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index d87db9c9c..b9aee28c5 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -45,7 +45,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: # the source that are set to 1. context[4] = token.lex.cluster & 63 context[5] = token.lex.cluster & 15 - context[6] = token.dep_tag + context[6] = token.dep_tag if has_head(token) else 0 cdef int fill_context(atom_t* context, State* state) except -1: diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 51ba406bd..9e0426a29 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -70,10 +70,10 @@ cdef inline bint is_final(const State *s) nogil: return at_eol(s) # The stack will be attached to root anyway -cdef int children_in_buffer(const State *s, const int head, int* gold) except -1 -cdef int head_in_buffer(const State *s, const int child, int* gold) except -1 -cdef int children_in_stack(const State *s, const int head, int* gold) except -1 -cdef int head_in_stack(const State *s, const int child, int* gold) except -1 +cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1 +cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1 +cdef int children_in_stack(const State *s, const int head, const int* gold) except -1 +cdef int head_in_stack(const State *s, const int child, const int* gold) except -1 cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index b8abece8b..e00e5f6a2 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -6,6 +6,10 @@ from ..lexeme cimport EMPTY_LEXEME from ..tokens cimport TokenC +DEF PADDING = 5 +DEF NON_MONOTONIC = True + + cdef int add_dep(State *s, int head, int child, int label) except -1: cdef int dist = head - child s.sent[child].head = dist @@ -32,9 +36,14 @@ cdef int push_stack(State *s) except -1: s.stack[0] = s.i s.stack_len += 1 s.i += 1 + if at_eol(s): + while s.stack_len != 0: + if not has_head(get_s0(s)): + get_s0(s).dep_tag = 0 + pop_stack(s) -cdef int children_in_buffer(const State *s, int head, int* gold) except -1: +cdef int children_in_buffer(const State *s, int head, const int* gold) except -1: # Golds holds an array of head offsets --- the head of word i is i - golds[i] # Iterate over the tokens of the queue, and check whether their gold head is # our target @@ -46,20 +55,21 @@ cdef int children_in_buffer(const State *s, int head, int* gold) except -1: return n -cdef int head_in_buffer(const State *s, const int child, int* gold) except -1: +cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1: return gold[child] >= s.i -cdef int children_in_stack(const State *s, const int head, int* gold) except -1: +cdef int children_in_stack(const State *s, const int head, const int* gold) except -1: cdef int i cdef int n = 0 for i in range(s.stack_len): if gold[s.stack[-i]] == head: - n += 1 + if NON_MONOTONIC or not has_head(get_s0(s)): + n += 1 return n -cdef int head_in_stack(const State *s, const int child, int* gold) except -1: +cdef int head_in_stack(const State *s, const int child, const int* gold) except -1: cdef int i for i in range(s.stack_len): if gold[child] == s.stack[-i]: @@ -104,9 +114,6 @@ cdef int count_right_kids(const TokenC* head) nogil: -DEF PADDING = 5 - - cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL: cdef int padded_len = sent_length + PADDING + PADDING cdef State* s = mem.alloc(1, sizeof(State)) diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 6a316c90e..da8163e51 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -7,8 +7,11 @@ from ._state cimport State cdef struct Transition: + int clas int move int label + int cost + weight_t score cdef class TransitionSystem: @@ -18,7 +21,8 @@ cdef class TransitionSystem: cdef const Transition* _moves - cdef int best_valid(self, const weight_t* scores, const State* s) except -1 - cdef int best_gold(self, const weight_t* scores, const State* s, - int* gold_heads, int* gold_labels) except -1 - cdef int transition(self, State *s, const int clas) except -1 + cdef Transition best_valid(self, const weight_t* scores, const State* s) except * + cdef Transition best_gold(self, Transition* guess, const weight_t* scores, + const State* s, + const int* gold_heads, const int* gold_labels) except * + cdef int transition(self, State *s, const Transition* t) except -1 diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index cde21dc7f..33ec87919 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -7,6 +7,8 @@ from ._state cimport head_in_stack, children_in_stack from ..tokens cimport TokenC +DEF NON_MONOTONIC = True + cdef enum: SHIFT @@ -25,22 +27,30 @@ cdef inline bint _can_right(const State* s) nogil: cdef inline bint _can_left(const State* s) nogil: - return s.stack_len >= 1 and not has_head(get_s0(s)) + if NON_MONOTONIC: + return s.stack_len >= 1 + else: + return s.stack_len >= 1 and not has_head(get_s0(s)) cdef inline bint _can_reduce(const State* s) nogil: - return s.stack_len >= 2 and has_head(get_s0(s)) + if NON_MONOTONIC: + return s.stack_len >= 2 + else: + return s.stack_len >= 2 and has_head(get_s0(s)) -cdef int _shift_cost(const State* s, int* gold) except -1: +cdef int _shift_cost(const State* s, const int* gold) except -1: assert not at_eol(s) cost = 0 cost += head_in_stack(s, s.i, gold) cost += children_in_stack(s, s.i, gold) + if NON_MONOTONIC: + cost += gold[s.stack[0]] == s.i return cost -cdef int _right_cost(const State* s, int* gold) except -1: +cdef int _right_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 if gold[s.i] == s.stack[0]: @@ -48,10 +58,12 @@ cdef int _right_cost(const State* s, int* gold) except -1: cost += head_in_buffer(s, s.i, gold) cost += children_in_stack(s, s.i, gold) cost += head_in_stack(s, s.i, gold) + if NON_MONOTONIC: + cost += gold[s.stack[0]] == s.i return cost -cdef int _left_cost(const State* s, int* gold) except -1: +cdef int _left_cost(const State* s, const int* gold) except -1: assert s.stack_len >= 1 cost = 0 if gold[s.stack[0]] == s.i: @@ -59,11 +71,17 @@ cdef int _left_cost(const State* s, int* gold) except -1: cost += head_in_buffer(s, s.stack[0], gold) cost += children_in_buffer(s, s.stack[0], gold) + if NON_MONOTONIC and s.stack_len >= 2: + cost += gold[s.stack[0]] == s.stack[-1] return cost -cdef int _reduce_cost(const State* s, int* gold) except -1: - return children_in_buffer(s, s.stack[0], gold) +cdef int _reduce_cost(const State* s, const int* gold) except -1: + cdef int cost = 0 + cost += children_in_buffer(s, s.stack[0], gold) + if NON_MONOTONIC: + cost += head_in_buffer(s, s.stack[0], gold) + return cost cdef class TransitionSystem: @@ -80,9 +98,11 @@ cdef class TransitionSystem: cdef int i = 0 moves[i].move = SHIFT moves[i].label = 0 + moves[i].clas = i i += 1 moves[i].move = REDUCE moves[i].label = 0 + moves[i].clas = i i += 1 self.label_ids = {'ROOT': 0} cdef int label_id @@ -90,17 +110,21 @@ cdef class TransitionSystem: label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) moves[i].move = LEFT moves[i].label = label_id + moves[i].clas = i i += 1 for label_str in right_labels: label_id = self.label_ids.setdefault(label_str, len(self.label_ids)) moves[i].move = RIGHT moves[i].label = label_id + moves[i].clas = i i += 1 self._moves = moves - cdef int transition(self, State *s, const int clas) except -1: - cdef const Transition* t = &self._moves[clas] + cdef int transition(self, State *s, const Transition* t) except -1: if t.move == SHIFT: + # Set the dep label, in case we need it after we reduce + if NON_MONOTONIC: + get_s0(s).dep_tag = t.label push_stack(s) elif t.move == LEFT: add_dep(s, s.i, s.stack[0], t.label) @@ -109,11 +133,12 @@ cdef class TransitionSystem: add_dep(s, s.stack[0], s.i, t.label) push_stack(s) elif t.move == REDUCE: + add_dep(s, s.stack[-1], s.stack[0], get_s0(s).dep_tag) pop_stack(s) else: raise StandardError(t.move) - cdef int best_valid(self, const weight_t* scores, const State* s) except -1: + cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: cdef bint[N_MOVES] valid valid[SHIFT] = _can_shift(s) valid[LEFT] = _can_left(s) @@ -122,69 +147,61 @@ cdef class TransitionSystem: cdef int best = -1 cdef weight_t score = 0 + cdef weight_t best_r_score = -9000 + cdef int best_r_label = -1 cdef int i for i in range(self.n_moves): if valid[self._moves[i].move] and (best == -1 or scores[i] > score): best = i score = scores[i] + if self._moves[i].move == RIGHT and scores[i] > best_r_score: + best_r_label = self._moves[i].label assert best >= 0 - return best + cdef Transition t = self._moves[best] + t.score = score + if t.move == SHIFT: + t.label = best_r_label + return t - cdef int best_gold(self, const weight_t* scores, const State* s, - int* gold_heads, int* gold_labels) except -1: + cdef Transition best_gold(self, Transition* guess, const weight_t* scores, + const State* s, + const int* gold_heads, const int* gold_labels) except *: + # If we can create a gold dependency, only one action can be correct cdef int[N_MOVES] unl_costs unl_costs[SHIFT] = _shift_cost(s, gold_heads) if _can_shift(s) else -1 unl_costs[LEFT] = _left_cost(s, gold_heads) if _can_left(s) else -1 unl_costs[RIGHT] = _right_cost(s, gold_heads) if _can_right(s) else -1 unl_costs[REDUCE] = _reduce_cost(s, gold_heads) if _can_reduce(s) else -1 - #s0_buff_head = head_in_buffer(s, get_s0(s), gold_heads) - #s0_stack_head = head_in_stack(s, get_s0(s), gold_heads) - #s0_buff_kids = children_in_buffer(s, get_s0(s), gold_heads) - #s0_stack_kids = children_in_stack(s, get_s0(s), gold_heads) + guess.cost = unl_costs[guess.move] + cdef Transition t + cdef int target_label + cdef int i + if gold_heads[s.stack[0]] == s.i: + target_label = gold_labels[s.stack[0]] + if guess.move == LEFT: + guess.cost += guess.label != target_label + for i in range(self.n_moves): + t = self._moves[i] + if t.move == LEFT and t.label == target_label: + return t + elif gold_heads[s.i] == s.stack[0]: + target_label = gold_labels[s.i] + if guess.move == RIGHT: + guess.cost += guess.label != target_label + for i in range(self.n_moves): + t = self._moves[i] + if t.move == RIGHT and t.label == target_label: + return t - #n0_buff_head = head_in_buffer(s, get_n0(s), gold_heads) - #n0_stack_head = head_in_stack(s, get_n0(s), gold_heads) - #n0_buff_kids = children_in_buffer(s, get_n0(s), gold_heads) - #n0_stack_kids = children_in_buffer(s, get_n0(s), gold_heads) - - cdef int cost - cdef int move - cdef int label cdef int best = -1 cdef weight_t score = -9000 - cdef int i for i in range(self.n_moves): - move = self._moves[i].move - label = self._moves[i].label - if unl_costs[move] == 0: - if move == SHIFT or move == REDUCE: - cost = 0 - elif move == LEFT: - if gold_heads[s.stack[0]] == s.i and gold_labels[s.stack[0]] != -1: - cost = label != gold_labels[s.stack[0]] - else: - cost = 0 - elif move == RIGHT: - if gold_heads[s.i] == s.stack[0] and gold_labels[s.i] != -1: - cost = label != gold_labels[s.i] - else: - cost = 0 - else: - raise StandardError("Unknown Move") - if cost == 0 and (best == -1 or scores[i] > score): - best = i - score = scores[i] - - if best < 0: - print unl_costs[SHIFT], unl_costs[REDUCE], unl_costs[LEFT], unl_costs[RIGHT] - print s.stack_len - print has_head(get_s0(s)) - print s.sent[s.stack[0]].head - print s.stack[0], s.i - print gold_heads[s.stack[0]], gold_heads[s.i] - print gold_labels[s.i] - print children_in_buffer(s, s.stack[0], gold_heads) - print head_in_buffer(s, s.stack[0], gold_heads) - raise StandardError - return best + t = self._moves[i] + if unl_costs[t.move] == 0 and (best == -1 or scores[i] > score): + best = i + score = scores[i] + t = self._moves[best] + t.score = score + assert best >= 0 + return t diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 7c207bd0c..a0bb7485a 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -24,7 +24,7 @@ from thinc.learner cimport LinearModel from ..tokens cimport Tokens, TokenC -from .arc_eager cimport TransitionSystem +from .arc_eager cimport TransitionSystem, Transition from ._state cimport init_state, State, is_final, get_idx, get_s0, get_s1 @@ -70,7 +70,7 @@ cdef class GreedyParser: cdef: Feature* feats const weight_t* scores - int guess + Transition guess cdef atom_t[CONTEXT_SIZE] context cdef int n_feats @@ -81,13 +81,15 @@ cdef class GreedyParser: feats = self.extractor.get_feats(context, &n_feats) scores = self.model.get_scores(feats, n_feats) guess = self.moves.best_valid(scores, state) - self.moves.transition(state, guess) + self.moves.transition(state, &guess) return 0 def train_sent(self, Tokens tokens, list gold_heads, list gold_labels): cdef: Feature* feats weight_t* scores + Transition guess + Transition gold cdef int n_feats cdef atom_t[CONTEXT_SIZE] context @@ -105,17 +107,18 @@ cdef class GreedyParser: feats = self.extractor.get_feats(context, &n_feats) scores = self.model.get_scores(feats, n_feats) guess = self.moves.best_valid(scores, state) - best = self.moves.best_gold(scores, state, heads_array, labels_array) - counts = _get_counts(guess, best, feats, n_feats) + best = self.moves.best_gold(&guess, scores, state, heads_array, labels_array) + counts = _get_counts(guess.clas, best.clas, feats, n_feats, guess.cost) self.model.update(counts) - self.moves.transition(state, guess) + self.moves.transition(state, &guess) cdef int n_corr = 0 for i in range(tokens.length): n_corr += (i + state.sent[i].head) == gold_heads[i] return n_corr -cdef dict _get_counts(int guess, int best, const Feature* feats, const int n_feats): +cdef dict _get_counts(int guess, int best, const Feature* feats, const int n_feats, + int inc): if guess == best: return {} @@ -125,10 +128,10 @@ cdef dict _get_counts(int guess, int best, const Feature* feats, const int n_fea for i in range(n_feats): key = (feats[i].i, feats[i].key) if key in gold_counts: - gold_counts[key] += feats[i].value - guess_counts[key] -= feats[i].value + gold_counts[key] += (feats[i].value * inc) + guess_counts[key] -= (feats[i].value * inc) else: - gold_counts[key] = feats[i].value - guess_counts[key] = -feats[i].value + gold_counts[key] = (feats[i].value * inc) + guess_counts[key] = -(feats[i].value * inc) return {guess: guess_counts, best: gold_counts}