From 3b70b304b2f4f6925808dd3c8c0b2057732ba57d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 24 Mar 2015 04:27:20 +0100 Subject: [PATCH] * Add words to gold_tuples from gold conll file --- spacy/syntax/arc_eager.pyx | 13 ++++++++++--- spacy/syntax/conll.pxd | 1 + spacy/syntax/conll.pyx | 8 ++++---- spacy/syntax/ner.pyx | 5 ++--- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 461d4f5f7..18c7ea8a9 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -44,7 +44,7 @@ cdef class ArcEager(TransitionSystem): def get_labels(cls, gold_parses): move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {}, LEFT: {}, BREAK: {'ROOT': True}} - for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses: + for raw_text, segmented, (ids, words, tags, heads, labels, iob) in gold_parses: for i, (head, label) in enumerate(zip(heads, labels)): if label != 'ROOT': if head > i: @@ -69,6 +69,13 @@ cdef class ArcEager(TransitionSystem): if self.c[i].move == move and self.c[i].label == label: return self.c[i] + def move_name(self, int move, int label): + label_str = self.strings[label] + if label_str: + return MOVE_NAMES[move] + '-' + label_str + else: + return MOVE_NAMES[move] + cdef Transition init_transition(self, int clas, int move, int label) except *: # TODO: Apparent Cython bug here when we try to use the Transition() # constructor with the function pointers @@ -129,8 +136,8 @@ cdef int _do_right(const Transition* self, State* state) except -1: cdef int _do_reduce(const Transition* self, State* state) except -1: - # TODO: Huh? Is this some weirdness from the non-monotonic? - add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep) + if NON_MONOTONIC and not has_head(get_s0(state)): + add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep) pop_stack(state) diff --git a/spacy/syntax/conll.pxd b/spacy/syntax/conll.pxd index 60583969b..815920ea6 100644 --- a/spacy/syntax/conll.pxd +++ b/spacy/syntax/conll.pxd @@ -13,6 +13,7 @@ cdef class GoldParse: cdef readonly list tags cdef readonly list heads cdef readonly list labels + cdef readonly dict orths cdef readonly list ner cdef readonly list ents diff --git a/spacy/syntax/conll.pyx b/spacy/syntax/conll.pyx index 3170ac09c..9a980dc5b 100644 --- a/spacy/syntax/conll.pyx +++ b/spacy/syntax/conll.pyx @@ -30,7 +30,7 @@ def read_docparse_file(loc): iob_ents.append(iob_ent) tokenized = [s.replace('', ' ').split(' ') for s in tok_text.split('')] - sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) + sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents))) return sents def _parse_line(line): @@ -63,12 +63,14 @@ cdef class GoldParse: self.heads = [-1] * len(tokens) self.labels = ['MISSING'] * len(tokens) self.ner = ['O'] * len(tokens) + self.orths = {} idx_map = {token.idx: token.i for token in tokens} self.ents = [] ent_start = None ent_label = None - for idx, tag, head, label, ner in zip(*annot_tuples): + for idx, orth, tag, head, label, ner in zip(*annot_tuples): + self.orths[idx] = orth if idx < tokens[0].idx: pass elif idx > tokens[-1].idx: @@ -133,5 +135,3 @@ def _map_indices_to_tokens(ids, heads): else: mapped.append(ids.index(head)) return mapped - - diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 1ce9b29e8..5c49fa83d 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -74,7 +74,7 @@ cdef class BiluoPushDown(TransitionSystem): move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, OUT: {'': True}} moves = ('M', 'B', 'I', 'L', 'U') - for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples: + for (raw_text, toks, (ids, words, tags, heads, labels, biluo)) in gold_tuples: for i, ner_tag in enumerate(biluo): if ner_tag != 'O' and ner_tag != '-': move_str, label = ner_tag.split('-') @@ -87,8 +87,7 @@ cdef class BiluoPushDown(TransitionSystem): elif move == 'MISSING': return 'M' else: - labels = {id_: name for name, id_ in self.label_ids.items()} - return MOVE_NAMES[move] + '-' + labels[label] + return MOVE_NAMES[move] + '-' + self.strings[label] cdef int preprocess_gold(self, GoldParse gold) except -1: for i in range(gold.length):