* Add words to gold_tuples from gold conll file

2015-03-24 04:27:20 +01:00 · 2015-03-24 04:27:20 +01:00 · 3b70b304b2
parent 2e12dec76e
commit 3b70b304b2
4 changed files with 17 additions and 10 deletions
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -44,7 +44,7 @@ cdef class ArcEager(TransitionSystem):
    def get_labels(cls, gold_parses):
        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
                       LEFT: {}, BREAK: {'ROOT': True}}
-        for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses:
+        for raw_text, segmented, (ids, words, tags, heads, labels, iob) in gold_parses:
            for i, (head, label) in enumerate(zip(heads, labels)):
                if label != 'ROOT':
                    if head > i:
@ -69,6 +69,13 @@ cdef class ArcEager(TransitionSystem):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
    def move_name(self, int move, int label):
        """Return the printable name for a (move, label) pair.

        The label's string is looked up in ``self.strings``. When that
        string is empty, the bare ``MOVE_NAMES[move]`` entry is returned;
        otherwise the result is ``MOVE_NAMES[move]``, a hyphen, and the
        label string joined together.
        """
        suffix = self.strings[label]
        # Unlabelled moves carry an empty label string: no '-' suffix.
        if not suffix:
            return MOVE_NAMES[move]
        return MOVE_NAMES[move] + '-' + suffix
    cdef Transition init_transition(self, int clas, int move, int label) except *:
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
@ -129,8 +136,8 @@ cdef int _do_right(const Transition* self, State* state) except -1:
 cdef int _do_reduce(const Transition* self, State* state) except -1:
-    # TODO: Huh? Is this some weirdness from the non-monotonic?
+    if NON_MONOTONIC and not has_head(get_s0(state)):
-    add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
+        add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
    pop_stack(state)
--- a/spacy/syntax/conll.pxd
+++ b/spacy/syntax/conll.pxd
@ -13,6 +13,7 @@ cdef class GoldParse:
    cdef readonly list tags
    cdef readonly list heads
    cdef readonly list labels
    cdef readonly dict orths
    cdef readonly list ner
    cdef readonly list ents
--- a/spacy/syntax/conll.pyx
+++ b/spacy/syntax/conll.pyx
@ -30,7 +30,7 @@ def read_docparse_file(loc):
            iob_ents.append(iob_ent)
        tokenized = [s.replace('<SEP>', ' ').split(' ')
                     for s in tok_text.split('<SENT>')]
-        sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
+        sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
    return sents
 def _parse_line(line):
@ -63,12 +63,14 @@ cdef class GoldParse:
        self.heads = [-1] * len(tokens)
        self.labels = ['MISSING'] * len(tokens)
        self.ner = ['O'] * len(tokens)
        self.orths = {}
        idx_map = {token.idx: token.i for token in tokens}
        self.ents = []
        ent_start = None
        ent_label = None
-        for idx, tag, head, label, ner in zip(*annot_tuples):
+        for idx, orth, tag, head, label, ner in zip(*annot_tuples):
            self.orths[idx] = orth
            if idx < tokens[0].idx:
                pass
            elif idx > tokens[-1].idx:
@ -133,5 +135,3 @@ def _map_indices_to_tokens(ids, heads):
        else:
            mapped.append(ids.index(head))
    return mapped
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -74,7 +74,7 @@ cdef class BiluoPushDown(TransitionSystem):
        move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
                       OUT: {'': True}}
        moves = ('M', 'B', 'I', 'L', 'U')
-        for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
+        for (raw_text, toks, (ids, words, tags, heads, labels, biluo)) in gold_tuples:
            for i, ner_tag in enumerate(biluo):
                if ner_tag != 'O' and ner_tag != '-':
                    move_str, label = ner_tag.split('-')
@ -87,8 +87,7 @@ cdef class BiluoPushDown(TransitionSystem):
        elif move == 'MISSING':
            return 'M'
        else:
-            labels = {id_: name for name, id_ in self.label_ids.items()}
+            return MOVE_NAMES[move] + '-' + self.strings[label]
            return MOVE_NAMES[move] + '-' + labels[label]
    cdef int preprocess_gold(self, GoldParse gold) except -1:
        for i in range(gold.length):