* Add words to gold_tuples from gold conll file

This commit is contained in:
Matthew Honnibal 2015-03-24 04:27:20 +01:00
parent 2e12dec76e
commit 3b70b304b2
4 changed files with 17 additions and 10 deletions

View File

@ -44,7 +44,7 @@ cdef class ArcEager(TransitionSystem):
def get_labels(cls, gold_parses):
move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
LEFT: {}, BREAK: {'ROOT': True}}
for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses:
for raw_text, segmented, (ids, words, tags, heads, labels, iob) in gold_parses:
for i, (head, label) in enumerate(zip(heads, labels)):
if label != 'ROOT':
if head > i:
@ -69,6 +69,13 @@ cdef class ArcEager(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
def move_name(self, int move, int label):
    """Return a readable name for a (move, label) pair.

    The label id is resolved through ``self.strings`` and the move id
    through ``MOVE_NAMES``. When the label string is non-empty the result
    is ``"<move name>-<label string>"``; otherwise it is the bare move name.
    """
    # Resolve the label first, then the move, mirroring the lookup order
    # of the straightforward if/else formulation.
    suffix = self.strings[label]
    prefix = MOVE_NAMES[move]
    if not suffix:
        return prefix
    return prefix + '-' + suffix
cdef Transition init_transition(self, int clas, int move, int label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
@ -129,8 +136,8 @@ cdef int _do_right(const Transition* self, State* state) except -1:
cdef int _do_reduce(const Transition* self, State* state) except -1:
# TODO: Huh? Is this some weirdness from the non-monotonic?
add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
if NON_MONOTONIC and not has_head(get_s0(state)):
add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
pop_stack(state)

View File

@ -13,6 +13,7 @@ cdef class GoldParse:
cdef readonly list tags
cdef readonly list heads
cdef readonly list labels
cdef readonly dict orths
cdef readonly list ner
cdef readonly list ents

View File

@ -30,7 +30,7 @@ def read_docparse_file(loc):
iob_ents.append(iob_ent)
tokenized = [s.replace('<SEP>', ' ').split(' ')
for s in tok_text.split('<SENT>')]
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
return sents
def _parse_line(line):
@ -63,12 +63,14 @@ cdef class GoldParse:
self.heads = [-1] * len(tokens)
self.labels = ['MISSING'] * len(tokens)
self.ner = ['O'] * len(tokens)
self.orths = {}
idx_map = {token.idx: token.i for token in tokens}
self.ents = []
ent_start = None
ent_label = None
for idx, tag, head, label, ner in zip(*annot_tuples):
for idx, orth, tag, head, label, ner in zip(*annot_tuples):
self.orths[idx] = orth
if idx < tokens[0].idx:
pass
elif idx > tokens[-1].idx:
@ -133,5 +135,3 @@ def _map_indices_to_tokens(ids, heads):
else:
mapped.append(ids.index(head))
return mapped

View File

@ -74,7 +74,7 @@ cdef class BiluoPushDown(TransitionSystem):
move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
OUT: {'': True}}
moves = ('M', 'B', 'I', 'L', 'U')
for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
for (raw_text, toks, (ids, words, tags, heads, labels, biluo)) in gold_tuples:
for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-':
move_str, label = ner_tag.split('-')
@ -87,8 +87,7 @@ cdef class BiluoPushDown(TransitionSystem):
elif move == 'MISSING':
return 'M'
else:
labels = {id_: name for name, id_ in self.label_ids.items()}
return MOVE_NAMES[move] + '-' + labels[label]
return MOVE_NAMES[move] + '-' + self.strings[label]
cdef int preprocess_gold(self, GoldParse gold) except -1:
for i in range(gold.length):