Fix GoldParse class

This commit is contained in:
Matthew Honnibal 2016-10-16 11:41:36 +02:00
parent e5151056cf
commit 52b48b415e
1 changed file with 5 additions and 15 deletions

View File

@@ -228,7 +228,7 @@ cdef class GoldParse:
if tags is None: if tags is None:
tags = [None for _ in doc] tags = [None for _ in doc]
if heads is None: if heads is None:
heads = [None for _ in doc] heads = [token.i for token in doc]
if deps is None: if deps is None:
deps = [None for _ in doc] deps = [None for _ in doc]
if entities is None: if entities is None:
@@ -261,12 +261,12 @@ cdef class GoldParse:
self.orig_annot = list(zip(*annot_tuples)) self.orig_annot = list(zip(*annot_tuples))
for i, gold_i in enumerate(self.cand_to_gold): for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].isspace(): if doc[i].text.isspace():
self.tags[i] = 'SP' self.tags[i] = 'SP'
self.heads[i] = None self.heads[i] = None
self.labels[i] = None self.labels[i] = None
self.ner[i] = 'O' self.ner[i] = 'O'
elif gold_i is None: if gold_i is None:
pass pass
else: else:
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
@@ -307,7 +307,7 @@ def biluo_tags_from_offsets(doc, entities):
tags (list): tags (list):
A list of unicode strings, describing the tags. Each tag string will A list of unicode strings, describing the tags. Each tag string will
be of the form either "", "O" or "{action}-{label}", where action is one be of the form either "", "O" or "{action}-{label}", where action is one
of "B", "I", "L", "U". The empty string "" is used where the entity of "B", "I", "L", "U". The string "-" is used where the entity
offsets don't align with the tokenization in the Doc object. The offsets don't align with the tokenization in the Doc object. The
training algorithm will view these as missing values. "O" denotes training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity, a non-entity token. "B" denotes the beginning of a multi-token entity,
@ -325,7 +325,7 @@ def biluo_tags_from_offsets(doc, entities):
''' '''
starts = {token.idx: token.i for token in doc} starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc} ends = {token.idx+len(token): token.i for token in doc}
biluo = ['' for _ in doc] biluo = ['-' for _ in doc]
# Handle entity cases # Handle entity cases
for start_char, end_char, label in entities: for start_char, end_char, label in entities:
start_token = starts.get(start_char) start_token = starts.get(start_char)
@@ -355,13 +355,3 @@ def biluo_tags_from_offsets(doc, entities):
def is_punct_label(label): def is_punct_label(label):
return label == 'P' or label.lower() == 'punct' return label == 'P' or label.lower() == 'punct'