mirror of https://github.com/explosion/spaCy.git
Fix GoldParse class
This commit is contained in:
parent
e5151056cf
commit
52b48b415e
|
@ -228,7 +228,7 @@ cdef class GoldParse:
|
||||||
if tags is None:
|
if tags is None:
|
||||||
tags = [None for _ in doc]
|
tags = [None for _ in doc]
|
||||||
if heads is None:
|
if heads is None:
|
||||||
heads = [None for _ in doc]
|
heads = [token.i for token in doc]
|
||||||
if deps is None:
|
if deps is None:
|
||||||
deps = [None for _ in doc]
|
deps = [None for _ in doc]
|
||||||
if entities is None:
|
if entities is None:
|
||||||
|
@ -261,12 +261,12 @@ cdef class GoldParse:
|
||||||
self.orig_annot = list(zip(*annot_tuples))
|
self.orig_annot = list(zip(*annot_tuples))
|
||||||
|
|
||||||
for i, gold_i in enumerate(self.cand_to_gold):
|
for i, gold_i in enumerate(self.cand_to_gold):
|
||||||
if doc[i].isspace():
|
if doc[i].text.isspace():
|
||||||
self.tags[i] = 'SP'
|
self.tags[i] = 'SP'
|
||||||
self.heads[i] = None
|
self.heads[i] = None
|
||||||
self.labels[i] = None
|
self.labels[i] = None
|
||||||
self.ner[i] = 'O'
|
self.ner[i] = 'O'
|
||||||
elif gold_i is None:
|
if gold_i is None:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
self.tags[i] = tags[gold_i]
|
self.tags[i] = tags[gold_i]
|
||||||
|
@ -307,7 +307,7 @@ def biluo_tags_from_offsets(doc, entities):
|
||||||
tags (list):
|
tags (list):
|
||||||
A list of unicode strings, describing the tags. Each tag string will
|
A list of unicode strings, describing the tags. Each tag string will
|
||||||
be of the form either "", "O" or "{action}-{label}", where action is one
|
be of the form either "-", "O" or "{action}-{label}", where action is one
|
||||||
of "B", "I", "L", "U". The empty string "" is used where the entity
|
of "B", "I", "L", "U". The string "-" is used where the entity
|
||||||
offsets don't align with the tokenization in the Doc object. The
|
offsets don't align with the tokenization in the Doc object. The
|
||||||
training algorithm will view these as missing values. "O" denotes
|
training algorithm will view these as missing values. "O" denotes
|
||||||
a non-entity token. "B" denotes the beginning of a multi-token entity,
|
a non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||||
|
@ -325,7 +325,7 @@ def biluo_tags_from_offsets(doc, entities):
|
||||||
'''
|
'''
|
||||||
starts = {token.idx: token.i for token in doc}
|
starts = {token.idx: token.i for token in doc}
|
||||||
ends = {token.idx+len(token): token.i for token in doc}
|
ends = {token.idx+len(token): token.i for token in doc}
|
||||||
biluo = ['' for _ in doc]
|
biluo = ['-' for _ in doc]
|
||||||
# Handle entity cases
|
# Handle entity cases
|
||||||
for start_char, end_char, label in entities:
|
for start_char, end_char, label in entities:
|
||||||
start_token = starts.get(start_char)
|
start_token = starts.get(start_char)
|
||||||
|
@ -355,13 +355,3 @@ def biluo_tags_from_offsets(doc, entities):
|
||||||
|
|
||||||
def is_punct_label(label):
|
def is_punct_label(label):
|
||||||
return label == 'P' or label.lower() == 'punct'
|
return label == 'P' or label.lower() == 'punct'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue