Fix NER gold-standard around whitespace

This commit is contained in:
Matthew Honnibal 2019-08-29 14:33:07 +02:00
parent 216f63a987
commit 6511e1d8d3
1 changed file with 13 additions and 2 deletions

View File

@ -635,7 +635,7 @@ cdef class GoldParse:
self.tags[i] = "_SP" self.tags[i] = "_SP"
self.heads[i] = None self.heads[i] = None
self.labels[i] = None self.labels[i] = None
self.ner[i] = "O" self.ner[i] = None
self.morphology[i] = set() self.morphology[i] = set()
if gold_i is None: if gold_i is None:
if i in i2j_multi: if i in i2j_multi:
@ -686,9 +686,20 @@ cdef class GoldParse:
self.labels[i] = deps[gold_i] self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i] self.ner[i] = entities[gold_i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads) cycle = nonproj.contains_cycle(self.heads)
if cycle is not None: if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))
def __len__(self): def __len__(self):
"""Get the number of gold-standard tokens. """Get the number of gold-standard tokens.