mirror of https://github.com/explosion/spaCy.git
Questionable fix for parser training bug with misaligned sentences (#6694)
* Questionable fix for parser training bug with misaligned sentences * Fix Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
330f9818c0
commit
7b3f0c6f1b
|
@ -204,6 +204,32 @@ cdef class ArcEagerGold:
|
|||
def update(self, StateClass stcls):
|
||||
update_gold_state(&self.c, stcls.c)
|
||||
|
||||
def _get_aligned_sent_starts(example):
|
||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||
If the reference has not sentence starts, return a list of None values.
|
||||
|
||||
This function is slightly different from the one on Example, because we also
|
||||
check whether the reference sentences align across multiple sentences,
|
||||
and return missing values if they do. This prevents a problem where you have
|
||||
the start of a sentence merged onto a token that belongs to two sentences.
|
||||
"""
|
||||
if example.y.has_annotation("SENT_START"):
|
||||
align = example.alignment.y2x
|
||||
sent_starts = [False] * len(example.x)
|
||||
seen_words = set()
|
||||
for y_sent in example.y.sents:
|
||||
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
|
||||
if any(x_idx in seen_words for x_idx in x_indices):
|
||||
# If there are any tokens in X that align across two sentences,
|
||||
# regard the sentence annotations as missing, as we can't
|
||||
# reliably use them.
|
||||
return [None] * len(example.x)
|
||||
seen_words.update(x_indices)
|
||||
sent_starts[x_indices[0]] = True
|
||||
return sent_starts
|
||||
else:
|
||||
return [None] * len(example.x)
|
||||
|
||||
|
||||
cdef int check_state_gold(char state_bits, char flag) nogil:
|
||||
cdef char one = 1
|
||||
|
|
|
@ -200,10 +200,6 @@ cdef class Example:
|
|||
def get_aligned_sent_starts(self):
|
||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||
If the reference has not sentence starts, return a list of None values.
|
||||
|
||||
The aligned sentence starts use the get_aligned_spans method, rather
|
||||
than aligning the list of tags, so that it handles cases where a mistaken
|
||||
tokenization starts the sentence.
|
||||
"""
|
||||
if self.y.has_annotation("SENT_START"):
|
||||
align = self.alignment.y2x
|
||||
|
|
Loading…
Reference in New Issue