Questionable fix for parser training bug with misaligned sentences (#6694)

* Questionable fix for parser training bug with misaligned sentences

* Fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Matthew Honnibal 2021-01-16 00:18:24 +11:00 committed by GitHub
parent 330f9818c0
commit 7b3f0c6f1b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 4 deletions

View File

@ -204,6 +204,32 @@ cdef class ArcEagerGold:
def update(self, StateClass stcls): def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c) update_gold_state(&self.c, stcls.c)
def _get_aligned_sent_starts(example):
"""Get list of SENT_START attributes aligned to the predicted tokenization.
If the reference has not sentence starts, return a list of None values.
This function is slightly different from the one on Example, because we also
check whether the reference sentences align across multiple sentences,
and return missing values if they do. This prevents a problem where you have
the start of a sentence merged onto a token that belongs to two sentences.
"""
if example.y.has_annotation("SENT_START"):
align = example.alignment.y2x
sent_starts = [False] * len(example.x)
seen_words = set()
for y_sent in example.y.sents:
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
if any(x_idx in seen_words for x_idx in x_indices):
# If there are any tokens in X that align across two sentences,
# regard the sentence annotations as missing, as we can't
# reliably use them.
return [None] * len(example.x)
seen_words.update(x_indices)
sent_starts[x_indices[0]] = True
return sent_starts
else:
return [None] * len(example.x)
cdef int check_state_gold(char state_bits, char flag) nogil: cdef int check_state_gold(char state_bits, char flag) nogil:
cdef char one = 1 cdef char one = 1

View File

@ -200,10 +200,6 @@ cdef class Example:
def get_aligned_sent_starts(self): def get_aligned_sent_starts(self):
"""Get list of SENT_START attributes aligned to the predicted tokenization. """Get list of SENT_START attributes aligned to the predicted tokenization.
If the reference has not sentence starts, return a list of None values. If the reference has not sentence starts, return a list of None values.
The aligned sentence starts use the get_aligned_spans method, rather
than aligning the list of tags, so that it handles cases where a mistaken
tokenization starts the sentence.
""" """
if self.y.has_annotation("SENT_START"): if self.y.has_annotation("SENT_START"):
align = self.alignment.y2x align = self.alignment.y2x