mirror of https://github.com/explosion/spaCy.git
Questionable fix for parser training bug with misaligned sentences (#6694)
* Questionable fix for parser training bug with misaligned sentences
* Fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
330f9818c0
commit
7b3f0c6f1b
|
@ -204,6 +204,32 @@ cdef class ArcEagerGold:
|
||||||
def update(self, StateClass stcls):
|
def update(self, StateClass stcls):
|
||||||
update_gold_state(&self.c, stcls.c)
|
update_gold_state(&self.c, stcls.c)
|
||||||
|
|
||||||
|
def _get_aligned_sent_starts(example):
|
||||||
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
|
If the reference has not sentence starts, return a list of None values.
|
||||||
|
|
||||||
|
This function is slightly different from the one on Example, because we also
|
||||||
|
check whether the reference sentences align across multiple sentences,
|
||||||
|
and return missing values if they do. This prevents a problem where you have
|
||||||
|
the start of a sentence merged onto a token that belongs to two sentences.
|
||||||
|
"""
|
||||||
|
if example.y.has_annotation("SENT_START"):
|
||||||
|
align = example.alignment.y2x
|
||||||
|
sent_starts = [False] * len(example.x)
|
||||||
|
seen_words = set()
|
||||||
|
for y_sent in example.y.sents:
|
||||||
|
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
|
||||||
|
if any(x_idx in seen_words for x_idx in x_indices):
|
||||||
|
# If there are any tokens in X that align across two sentences,
|
||||||
|
# regard the sentence annotations as missing, as we can't
|
||||||
|
# reliably use them.
|
||||||
|
return [None] * len(example.x)
|
||||||
|
seen_words.update(x_indices)
|
||||||
|
sent_starts[x_indices[0]] = True
|
||||||
|
return sent_starts
|
||||||
|
else:
|
||||||
|
return [None] * len(example.x)
|
||||||
|
|
||||||
|
|
||||||
cdef int check_state_gold(char state_bits, char flag) nogil:
|
cdef int check_state_gold(char state_bits, char flag) nogil:
|
||||||
cdef char one = 1
|
cdef char one = 1
|
||||||
|
|
|
@ -200,10 +200,6 @@ cdef class Example:
|
||||||
def get_aligned_sent_starts(self):
|
def get_aligned_sent_starts(self):
|
||||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
If the reference has not sentence starts, return a list of None values.
|
If the reference has not sentence starts, return a list of None values.
|
||||||
|
|
||||||
The aligned sentence starts use the get_aligned_spans method, rather
|
|
||||||
than aligning the list of tags, so that it handles cases where a mistaken
|
|
||||||
tokenization starts the sentence.
|
|
||||||
"""
|
"""
|
||||||
if self.y.has_annotation("SENT_START"):
|
if self.y.has_annotation("SENT_START"):
|
||||||
align = self.alignment.y2x
|
align = self.alignment.y2x
|
||||||
|
|
Loading…
Reference in New Issue