Questionable fix for parser training bug with misaligned sentences (#6694)

* Questionable fix for parser training bug with misaligned sentences
* Fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2021-01-16 00:18:24 +11:00 · 2021-01-16 00:18:24 +11:00 · 7b3f0c6f1b
parent 330f9818c0
commit 7b3f0c6f1b
2 changed files with 26 additions and 4 deletions
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -204,6 +204,32 @@ cdef class ArcEagerGold:
    def update(self, StateClass stcls):
        # Pass a pointer to this object's `c` member, together with the `c`
        # member of `stcls`, to `update_gold_state`.
        # NOTE(review): presumably this re-syncs the gold bookkeeping with the
        # current parser state -- confirm against `update_gold_state`.
        update_gold_state(&self.c, stcls.c)
 def _get_aligned_sent_starts(example):
    """Get list of SENT_START attributes aligned to the predicted tokenization.

    If the reference has no sentence starts, return a list of None values.

    This function is slightly different from the one on Example, because we also
    check whether the reference sentences align across multiple sentences,
    and return missing values if they do. This prevents a problem where you have
    the start of a sentence merged onto a token that belongs to two sentences.
    Missing values are also returned when a reference sentence aligns to no
    predicted token at all, because there is then no token that could carry
    the sentence start.

    example: Object exposing `x` (predicted), `y` (reference) and `alignment`.
    RETURNS (list): One entry per token of `example.x`. Entries are True/False
        when the annotation could be aligned, or all None when it is absent or
        cannot be aligned reliably.
    """
    n_words = len(example.x)
    if not example.y.has_annotation("SENT_START"):
        return [None] * n_words
    align = example.alignment.y2x
    sent_starts = [False] * n_words
    seen_words = set()
    for y_sent in example.y.sents:
        x_indices = list(align[y_sent.start : y_sent.end].dataXd)
        if not x_indices:
            # The reference sentence has no counterpart in the predicted
            # tokenization, so indexing x_indices[0] would fail. Regard the
            # sentence annotations as missing rather than guessing.
            return [None] * n_words
        if not seen_words.isdisjoint(x_indices):
            # If there are any tokens in X that align across two sentences,
            # regard the sentence annotations as missing, as we can't
            # reliably use them.
            return [None] * n_words
        seen_words.update(x_indices)
        sent_starts[x_indices[0]] = True
    return sent_starts
 cdef int check_state_gold(char state_bits, char flag) nogil:
    cdef char one = 1
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -200,10 +200,6 @@ cdef class Example:
    def get_aligned_sent_starts(self):
        """Get list of SENT_START attributes aligned to the predicted tokenization.
        If the reference has not sentence starts, return a list of None values.
        The aligned sentence starts use the get_aligned_spans method, rather
        than aligning the list of tags, so that it handles cases where a mistaken
        tokenization starts the sentence.
        """
        if self.y.has_annotation("SENT_START"):
            align = self.alignment.y2x