Questionable fix for parser training bug with misaligned sentences (#6694)

* Questionable fix for parser training bug with misaligned sentences
* Fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2021-01-16 00:18:24 +11:00 · 2021-01-16 00:18:24 +11:00 · 7b3f0c6f1b
parent 330f9818c0
commit 7b3f0c6f1b
2 changed files with 26 additions and 4 deletions
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -204,6 +204,32 @@ cdef class ArcEagerGold:
    def update(self, StateClass stcls):
        """Update the gold state in `self.c` from the parser state `stcls.c`.

        Delegates to `update_gold_state`, passing `self.c` by address
        (presumably so it can be modified in place -- confirm against
        `update_gold_state`). This method has no return statement.

        stcls (StateClass): The parser state whose `c` member is passed on.
        """
        update_gold_state(&self.c, stcls.c)

+def _get_aligned_sent_starts(example):
+    """Get list of SENT_START attributes aligned to the predicted tokenization.
+    If the reference has no sentence starts, return a list of None values.
+
+    This function is slightly different from the one on Example, because we also
+    check whether any token of `example.x` aligns to more than one sentence of
+    `example.y`, and return missing values if so. This prevents a problem where
+    you have the start of a sentence merged onto a token that belongs to two
+    sentences.
+
+    example: The example to read from. `example.y` supplies the sentence
+        annotation and `example.x` determines the length of the result.
+    RETURNS (list): One entry per token in `example.x`. Either True/False
+        flags (True on the first aligned token of each sentence), or None
+        for every token when the sentence annotation is absent or unusable.
+    """
+    if example.y.has_annotation("SENT_START"):
+        align = example.alignment.y2x
+        # Default every token of x to "not a sentence start"; the loop below
+        # flips the first aligned token of each sentence to True.
+        sent_starts = [False] * len(example.x)
+        # Indices into x that have already been claimed by an earlier sentence.
+        seen_words = set()
+        for y_sent in example.y.sents:
+            # Indices into x aligned to this sentence's tokens in y.
+            # NOTE(review): `.dataXd` is presumably the flat index data of the
+            # alignment slice -- confirm against the alignment type.
+            x_indices = list(align[y_sent.start : y_sent.end].dataXd)
+            if any(x_idx in seen_words for x_idx in x_indices):
+                # If there are any tokens in X that align across two sentences,
+                # regard the sentence annotations as missing, as we can't
+                # reliably use them.
+                return [None] * len(example.x)
+            seen_words.update(x_indices)
+            # NOTE(review): assumes every sentence aligns to at least one
+            # token of x; an empty `x_indices` would raise IndexError here.
+            sent_starts[x_indices[0]] = True
+        return sent_starts
+    else:
+        # No sentence annotation on y: everything is missing.
+        return [None] * len(example.x)
+

 cdef int check_state_gold(char state_bits, char flag) nogil:
    cdef char one = 1
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -200,10 +200,6 @@ cdef class Example:
    def get_aligned_sent_starts(self):
        """Get list of SENT_START attributes aligned to the predicted tokenization.
        If the reference has not sentence starts, return a list of None values.
-
-        The aligned sentence starts use the get_aligned_spans method, rather
-        than aligning the list of tags, so that it handles cases where a mistaken
-        tokenization starts the sentence.
        """
        if self.y.has_annotation("SENT_START"):
            align = self.alignment.y2x