From c362006cb982543f8093050bb91c71bd591b7fbe Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Apr 2021 17:24:52 +0900
Subject: [PATCH] Fix is_sent_start when converting from JSON (fix #7635)
 (#7655)

Data in the JSON format is split into sentences, and each sentence is
saved with is_sent_start flags. Currently the flags are 1 for the first
token and 0 for the others. When deserialized this results in a pattern
of True, None, None, None... which makes single-sentence documents look
as though they haven't had sentence boundaries set.

Since items saved in JSON format have been split into sentences already,
the is_sent_start values should all be True or False. Non-initial tokens
are therefore now saved as -1 (False) instead of 0 (unset/None).
---
 spacy/training/gold_io.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 327748d01..69654e2c7 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -121,7 +121,7 @@ def json_to_annotations(doc):
                 if i == 0:
                     sent_starts.append(1)
                 else:
-                    sent_starts.append(0)
+                    sent_starts.append(-1)
             if "brackets" in sent:
                 brackets.extend((b["first"] + sent_start_i,
                                  b["last"] + sent_start_i, b["label"])