From c362006cb982543f8093050bb91c71bd591b7fbe Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 8 Apr 2021 17:24:52 +0900 Subject: [PATCH] Fix is_sent_start when converting from JSON (fix #7635) (#7655) Data in the JSON format is split into sentences, and each sentence is saved with is_sent_start flags. Currently the flags are 1 for the first token and 0 for the others. Because 0 is read back as "unset" rather than as False, deserializing this results in a pattern of True, None, None, None... which makes single-sentence documents look as though they haven't had sentence boundaries set. Since items saved in JSON format have been split into sentences already, the is_sent_start values should all be True or False, so non-initial tokens are now written as -1 (False) instead of 0. --- spacy/training/gold_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 327748d01..69654e2c7 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -121,7 +121,7 @@ def json_to_annotations(doc): if i == 0: sent_starts.append(1) else: - sent_starts.append(0) + sent_starts.append(-1) if "brackets" in sent: brackets.extend((b["first"] + sent_start_i, b["last"] + sent_start_i, b["label"])