From 64adda32029b867b25bc6f3313863abfc70a6fd1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 29 May 2020 23:21:55 +0200
Subject: [PATCH] Revert "Remove peeking from Parser.begin_training (#5456)"

This reverts commit 9393253b66b5f9fc6c5e58806cf261da5afd1778.

The model shouldn't need to see all examples, and in v3 there's actually
no equivalent step. All examples are provided to the component, so that
the component can do things like figuring out the labels. The model only
needs a sample of the examples, for things like shape inference.
---
 spacy/syntax/nn_parser.pyx | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index fafa492c6..d5c6bf2a8 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -9,6 +9,7 @@ import numpy
 cimport cython.parallel
 import numpy.random
 cimport numpy as np
+from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -620,15 +621,15 @@ cdef class Parser:
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
             if sgd is None:
                 sgd = self.create_optimizer()
-            docs = []
-            golds = []
-            for raw_text, annots_brackets in get_gold_tuples():
+            doc_sample = []
+            gold_sample = []
+            for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
                 for annots, brackets in annots_brackets:
                     ids, words, tags, heads, deps, ents = annots
-                    docs.append(Doc(self.vocab, words=words))
-                    golds.append(GoldParse(docs[-1], words=words, tags=tags,
-                                           heads=heads, deps=deps, entities=ents))
-            self.model.begin_training(docs, golds)
+                    doc_sample.append(Doc(self.vocab, words=words))
+                    gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
+                                                 heads=heads, deps=deps, entities=ents))
+            self.model.begin_training(doc_sample, gold_sample)
             if pipeline is not None:
                 self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)