From 64adda32029b867b25bc6f3313863abfc70a6fd1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 29 May 2020 23:21:55 +0200 Subject: [PATCH] Revert "Remove peeking from Parser.begin_training (#5456)" This reverts commit 9393253b66b5f9fc6c5e58806cf261da5afd1778. The model shouldn't need to see all examples, and in v3 there is actually no equivalent step. All examples are provided to the component, so that the component can do things like figuring out the labels. The model itself just needs to do things like shape inference, so a sample of the examples is enough. --- spacy/syntax/nn_parser.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fafa492c6..d5c6bf2a8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,6 +9,7 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np +from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -620,15 +621,15 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() - docs = [] - golds = [] - for raw_text, annots_brackets in get_gold_tuples(): + doc_sample = [] + gold_sample = [] + for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - docs.append(Doc(self.vocab, words=words)) - golds.append(GoldParse(docs[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(docs, golds) + doc_sample.append(Doc(self.vocab, words=words)) + gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, + heads=heads, deps=deps, entities=ents)) + self.model.begin_training(doc_sample, gold_sample) if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab)