From 58f19421b1a1630099c65cc0e651c0fa21a9f052 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 29 Aug 2020 03:46:50 +0200
Subject: [PATCH] Return empty batch from tok2vec listener if no doc.tensor

---
 spacy/pipeline/tok2vec.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index f2d138cf7..dad66ddb3 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -295,4 +295,19 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
         model.verify_inputs(inputs)
         return model._outputs, model._backprop
     else:
-        return [doc.tensor for doc in inputs], lambda dX: []
+        # This is pretty grim, but it's hard to do better :(.
+        # It's hard to avoid relying on the doc.tensor attribute, because the
+        # pipeline components can batch the data differently during prediction.
+        # That doesn't happen in update, where the nlp object works on batches
+        # of data.
+        # When the components batch differently, we don't receive a matching
+        # prediction from the upstream, so we can't predict.
+        if not all(doc.tensor.size for doc in inputs):
+            # But we do need to do *something* if the tensor hasn't been set.
+            # The compromise is to at least return data of the right shape,
+            # so the output is valid.
+            width = model.get_dim("nO")
+            outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs]
+        else:
+            outputs = [doc.tensor for doc in inputs]
+        return outputs, lambda dX: []