From 58f19421b1a1630099c65cc0e651c0fa21a9f052 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 29 Aug 2020 03:46:50 +0200
Subject: [PATCH] Return empty batch from tok2vec listener if no doc.tensor

---
 spacy/pipeline/tok2vec.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index f2d138cf7..dad66ddb3 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -295,4 +295,19 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
         model.verify_inputs(inputs)
         return model._outputs, model._backprop
     else:
-        return [doc.tensor for doc in inputs], lambda dX: []
+        # This is pretty grim, but it's hard to do better :(.
+        # It's hard to avoid relying on the doc.tensor attribute, because the
+        # pipeline components can batch the data differently during prediction.
+        # That doesn't happen in update, where the nlp object works on batches
+        # of data.
+        # When the components batch differently, we don't receive a matching
+        # prediction from the upstream component, so we can't predict.
+        if not all(doc.tensor.size for doc in inputs):
+            # But we do need to do *something* if any doc's tensor is unset.
+            # The compromise is to return arrays of the right shape for every
+            # doc in the batch, so that the output is at least valid.
+            width = model.get_dim("nO")
+            outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs]
+        else:
+            outputs = [doc.tensor for doc in inputs]
+        return outputs, lambda dX: []