diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 96c5b676e..54eedf69e 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -224,8 +224,9 @@ def train_from_config(
 
 
 def create_train_batches(nlp, corpus, cfg):
+    is_first = True
     while True:
-        train_examples = list(corpus.train_dataset(
+        train_examples = corpus.train_dataset(
             nlp,
             noise_level=0.0,
             orth_variant_level=cfg["orth_variant_level"],
@@ -323,6 +324,7 @@ def train_while_improving(
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
         for name, proc in nlp.pipeline:
-            proc.model.finish_update(optimizer)
+            if hasattr(proc, "model"):
+                proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 69f5bd6f6..60d22a1ab 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -474,7 +474,11 @@ cdef class precompute_hiddens:
             # This will usually be on GPU
             d_best = ops.asarray(d_best)
             # Fix nans (which can occur from unseen classes.)
-            d_best[ops.xp.isnan(d_best)] = 0.
+            try:
+                d_best[ops.xp.isnan(d_best)] = 0.
+            except Exception:
+                print(ops.xp.isnan(d_best))
+                raise
             if self.activation == "maxout":
                 mask_ = ops.asarray(mask)
                 return ops.backprop_maxout(d_best, mask_, self.nP)
diff --git a/spacy/util.py b/spacy/util.py
index f39813694..7f35c2f7c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -598,16 +598,9 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0
         try:
             example = next(examples)
         except StopIteration:
-            if oversize:
-                examples = iter(oversize)
-                oversize = []
-                if batch:
-                    yield batch
-                break
-            else:
-                if batch:
-                    yield batch
-                return
+            if batch:
+                yield batch
+            return
         n_words = count_words(example.doc)
         if n_words < (batch_size + tol_size):
             batch_size -= n_words