From b37b58a73e74e173f4efc674a9ac2d4bb978e422 Mon Sep 17 00:00:00 2001 From: "Hinrich B. Winther" Date: Tue, 13 Apr 2021 11:18:52 +0200 Subject: [PATCH] Fix Checkpoint issue when using Horovod distributed backend (PyTorchLightning#6947) (#6958) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Adrian Wälchli Co-authored-by: Adrian Wälchli --- pytorch_lightning/plugins/training_type/horovod.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 8d0add27cb..415de21aca 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -150,7 +150,7 @@ class HorovodPlugin(ParallelPlugin): if reduce_op in (None, "avg", "mean"): reduce_op = hvd.Average - elif reduce_op == "sum": + elif reduce_op in ("sum", ReduceOp.SUM): reduce_op = hvd.Sum else: raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")