Fix TensorBoardLogger creating redundant experiment when finalizing (#14762)
Co-authored-by: Kushashwa Ravi Shrimali <kushashwaravishrimali@gmail.com>
parent 63ef41e0d0
commit 72fac45ff2
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed the availability check for the neptune-client package ([#14714](https://github.com/Lightning-AI/lightning/pull/14714))
 - Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656))
 - Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657), [#14724](https://github.com/Lightning-AI/lightning/pull/14724))
+- Fixed an issue with `TensorBoardLogger.finalize` creating a new experiment when none was created during the Trainer's execution ([#14762](https://github.com/Lightning-AI/lightning/pull/14762))


 ## [1.7.6] - 2022-09-13
@@ -267,8 +267,9 @@ class TensorBoardLogger(Logger):

     @rank_zero_only
     def finalize(self, status: str) -> None:
-        self.experiment.flush()
-        self.experiment.close()
+        if self._experiment is not None:
+            self.experiment.flush()
+            self.experiment.close()
         self.save()

     @property
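For context on the change above: the logger's experiment is created lazily, so the first access to the `experiment` property instantiates a `SummaryWriter` and, with it, a fresh `version_N` log directory on disk. Calling `self.experiment.flush()` unconditionally in `finalize` therefore spun up a brand-new writer even when nothing had been logged; guarding on the private `_experiment` attribute skips the flush/close entirely in that case. A minimal sketch of that lazy pattern (illustrative only, not Lightning's actual implementation; the `LazyLogger` name is made up):

    from typing import Optional

    from torch.utils.tensorboard import SummaryWriter


    class LazyLogger:
        """Sketch of a logger whose experiment is created on first use."""

        def __init__(self, log_dir: str) -> None:
            self.log_dir = log_dir
            self._experiment: Optional[SummaryWriter] = None  # nothing created up front

        @property
        def experiment(self) -> SummaryWriter:
            # First access constructs the writer and creates the log directory.
            if self._experiment is None:
                self._experiment = SummaryWriter(log_dir=self.log_dir)
            return self._experiment

        def log_metric(self, name: str, value: float, step: int) -> None:
            self.experiment.add_scalar(name, value, step)

        def finalize(self, status: str) -> None:
            # Going through `self.experiment` here would create a writer just to
            # flush and close it; checking the private attribute avoids that.
            if self._experiment is not None:
                self._experiment.flush()
                self._experiment.close()

With this shape, a logger that is finalized without ever logging leaves no trace on disk, which is what the new `test_tensorboard_finalize` further below asserts via the mocked `SummaryWriter`.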
@@ -869,18 +869,23 @@ def test_checkpoint_repeated_strategy(tmpdir):
         "limit_test_batches": 2,
         "enable_progress_bar": False,
         "enable_model_summary": False,
         "log_every_n_steps": 1,
         "default_root_dir": tmpdir,
     }
     trainer = Trainer(**trainer_kwargs, callbacks=[checkpoint_callback])
     trainer.fit(model)
-    assert os.listdir(tmpdir) == ["epoch=00.ckpt"]
+    assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"}

     for idx in range(4):
         # load from checkpoint
-        trainer = pl.Trainer(**trainer_kwargs, default_root_dir=tmpdir)
+        trainer = Trainer(**trainer_kwargs)
         trainer.fit(model, ckpt_path=checkpoint_callback.best_model_path)
         trainer.test(ckpt_path=checkpoint_callback.best_model_path, verbose=False)

-    assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"}
-    assert set(os.listdir(tmpdir / "lightning_logs")) == {f"version_{i}" for i in range(4)}
+    # no new versions created after the initial fit, because the ones that resume from ckpt do not log anything
+    assert set(os.listdir(tmpdir / "lightning_logs")) == {"version_0"}


 def test_checkpoint_repeated_strategy_extended(tmpdir):
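The resumed runs in the loop above no longer produce `version_1` through `version_3` directories because they never log anything, so no experiment (and no log directory) is ever forced into existence. The same effect can be observed with a standalone logger outside of a Trainer; a quick sketch, using a hypothetical `logs_demo` directory:

    import os

    from pytorch_lightning.loggers import TensorBoardLogger

    logger = TensorBoardLogger(save_dir="logs_demo")  # hypothetical path, nothing logged
    logger.finalize("success")

    # Before #14762, finalize() accessed `logger.experiment`, so an empty
    # version_0 directory (with an event file) appeared even though no metrics
    # were ever written. With the guard, nothing is created:
    print(os.path.exists("logs_demo"))  # expected: False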
@@ -891,6 +896,7 @@ def test_checkpoint_repeated_strategy_extended(tmpdir):
         def validation_step(self, batch, batch_idx):
             output = self.layer(batch)
             loss = self.loss(batch, output)
+            self.log("val_loss", loss)
             return {"val_loss": loss}

         def validation_epoch_end(self, *_):
@@ -930,7 +936,7 @@ def test_checkpoint_repeated_strategy_extended(tmpdir):
         limit_test_batches=4,
         callbacks=[checkpoint_cb],
     )
-    trainer = pl.Trainer(**trainer_config)
+    trainer = Trainer(**trainer_config)
     assert_trainer_init(trainer)

     model = ExtendedBoringModel()
@@ -290,7 +290,17 @@ def test_tensorboard_with_accummulated_gradients(mock_log_metrics, tmpdir):
 def test_tensorboard_finalize(summary_writer, tmpdir):
     """Test that the SummaryWriter closes in finalize."""
     logger = TensorBoardLogger(save_dir=tmpdir)
+    assert logger._experiment is None
     logger.finalize("any")
+
+    # no log calls, no experiment created -> nothing to flush
+    summary_writer.assert_not_called()
+
+    logger = TensorBoardLogger(save_dir=tmpdir)
+    logger.log_metrics({"flush_me": 11.1})  # trigger creation of an experiment
+    logger.finalize("any")
+
+    # finalize flushes to experiment directory
     summary_writer().flush.assert_called()
     summary_writer().close.assert_called()
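A note on how those assertions work: the test patches the `SummaryWriter` symbol, so the injected `summary_writer` argument is a `MagicMock`. Asserting on `summary_writer` itself checks whether the class was ever instantiated, while `summary_writer()` returns the mock's single shared `return_value`, which records the `flush`/`close` calls made on whatever instance the logger created. A small standalone illustration of that distinction (the `writer` name is just for the demo):

    from unittest import mock

    summary_writer = mock.MagicMock()       # what @mock.patch(...) hands to the test

    summary_writer.assert_not_called()      # no instantiation has happened yet

    writer = summary_writer()               # "constructing" the patched class
    writer.flush()
    writer.close()

    assert summary_writer() is writer       # every call returns the same return_value mock
    summary_writer().flush.assert_called()  # so the calls can be asserted after the fact
    summary_writer().close.assert_called()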