# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path

import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync
from pytorch_lightning.utilities.distributed import sync_ddp_if_available
from tests.helpers import BoringDataModule, BoringModel
from tests.helpers.runif import RunIf


def _setup_ddp(rank, worldsize):
    import os

    os.environ["MASTER_ADDR"] = "localhost"

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=worldsize)


def _ddp_test_fn(rank, worldsize):
    _setup_ddp(rank, worldsize)
    tensor = torch.tensor([1.0])
    sync = _Sync(sync_ddp_if_available, should=True, op='SUM')
    actual = sync(tensor)
    assert actual.item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors"


@RunIf(skip_windows=True)
def test_result_reduce_ddp():
    """Make sure result logging works with DDP."""
    tutils.set_random_master_port()

    worldsize = 2
    mp.spawn(_ddp_test_fn, args=(worldsize, ), nprocs=worldsize)


@pytest.mark.parametrize(
    ["option", "do_train", "gpus"],
    [
        pytest.param(0, True, 0, id='full_loop'),
        pytest.param(0, False, 0, id='test_only'),
        pytest.param(
            1,
            False,
            0,
            id='test_only_mismatching_tensor',
            marks=pytest.mark.xfail(raises=ValueError, match="Mism.*"),
        ),
        pytest.param(2, False, 0, id='mix_of_tensor_dims'),
        pytest.param(3, False, 0, id='string_list_predictions'),
        pytest.param(4, False, 0, id='int_list_predictions'),
        pytest.param(5, False, 0, id='nested_list_predictions'),
        pytest.param(6, False, 0, id='dict_list_predictions'),
        pytest.param(7, True, 0, id='write_dict_predictions'),
        pytest.param(0, True, 1, id='full_loop_single_gpu', marks=RunIf(min_gpus=1)),
    ],
)
def test_write_predictions(tmpdir, option: int, do_train: bool, gpus: int):
    """Make sure `write_prediction(_dict)` handles tensors, lists, and dicts across train/test-only runs."""

    class CustomBoringModel(BoringModel):

        def test_step(self, batch, batch_idx, optimizer_idx=None):
            output = self(batch)
            test_loss = self.loss(batch, output)
            self.log('test_loss', test_loss)

            batch_size = batch.size(0)
            lst_of_str = [random.choice(['dog', 'cat']) for i in range(batch_size)]
            lst_of_int = [random.randint(500, 1000) for i in range(batch_size)]
            lst_of_lst = [[x] for x in lst_of_int]
            lst_of_dict = [{k: v} for k, v in zip(lst_of_str, lst_of_int)]

            prediction_file = getattr(self, 'prediction_file', 'predictions.pt')

            lazy_ids = torch.arange(batch_idx * batch_size, batch_idx * batch_size + batch_size)

            # Base
            if option == 0:
                self.write_prediction('idxs', lazy_ids, prediction_file)
                self.write_prediction('preds', output, prediction_file)

            # Check mismatching tensor len
            elif option == 1:
                self.write_prediction('idxs', torch.cat((lazy_ids, lazy_ids)), prediction_file)
                self.write_prediction('preds', output, prediction_file)

            # write multi-dimension
            elif option == 2:
                self.write_prediction('idxs', lazy_ids, prediction_file)
                self.write_prediction('preds', output, prediction_file)
                self.write_prediction('x', batch, prediction_file)

            # write str list
            elif option == 3:
                self.write_prediction('idxs', lazy_ids, prediction_file)
                self.write_prediction('vals', lst_of_str, prediction_file)

            # write int list
            elif option == 4:
                self.write_prediction('idxs', lazy_ids, prediction_file)
                self.write_prediction('vals', lst_of_int, prediction_file)

            # write nested list
            elif option == 5:
                self.write_prediction('idxs', lazy_ids, prediction_file)
                self.write_prediction('vals', lst_of_lst, prediction_file)

            # write dict list
            elif option == 6:
                self.write_prediction('idxs', lazy_ids, prediction_file)
                self.write_prediction('vals', lst_of_dict, prediction_file)

            # write everything at once as a dict
            elif option == 7:
                self.write_prediction_dict({'idxs': lazy_ids, 'preds': output}, prediction_file)

    prediction_file = Path(tmpdir) / 'predictions.pt'

    dm = BoringDataModule()
    model = CustomBoringModel()
    model.test_epoch_end = None
    model.prediction_file = prediction_file.as_posix()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
        gpus=gpus,
    )

    # Prediction file shouldn't exist yet because we haven't done anything
    assert not prediction_file.exists()

    if do_train:
        trainer.fit(model, dm)
        assert trainer.state.finished, f"Training failed with {trainer.state}"
        trainer.test(datamodule=dm)
    else:
        trainer.test(model, datamodule=dm)

    # check prediction file now exists and is of expected length
    assert prediction_file.exists()
    predictions = torch.load(prediction_file)
    assert len(predictions) == len(dm.random_test)