diff --git a/examples/new_project_templates/trainer_gpu_cluster_template.py b/examples/new_project_templates/trainer_gpu_cluster_template.py index 70813ceb64..98cf87af33 100644 --- a/examples/new_project_templates/trainer_gpu_cluster_template.py +++ b/examples/new_project_templates/trainer_gpu_cluster_template.py @@ -46,7 +46,6 @@ def main(hparams, cluster, results_dict): # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') - print(os.environ['SLURM_SRUN_COMM_HOST']) model = LightningTemplateModel(hparams) print('model built') diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index d07f52d5dc..4478081a20 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -361,6 +361,8 @@ class Trainer(TrainerIO): # saves the ip to disk ip_table_name = f'.ip_meta_' + os.environ['SLURM_JOB_ID'] ip_file = os.path.join(ip_file_dir, ip_table_name) + os.makedirs(ip_file_dir, exist_ok=True) + if world_gpu_nb == 0: # get the proc 0 IP root_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8')