From d2a717d31e72db58b67fa6c6c704565f0da818c8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 8 Jul 2019 14:14:36 -0400 Subject: [PATCH] using slurm flag to find node nb --- examples/new_project_templates/trainer_gpu_cluster_template.py | 1 - pytorch_lightning/models/trainer.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/new_project_templates/trainer_gpu_cluster_template.py b/examples/new_project_templates/trainer_gpu_cluster_template.py index 70813ceb64..98cf87af33 100644 --- a/examples/new_project_templates/trainer_gpu_cluster_template.py +++ b/examples/new_project_templates/trainer_gpu_cluster_template.py @@ -46,7 +46,6 @@ def main(hparams, cluster, results_dict): # 1 INIT LIGHTNING MODEL # ------------------------ print('loading model...') - print(os.environ['SLURM_SRUN_COMM_HOST']) model = LightningTemplateModel(hparams) print('model built') diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index d07f52d5dc..4478081a20 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -361,6 +361,8 @@ class Trainer(TrainerIO): # saves the ip to disk ip_table_name = f'.ip_meta_' + os.environ['SLURM_JOB_ID'] ip_file = os.path.join(ip_file_dir, ip_table_name) + os.makedirs(ip_file_dir, exist_ok=True) + if world_gpu_nb == 0: # get the proc 0 IP root_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8')