From 08e1ab64b528a15c7c165c946989c94dde7a4b87 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 15:53:45 -0400
Subject: [PATCH 01/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 725ac14781..64fbf16bb5 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -375,6 +375,7 @@ class Trainer(TrainerIO):
         :param tries:
         :return:
         """
+        sleep(self.proc_rank*2)
         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         os.environ['MASTER_PORT'] = f'{port}'

From 91b869d04300bda906306a15f018520e745a75ff Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 15:55:28 -0400
Subject: [PATCH 02/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 64fbf16bb5..fb08a3376b 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -375,7 +375,10 @@ class Trainer(TrainerIO):
         :param tries:
         :return:
         """
-        sleep(self.proc_rank*2)
+        # hack to get nccl to stop throwing error... seems to be an nccl race condition
+        if self.proc_rank > 0:
+            sleep(10.0)
+
         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         os.environ['MASTER_PORT'] = f'{port}'

From d99b121379f3706e6d97393ef34f4efdec8d9d8c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 15:59:12 -0400
Subject: [PATCH 03/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index fb08a3376b..193ebc01a1 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -382,7 +382,8 @@ class Trainer(TrainerIO):
         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         os.environ['MASTER_PORT'] = f'{port}'
-        dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)
+        # dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)
+        dist.init_process_group("nccl")

     def __run_pretrain_routine(self, model):
         """

From c244599ae8c9575d34f7f25942ed06a1ccd87f2a Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 15:59:33 -0400
Subject: [PATCH 04/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 193ebc01a1..d7b0b9c083 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -375,9 +375,6 @@ class Trainer(TrainerIO):
         :param tries:
         :return:
         """
-        # hack to get nccl to stop throwing error... seems to be an nccl race condition
-        if self.proc_rank > 0:
-            sleep(10.0)

         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         os.environ['MASTER_PORT'] = f'{port}'

From c84700814d2fc7cb3697909d174caa5453579f47 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:03:17 -0400
Subject: [PATCH 05/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index d7b0b9c083..3520f23ec5 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -379,8 +379,8 @@ class Trainer(TrainerIO):
         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         os.environ['MASTER_PORT'] = f'{port}'
-        # dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)
-        dist.init_process_group("nccl")
+        dist.init_process_group("nccl", rank=self.proc_rank)
+        # dist.init_process_group("nccl")

     def __run_pretrain_routine(self, model):
         """

From 0bd81db5387d596fb3c7c76b9d3110f78cdc0ede Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:05:46 -0400
Subject: [PATCH 06/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 3520f23ec5..6c5eaa82c5 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -367,7 +367,7 @@ class Trainer(TrainerIO):
         # continue training routine
         self.__run_pretrain_routine(model)

-    def __init_tcp_connection(self, port=12975):
+    def __init_tcp_connection(self, port=12945):
         """
         Connect all procs in the world using the env:// init
         Use the first node as the root address
@@ -379,8 +379,7 @@ class Trainer(TrainerIO):
         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         os.environ['MASTER_PORT'] = f'{port}'
-        dist.init_process_group("nccl", rank=self.proc_rank)
-        # dist.init_process_group("nccl")
+        dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)

     def __run_pretrain_routine(self, model):
         """

From 6219f24a03469569dddabf6be56d990d78923503 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:07:57 -0400
Subject: [PATCH 07/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 6c5eaa82c5..b02ba332b9 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -367,7 +367,7 @@ class Trainer(TrainerIO):
         # continue training routine
         self.__run_pretrain_routine(model)

-    def __init_tcp_connection(self, port=12945):
+    def __init_tcp_connection(self):
         """
         Connect all procs in the world using the env:// init
         Use the first node as the root address
@@ -375,6 +375,10 @@ class Trainer(TrainerIO):
         :param tries:
         :return:
         """
+        try:
+            port = os.environ['MASTER_PORT']
+        except Exception as e:
+            port = 12910

         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node

From 3bf366bcd80636c56cf4a1106666a64df0400e52 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:08:23 -0400
Subject: [PATCH 08/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index b02ba332b9..f9d9634167 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -376,13 +376,13 @@ class Trainer(TrainerIO):
         :return:
         """
         try:
-            port = os.environ['MASTER_PORT']
+            os.environ['MASTER_PORT']
         except Exception as e:
             port = 12910
+            os.environ['MASTER_PORT'] = f'{port}'

         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
-        os.environ['MASTER_PORT'] = f'{port}'
         dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)

     def __run_pretrain_routine(self, model):

From 7e54ad3f7c82114c1cbbc88e2c67d0760dd52c35 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:16:46 -0400
Subject: [PATCH 09/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index f9d9634167..679661e796 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -381,6 +381,8 @@ class Trainer(TrainerIO):
             port = 12910
             os.environ['MASTER_PORT'] = f'{port}'

+        sleep(self.proc_rank * 2)
+
         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
         dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)

From 4f5da45fae36adb4ee8a1c6076d100038dcf0e46 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:17:50 -0400
Subject: [PATCH 10/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 679661e796..6e0e9591f7 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -381,6 +381,10 @@ class Trainer(TrainerIO):
             port = 12910
             os.environ['MASTER_PORT'] = f'{port}'

+        print('-'*100)
+        print(f'PORT: {port}')
+        print('-'*100)
+
         sleep(self.proc_rank * 2)

         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]

From faa2d4fa8b48ae974c4b573b6ebcb7ebd5796066 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:23:20 -0400
Subject: [PATCH 11/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 6e0e9591f7..68658c61b5 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -376,7 +376,7 @@ class Trainer(TrainerIO):
         :return:
         """
         try:
-            os.environ['MASTER_PORT']
+            port = os.environ['MASTER_PORT']
         except Exception as e:
             port = 12910
             os.environ['MASTER_PORT'] = f'{port}'

From b7baa961862ac022450800e335da121bd3c86b8c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:29:44 -0400
Subject: [PATCH 12/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 68658c61b5..9c58c05b8f 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -339,10 +339,7 @@ class Trainer(TrainerIO):
         # set up server using proc 0's ip address
         # try to init for 20 times at max in case ports are taken
         # where to store ip_table
-        print('-'*100)
-        print('INIT CONN')
         self.__init_tcp_connection()
-        print('-'*100)

         # CHOOSE OPTIMIZER
         # filter out the weights that were done on gpu so we can load on good old cpus

From 9f41a9e8b78d8774334b48aeb7d0707f7c459de2 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Fri, 12 Jul 2019 16:35:20 -0400
Subject: [PATCH 13/17] fixed nccl init

---
 pytorch_lightning/models/trainer.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 9c58c05b8f..c7bbc1a678 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -378,11 +378,7 @@ class Trainer(TrainerIO):
             port = 12910
             os.environ['MASTER_PORT'] = f'{port}'

-        print('-'*100)
-        print(f'PORT: {port}')
-        print('-'*100)
-
-        sleep(self.proc_rank * 2)
+        sleep(self.proc_rank*0.5)

         root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node

From 8b0cda84e70e573c16eb69eae5cae60a910e209c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Jul 2019 10:13:52 -0400
Subject: [PATCH 14/17] added fallback local init

---
 pytorch_lightning/models/trainer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index c7bbc1a678..8fc4d5ee66 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -378,10 +378,14 @@ class Trainer(TrainerIO):
             port = 12910
             os.environ['MASTER_PORT'] = f'{port}'

-        sleep(self.proc_rank*0.5)
+        try:
+            root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
+        except Exception as e:
+            root_node = '127.0.0.2'

-        root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
         os.environ['MASTER_ADDR'] = root_node
+
+        sleep(self.proc_rank*0.5)
         dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)

From 52a98d76d886d55863e0359e165d48f76c894e83 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Jul 2019 10:16:50 -0400
Subject: [PATCH 15/17] added fallback local init

---
 pytorch_lightning/models/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 8fc4d5ee66..b27996a797 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -597,7 +597,7 @@ class Trainer(TrainerIO):
         try:
             loss = output['loss']
         except Exception as e:
-            if type(loss) is torch.Tensor:
+            if type(output) is torch.Tensor:
                 loss = output

         self.__add_tqdm_metrics(model_specific_tqdm_metrics_dic)

From 9ccfc7bd3339f448c5afc879229be691c82a2fca Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Jul 2019 22:03:36 -0400
Subject: [PATCH 16/17] added fallback local init

---
 pytorch_lightning/models/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index b27996a797..deccaabffc 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -291,7 +291,7 @@ class Trainer(TrainerIO):

         # when using gpus, first thing we do is spawn a new process between each worker
         # applies to single gpu, multi-gpu and multi-nodes
-        if self.on_gpu:
+        if self.on_gpu and len(self.data_parallel_device_ids) > 1:
             self.experiment = self.experiment.get_meta_copy()
             mp.spawn(self.dp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
         else:

From 7c688fbf2e33926fa33ee68f087e4e99c745b520 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Jul 2019 22:09:17 -0400
Subject: [PATCH 17/17] enabling gpu size = 1 to run without data parallel

---
 pytorch_lightning/models/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index deccaabffc..05bfb1e1dc 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -96,7 +96,7 @@ class Trainer(TrainerIO):
             os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
             os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids])

-        self.data_parallel = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0
+        self.data_parallel = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 1

         # process info
         self.proc_rank = 0
@@ -291,7 +291,7 @@ class Trainer(TrainerIO):

         # when using gpus, first thing we do is spawn a new process between each worker
         # applies to single gpu, multi-gpu and multi-nodes
-        if self.on_gpu and len(self.data_parallel_device_ids) > 1:
+        if self.data_parallel:
             self.experiment = self.experiment.get_meta_copy()
             mp.spawn(self.dp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
         else:
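
Taken together, patches 01 through 14 settle on one shape for the connection setup. The sketch below is a consolidation assembled from the hunks above rather than a verbatim copy of trainer.py: the standalone function name init_tcp_connection and the explicit proc_rank/world_size parameters are illustrative stand-ins for the Trainer method and its attributes.

import os
from time import sleep

import torch.distributed as dist


def init_tcp_connection(proc_rank, world_size):
    """Connect all procs in the world using the env:// init.

    Uses the first SLURM node as the root address, with local fallbacks
    for the master port and root node (patches 07, 08, 11 and 14).
    """
    # reuse the port handed down by the scheduler, otherwise fall back to a fixed one
    try:
        port = os.environ['MASTER_PORT']
    except Exception:
        port = 12910
        os.environ['MASTER_PORT'] = f'{port}'

    # on SLURM the first node in the list is the root; locally fall back to a fixed address
    try:
        root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
    except Exception:
        root_node = '127.0.0.2'
    os.environ['MASTER_ADDR'] = root_node

    # stagger the ranks slightly to sidestep the NCCL init race seen earlier in the series
    sleep(proc_rank * 0.5)
    dist.init_process_group("nccl", rank=proc_rank, world_size=world_size)

The env:// rendezvous reads MASTER_ADDR and MASTER_PORT from the environment, which is why both are exported before init_process_group is called.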
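
Patches 16 and 17 change how training is dispatched: data_parallel is now true only when more than one device id is configured, and mp.spawn is used only in that case, so a single GPU (or CPU) run stays in the parent process. A minimal sketch of that gating, assuming a hypothetical run_single_process for the non-spawned path and a dp_train worker as in the hunks above:

import torch.multiprocessing as mp


class Trainer:
    def __init__(self, data_parallel_device_ids=None):
        self.data_parallel_device_ids = data_parallel_device_ids
        # after patch 17: parallel only when more than one device id is given
        self.data_parallel = (
            self.data_parallel_device_ids is not None
            and len(self.data_parallel_device_ids) > 1
        )

    def fit(self, model):
        if self.data_parallel:
            # one worker process per device, as in patches 16/17
            mp.spawn(self.dp_train, nprocs=len(self.data_parallel_device_ids), args=(model,))
        else:
            # single device: run in the current process, no spawn
            self.run_single_process(model)

    def dp_train(self, gpu_idx, model):
        # per-process training entry point (placeholder)
        pass

    def run_single_process(self, model):
        # in-process training path (placeholder)
        pass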