added multi-node proc 0 ip reading

This commit is contained in:
William Falcon 2019-07-08 09:42:13 -04:00
parent 79d9adf004
commit 3422f7610b
2 changed files with 11 additions and 7 deletions

View File

@ -89,18 +89,13 @@ def main(hparams, cluster, results_dict):
mode=hparams.model_save_monitor_mode mode=hparams.model_save_monitor_mode
) )
# gpus are ; separated for inside a node and , within nodes
gpu_list = None
if hparams.gpus is not None:
gpu_list = [int(x) for x in hparams.gpus.split(';')]
# configure trainer # configure trainer
trainer = Trainer( trainer = Trainer(
experiment=exp, experiment=exp,
cluster=cluster, cluster=cluster,
checkpoint_callback=checkpoint, checkpoint_callback=checkpoint,
early_stop_callback=early_stop, early_stop_callback=early_stop,
gpus=gpu_list, gpus=hparams.gpus,
nb_gpu_nodes=1 nb_gpu_nodes=1
) )

View File

@ -86,8 +86,17 @@ class Trainer(TrainerIO):
self.lr_schedulers = [] self.lr_schedulers = []
self.amp_level = amp_level self.amp_level = amp_level
self.print_nan_grads = print_nan_grads self.print_nan_grads = print_nan_grads
self.data_parallel_device_ids = gpus
self.data_parallel = gpus is not None and len(gpus) > 0 self.data_parallel = gpus is not None and len(gpus) > 0
self.data_parallel_device_ids = gpus
# gpus comes in as a string.
# if gpus == '-1' then use all available devices
# otherwise, split the string on commas into a list of integer device ids
if gpus is not None:
if gpus == '-1':
self.data_parallel_device_ids = torch.cuda.device_count()
else:
self.data_parallel_device_ids = [int(x.strip()) for x in gpus.split(',')]
# process info # process info
self.proc_rank = 0 self.proc_rank = 0