added test for no dist sampler

parent 5c21683566
commit 8064a77aa7
@@ -19,7 +19,7 @@ import tqdm
 from pytorch_lightning.root_module.memory import get_gpu_memory_map
 from pytorch_lightning.root_module.model_saving import TrainerIO
 from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
-from pytorch_lightning.utils.debugging import IncompatibleArgumentsException
+from pytorch_lightning.utils.debugging import MisconfigurationException

 try:
     from apex import amp
@@ -392,7 +392,7 @@ class Trainer(TrainerIO):
             dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
             dataloader = Dataloader(dataset, sampler=dist_sampler)
             '''
-            raise Exception(msg)
+            raise MisconfigurationException(msg)

     # -----------------------------
     # MODEL TRAINING
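The msg raised above ends with a usage hint telling the user how to wire up a DistributedSampler themselves. Below is a standalone sketch of that setup in plain PyTorch; the toy TensorDataset and the explicit num_replicas/rank are my additions so the snippet runs outside an initialized process group, and note the real class is DataLoader, not Dataloader as the message spells it.

    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    # toy dataset standing in for the user's real dataset
    dataset = TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))

    # under ddp, num_replicas/rank are normally inferred from torch.distributed;
    # they are passed explicitly here only so the sketch runs standalone
    dist_sampler = DistributedSampler(dataset, num_replicas=2, rank=0)

    # the sampler replaces shuffle=True and hands each process its own shard
    dataloader = DataLoader(dataset, batch_size=32, sampler=dist_sampler)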
@@ -467,7 +467,7 @@ class Trainer(TrainerIO):
             m = f'amp level {self.amp_level} with DataParallel is not supported. ' \
                 f'See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. ' \
                 f'We recommend you switch to ddp if you want to use amp'
-            raise IncompatibleArgumentsException(m)
+            raise MisconfigurationException(m)

         # run through amp wrapper
         if self.use_amp:

@@ -1,5 +1,5 @@
 import pdb
 import sys

-class IncompatibleArgumentsException(Exception):
+class MisconfigurationException(Exception):
     pass
@@ -1,10 +1,11 @@
 import pytest
 from pytorch_lightning import Trainer
 from pytorch_lightning.examples.new_project_templates.lightning_module_template import LightningTemplateModel
+from pytorch_lightning.testing_models.lm_test_module import LightningTestModel
 from argparse import Namespace
 from test_tube import Experiment
 from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
-from pytorch_lightning.utils.debugging import IncompatibleArgumentsException
+from pytorch_lightning.utils.debugging import MisconfigurationException
 import numpy as np
 import warnings
 import torch
@@ -33,7 +34,8 @@ def test_cpu_model():
         val_percent_check=0.4
     )

-    run_gpu_model_test(trainer_options, on_gpu=False)
+    model, hparams = get_model()
+    run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)


 def test_all_features_cpu_model():

@@ -54,7 +56,8 @@ def test_all_features_cpu_model():
         val_percent_check=0.4
     )

-    run_gpu_model_test(trainer_options, on_gpu=False)
+    model, hparams = get_model()
+    run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)


 def test_early_stopping_cpu_model():

@@ -77,7 +80,8 @@ def test_early_stopping_cpu_model():
         val_percent_check=0.4
     )

-    run_gpu_model_test(trainer_options, on_gpu=False)
+    model, hparams = get_model()
+    run_gpu_model_test(trainer_options, model, hparams, on_gpu=False)


 def test_single_gpu_model():
@@ -88,6 +92,7 @@ def test_single_gpu_model():
     if not torch.cuda.is_available():
         warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
         return
+    model, hparams = get_model()

     trainer_options = dict(
         progress_bar=False,

@@ -97,7 +102,7 @@ def test_single_gpu_model():
         gpus=[0]
     )

-    run_gpu_model_test(trainer_options)
+    run_gpu_model_test(trainer_options, model, hparams)


 def test_multi_gpu_model_dp():
@@ -111,7 +116,7 @@ def test_multi_gpu_model_dp():
     if not torch.cuda.device_count() > 1:
         warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
         return
-
+    model, hparams = get_model()
     trainer_options = dict(
         progress_bar=False,
         max_nb_epochs=1,

@@ -120,7 +125,7 @@ def test_multi_gpu_model_dp():
         gpus=[0, 1]
     )

-    run_gpu_model_test(trainer_options)
+    run_gpu_model_test(trainer_options, model, hparams)


 def test_amp_gpu_dp():
@@ -134,15 +139,15 @@ def test_amp_gpu_dp():
     if not torch.cuda.device_count() > 1:
         warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
         return
-
+    model, hparams = get_model()
     trainer_options = dict(
         max_nb_epochs=1,
         gpus='0, 1', # test init with gpu string
         distributed_backend='dp',
         use_amp=True
     )
-    with pytest.raises(IncompatibleArgumentsException):
-        run_gpu_model_test(trainer_options)
+    with pytest.raises(MisconfigurationException):
+        run_gpu_model_test(trainer_options, model, hparams)


 def test_multi_gpu_model_ddp():
@@ -158,7 +163,7 @@ def test_multi_gpu_model_ddp():
         return

     os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
-
+    model, hparams = get_model()
     trainer_options = dict(
         progress_bar=False,
         max_nb_epochs=1,

@@ -168,7 +173,7 @@ def test_multi_gpu_model_ddp():
         distributed_backend='ddp'
     )

-    run_gpu_model_test(trainer_options)
+    run_gpu_model_test(trainer_options, model, hparams)


 def test_amp_gpu_ddp():

@@ -185,6 +190,7 @@ def test_amp_gpu_ddp():

     os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])

+    model, hparams = get_model()
     trainer_options = dict(
         progress_bar=True,
         max_nb_epochs=1,
@@ -193,18 +199,14 @@ def test_amp_gpu_ddp():
         use_amp=True
     )

-    run_gpu_model_test(trainer_options)
+    run_gpu_model_test(trainer_options, model, hparams)


-# ------------------------------------------------------------------------
-# UTILS
-# ------------------------------------------------------------------------
-
-def run_gpu_model_test(trainer_options, on_gpu=True):
+def test_ddp_sampler_error():
     """
     Make sure DDP + AMP work
     :return:
     """
     if not torch.cuda.is_available():
         warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
         return
@@ -212,8 +214,34 @@ def run_gpu_model_test(trainer_options, on_gpu=True):
         warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
         return

+    os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
+
+    hparams = get_hparams()
+    model = LightningTestModel(hparams, force_remove_distributed_sampler=True)
+
+    trainer_options = dict(
+        progress_bar=True,
+        max_nb_epochs=1,
+        gpus=[0, 1],
+        distributed_backend='ddp',
+        use_amp=True
+    )
+
+    with pytest.raises(MisconfigurationException):
+        run_gpu_model_test(trainer_options, model, hparams)
+
+
+# ------------------------------------------------------------------------
+# UTILS
+# ------------------------------------------------------------------------
+
+def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True):
+    """
+    Make sure DDP + AMP work
+    :return:
+    """
+
     save_dir = init_save_dir()
-    model, hparams = get_model()

     # exp file to get meta
     exp = get_exp(False)
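What the new test exercises, in plain terms: under ddp the Trainer expects every dataloader to carry a DistributedSampler, and the test model's force_remove_distributed_sampler=True flag deliberately hands back a dataloader without one so that the new MisconfigurationException fires. Below is a rough, standalone sketch of that kind of check in plain PyTorch; the require_distributed_sampler helper and the ValueError are placeholders of mine, not the Trainer's actual code.

    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    def require_distributed_sampler(dataloader):
        # hypothetical stand-in for the Trainer's check: under ddp, each
        # dataloader is expected to carry a DistributedSampler
        if not isinstance(dataloader.sampler, DistributedSampler):
            raise ValueError('ddp expects a DistributedSampler on the dataloader')

    dataset = TensorDataset(torch.randn(100, 3))

    # a plain DataLoader keeps its default sampler -- roughly what
    # force_remove_distributed_sampler=True simulates in the test model
    plain_loader = DataLoader(dataset, batch_size=32)

    try:
        require_distributed_sampler(plain_loader)
    except ValueError as err:
        print(err)  # the Trainer raises MisconfigurationException instead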
@@ -243,8 +271,7 @@ def run_gpu_model_test(trainer_options, on_gpu=True):
     clear_save_dir()


-def get_model():
-    # set up model with these hyperparams
+def get_hparams():
     root_dir = os.path.dirname(os.path.realpath(__file__))
     hparams = Namespace(**{'drop_prob': 0.2,
                            'batch_size': 32,

@@ -254,6 +281,12 @@ def get_model():
                            'data_root': os.path.join(root_dir, 'mnist'),
                            'out_features': 10,
                            'hidden_dim': 1000})
+    return hparams
+
+
+def get_model():
+    # set up model with these hyperparams
+    hparams = get_hparams()
     model = LightningTemplateModel(hparams)

     return model, hparams
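A brief note on why the helper was split (my reading of the diff, not a statement from the commit): separating get_hparams() from get_model() lets a test reuse the same hyperparameters with a different module class, which is exactly what test_ddp_sampler_error does when it builds a LightningTestModel instead of the LightningTemplateModel. A minimal sketch of the pattern with placeholder classes:

    from argparse import Namespace

    # placeholder modules standing in for LightningTemplateModel / LightningTestModel
    class TemplateModel:
        def __init__(self, hparams):
            self.hparams = hparams

    class MisbehavingTestModel(TemplateModel):
        def __init__(self, hparams, force_remove_distributed_sampler=False):
            super().__init__(hparams)
            self.force_remove_distributed_sampler = force_remove_distributed_sampler

    def get_hparams():
        # hyperparameters only, no model construction
        return Namespace(drop_prob=0.2, batch_size=32, hidden_dim=1000)

    def get_model():
        # the common case: default template model plus its hparams
        hparams = get_hparams()
        return TemplateModel(hparams), hparams

    # a test that needs a different model can still share the hparams
    hparams = get_hparams()
    model = MisbehavingTestModel(hparams, force_remove_distributed_sampler=True)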