diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 07a9f5a74f..55d9fdb5fa 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -73,6 +73,47 @@ when needed.
 
 .. note:: For iterable datasets, we don't do this automatically.
 
+Make Model Picklable
+^^^^^^^^^^^^^^^^^^^^
+It's very likely your code is already `picklable <https://docs.python.org/3/library/pickle.html>`_,
+so you don't have to do anything to make this change.
+However, if you run distributed and see an error like this:
+
+.. code-block::
+
+    self._launch(process_obj)
+    File "/net/software/local/python/3.6.5/lib/python3.6/multiprocessing/popen_spawn_posix.py", line 47,
+    in _launch reduction.dump(process_obj, fp)
+    File "/net/software/local/python/3.6.5/lib/python3.6/multiprocessing/reduction.py", line 60, in dump
+    ForkingPickler(file, protocol).dump(obj)
+    _pickle.PicklingError: Can't pickle <function <lambda> at 0x2b599e088ae8>:
+    attribute lookup <lambda> on __main__ failed
+
+it means you have something in your model definition, transforms, optimizer, dataloader or callbacks
+that cannot be pickled. By pickled we mean that the following would fail:
+
+.. code-block:: python
+
+    import pickle
+    pickle.dumps(some_object)
+
+This is a limitation of using multiple processes for distributed training within PyTorch.
+To fix this issue, find the piece of code that cannot be pickled. The end of the stacktrace
+is usually helpful.
+
+.. code-block::
+
+    self._launch(process_obj)
+    File "/net/software/local/python/3.6.5/lib/python3.6/multiprocessing/popen_spawn_posix.py", line 47,
+    in _launch reduction.dump(process_obj, fp)
+    File "/net/software/local/python/3.6.5/lib/python3.6/multiprocessing/reduction.py", line 60, in dump
+    ForkingPickler(file, protocol).dump(obj)
+    _pickle.PicklingError: Can't pickle [THIS IS THE THING TO FIND AND DELETE]:
+    attribute lookup <lambda> on __main__ failed
+
+i.e., in the stacktrace example above, there seems to be a lambda function somewhere in the user code
+which cannot be pickled.
+
 Distributed modes
 -----------------
 Lightning allows multiple ways of training
@@ -88,6 +129,8 @@ Data Parallel (dp)
 `DataParallel <https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html>`_ splits a batch across k GPUs.
 That is, if you have a batch of 32 and use dp with 2 gpus, each GPU will process 16 samples,
 after which the root node will aggregate the results.
 
+.. warning:: DP use is discouraged by PyTorch and Lightning. Use ddp, which is more stable and at least 3x faster.
+
 .. code-block:: python
 
     # train on 1 GPU (using dp mode)
@@ -167,7 +210,7 @@ Horovod can be configured in the training script to run with any number of GPUs
 When starting the training job, the driver application will then be used to specify the total
 number of worker processes:
 
-.. code-block:: bash
+.. code-block::
 
     # run training with 4 GPUs on a single machine
     horovodrun -np 4 python train.py
@@ -193,9 +236,9 @@ DP and ddp2 roughly do the following:
         gpu_3_batch = batch[24:]
 
         y_0 = model_copy_gpu_0(gpu_0_batch)
-        y_1 = model_copy_gpu_0(gpu_1_batch)
-        y_2 = model_copy_gpu_0(gpu_2_batch)
-        y_3 = model_copy_gpu_0(gpu_3_batch)
+        y_1 = model_copy_gpu_1(gpu_1_batch)
+        y_2 = model_copy_gpu_2(gpu_2_batch)
+        y_3 = model_copy_gpu_3(gpu_3_batch)
 
         return [y_0, y_1, y_2, y_3]
 
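To make the pickling requirement above concrete, here is a minimal, self-contained sketch. The names ``bad_activation``, ``good_activation`` and ``SomeConfig`` are invented for illustration and appear nowhere in Lightning; the point is only that a lambda stored on an object trips ``pickle.dumps`` with exactly the ``attribute lookup <lambda> on __main__ failed`` error shown in the stacktrace, while a named module-level function pickles fine.

.. code-block:: python

    import pickle

    # a lambda is pickled by name, and the name '<lambda>' cannot be looked up later
    bad_activation = lambda x: x * 2


    def good_activation(x):
        """A named, module-level function: picklable by reference."""
        return x * 2


    class SomeConfig:
        """Stands in for anything attached to your model, dataloader or callbacks."""

        def __init__(self, activation):
            self.activation = activation


    try:
        pickle.dumps(SomeConfig(bad_activation))
    except pickle.PicklingError as err:
        # -> Can't pickle <function <lambda> at 0x...>: attribute lookup <lambda> on __main__ failed
        print(f"lambda version fails: {err}")

    pickle.dumps(SomeConfig(good_activation))  # works
    print("named-function version pickles fine")

In most cases the fix is simply to replace the offending lambda (or locally defined function) with a named function defined at module level, or with ``functools.partial`` over a named function.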
diff --git a/pytorch_lightning/loggers/__init__.py b/pytorch_lightning/loggers/__init__.py
index c753a048be..24769c7953 100644
--- a/pytorch_lightning/loggers/__init__.py
+++ b/pytorch_lightning/loggers/__init__.py
@@ -70,7 +70,11 @@ Call the logger anywhere except ``__init__`` in your
 ...     def training_step(self, batch, batch_idx):
 ...         # example
 ...         self.logger.experiment.whatever_method_summary_writer_supports(...)
-...
+...
+...         # example if logger is a tensorboard logger
+...         self.logger.experiment.add_image('images', grid, 0)
+...         self.logger.experiment.add_graph(model, images)
+...
 ...     def any_lightning_module_function_or_hook(self):
 ...         self.logger.experiment.add_histogram(...)
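As a usage sketch of the pattern documented in this docstring: with the default ``TensorBoardLogger``, ``self.logger.experiment`` is a ``torch.utils.tensorboard.SummaryWriter``, so any of its methods can be called from a hook. The module below is illustrative only; the layer sizes, tag names and the use of ``torchvision.utils.make_grid`` are assumptions, not part of this patch.

.. code-block:: python

    import torch
    import torchvision
    from torch import nn
    from pytorch_lightning import LightningModule


    class LitModel(LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(28 * 28, 10)

        def forward(self, x):
            return self.layer(x.view(x.size(0), -1))

        def training_step(self, batch, batch_idx):
            images, targets = batch
            loss = nn.functional.cross_entropy(self(images), targets)

            # with TensorBoardLogger, `experiment` is a SummaryWriter
            if batch_idx == 0:
                grid = torchvision.utils.make_grid(images[:8])
                self.logger.experiment.add_image('train_images', grid, self.current_epoch)
                self.logger.experiment.add_histogram('layer_weights', self.layer.weight, self.current_epoch)

            return loss

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)

Swapping in a different logger only changes what ``self.logger.experiment`` exposes; the calls themselves are whatever methods that experiment object supports.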