From 48a76a785da99bbd8f258302b4341e1c2e4ec7df Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 15 Jun 2020 08:02:19 -0400
Subject: [PATCH] Performance docs (#2191)

* add workers fix

* add workers fix
---
 docs/source/index.rst                     |  1 +
 docs/source/performance.rst               | 85 +++++++++++++++++++++++
 pytorch_lightning/trainer/data_loading.py |  5 +-
 3 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/performance.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0b0bfd0256..113665f24c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,6 +13,7 @@ PyTorch Lightning Documentation
 
    new-project
    introduction_guide
+   performance
 
 .. toctree::
    :maxdepth: 2
diff --git a/docs/source/performance.rst b/docs/source/performance.rst
new file mode 100644
index 0000000000..76b0d650ba
--- /dev/null
+++ b/docs/source/performance.rst
@@ -0,0 +1,85 @@
+Fast Performance
+================
+Here are some best practices to increase the performance of your training.
+
+Dataloaders
+-----------
+When building your DataLoader, set `num_workers > 0` and `pin_memory=True` (the latter only when training on GPUs).
+
+.. code-block:: python
+
+    DataLoader(dataset, num_workers=8, pin_memory=True)
+
+num_workers
+^^^^^^^^^^^
+The question of how many workers to use (`num_workers`) is tricky. Here's a summary of
+some references, [`1 `_], and our suggestions:
+
+1. `num_workers=0` means ONLY the main process will load batches (this can be a bottleneck).
+2. `num_workers=1` means ONLY one worker (just not the main process) will load data, which will still be slow.
+3. The right `num_workers` depends on the batch size and your machine.
+4. A good place to start is to set `num_workers` equal to the number of CPUs on that machine.
+
+.. warning:: Increasing `num_workers` will ALSO increase your CPU memory consumption.
+
+The best thing to do is to increase `num_workers` slowly and stop once you see no more improvement in your training speed.
+
+Spawn
+^^^^^
+When using `distributed_backend=ddp_spawn` (the ddp default) or TPU training, multiple GPUs/TPU cores are used by calling `.spawn()` under the hood.
+The problem is that PyTorch has issues with `num_workers > 0` when using `.spawn()`. For this reason, we recommend you
+use `distributed_backend=ddp` so you can increase `num_workers`; however, your script then has to be callable like so:
+
+.. code-block:: bash
+
+    python my_program.py --gpus X
+
+.item(), .numpy(), .cpu()
+-------------------------
+Don't call `.item()` anywhere in your code. Use `.detach()` instead to remove the connected graph calls. Lightning
+takes a great deal of care to be optimized for this.
+
+empty_cache()
+-------------
+Don't call this unnecessarily! Every time you call it, ALL your GPUs have to wait to sync.
+
+Construct tensors directly on the device
+----------------------------------------
+LightningModules know what device they are on! Construct tensors directly on the device to avoid the CPU-to-device transfer.
+
+.. code-block:: python
+
+    # bad
+    t = torch.rand(2, 2).cuda()
+
+    # good (self is a LightningModule)
+    t = torch.rand(2, 2, device=self.device)
+
+Use DDP not DP
+--------------
+DP performs three GPU transfers for EVERY batch:
+
+1. Copy the model to the device.
+2. Copy the data to the device.
+3. Copy the outputs of each device back to the master.
+
+Whereas DDP only performs one transfer, to sync gradients. Because of this, DDP is MUCH faster than DP.
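+
+As a rough sketch (the 4 GPUs below are only an illustration; use however many your machine has),
+switching between the two is a single `distributed_backend` change:
+
+.. code-block:: python
+
+    from pytorch_lightning import Trainer
+
+    # DP: three transfers (model, data, outputs) for every batch
+    # trainer = Trainer(gpus=4, distributed_backend='dp')
+
+    # DDP: a single transfer per batch to sync gradients
+    trainer = Trainer(gpus=4, distributed_backend='ddp')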
+
+16-bit precision
+----------------
+Use 16-bit precision to decrease your memory consumption (and thus increase your batch size).
+On certain GPUs (V100s, 2080 Tis), 16-bit calculations are also faster.
+However, know that 16-bit together with multi-processing (any DDP) can have issues. Here are some common problems:
+
+1. `CUDA error: an illegal memory access was encountered `_.
+   The solution is likely setting a specific combination of CUDA, cuDNN and PyTorch versions.
+2. `CUDA error: device-side assert triggered`. This is a general catch-all error. To see the actual error, run your script like so:
+
+   .. code-block:: bash
+
+       # won't show you what the error is
+       python main.py
+
+       # will show you the actual error
+       CUDA_LAUNCH_BLOCKING=1 python main.py
+
+We also recommend using the native 16-bit support in PyTorch 1.6. Just install this version and Lightning will automatically use it.
\ No newline at end of file
diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 380e8257c1..d66ed38a56 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -1,6 +1,7 @@
 import platform
 from abc import ABC, abstractmethod
 from typing import Union, List, Tuple, Callable, Optional
+import multiprocessing
 
 import torch.distributed as torch_distrib
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
@@ -106,8 +107,10 @@ class TrainerDataLoadingMixin(ABC):
                     '(this is a bottleneck of Python .spawn() and PyTorch')
 
         elif is_dataloader and dataloader.num_workers <= 2 and not on_windows and not using_spawn:
+            num_cpus = multiprocessing.cpu_count()
             rank_zero_warn(f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
-                           ' Consider increasing the value of the `num_workers` argument`'
+                           ' Consider increasing the value of the `num_workers` argument '
+                           f'(try {num_cpus} which is the number of CPUs on this machine)'
                            ' in the `DataLoader` init to improve performance.')
 
         elif is_dataloader and dataloader.num_workers == 0 and not on_windows and using_spawn:
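
The warning above points users toward a `DataLoader` configured roughly like this (a minimal sketch; the dummy dataset and batch size are placeholders, not part of the patch):

.. code-block:: python

    import multiprocessing

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # dummy dataset, only to illustrate the DataLoader arguments
    dataset = TensorDataset(torch.randn(1000, 32), torch.randint(0, 10, (1000,)))

    loader = DataLoader(
        dataset,
        batch_size=32,
        num_workers=multiprocessing.cpu_count(),  # start at the CPU count, then tune
        pin_memory=True,                          # only helps when training on GPUs
    )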