From 48a76a785da99bbd8f258302b4341e1c2e4ec7df Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 15 Jun 2020 08:02:19 -0400
Subject: [PATCH] Performance docs (#2191)

* add workers fix

* add workers fix
---
 docs/source/index.rst                     |  1 +
 docs/source/performance.rst               | 85 +++++++++++++++++++++++
 pytorch_lightning/trainer/data_loading.py |  5 +-
 3 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/performance.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0b0bfd0256..113665f24c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,6 +13,7 @@ PyTorch Lightning Documentation
 
    new-project
    introduction_guide
+   performance
 
 .. toctree::
    :maxdepth: 2
diff --git a/docs/source/performance.rst b/docs/source/performance.rst
new file mode 100644
index 0000000000..76b0d650ba
--- /dev/null
+++ b/docs/source/performance.rst
@@ -0,0 +1,85 @@
+Fast Performance
+================
+Here are some best practices to increase the performance of your training.
+
+Dataloaders
+-----------
+When building your DataLoader, set `num_workers > 0` and `pin_memory=True` (the latter only when training on GPUs).
+
+.. code-block:: python
+
+    DataLoader(dataset, num_workers=8, pin_memory=True)
+
+num_workers
+^^^^^^^^^^^
+The question of how many workers to use (`num_workers`) is tricky. Here's a summary of
+some references, [`1 `_], and our suggestions:
+
+1. `num_workers=0` means ONLY the main process will load batches (this can be a bottleneck).
+2. `num_workers=1` means ONLY one worker (just not the main process) will load data, which will still be slow.
+3. The right `num_workers` depends on the batch size and your machine.
+4. A good place to start is to set `num_workers` equal to the number of CPUs on that machine.
+
+.. warning:: Increasing `num_workers` will ALSO increase your CPU memory consumption.
+
+The best thing to do is to increase `num_workers` slowly and stop once you see no more improvement in your training speed.
+
+Spawn
+^^^^^
+When using `distributed_backend=ddp_spawn` (the ddp default) or TPU training, multiple GPUs/TPU cores are used by calling `.spawn()` under the hood.
+The problem is that PyTorch has issues with `num_workers > 0` when using `.spawn()`. For this reason, we recommend you
+use `distributed_backend=ddp` so you can increase `num_workers`; however, your script then has to be callable like so:
+
+.. code-block:: bash
+
+    python my_program.py --gpus X
+
+.item(), .numpy(), .cpu()
+-------------------------
+Don't call `.item()` anywhere in your code. Use `.detach()` instead to remove the connected graph calls. Lightning
+takes a great deal of care to be optimized for this.
+
+empty_cache()
+-------------
+Don't call this unnecessarily! Every time you call it, ALL your GPUs have to wait to sync.
+
+Construct tensors directly on the device
+----------------------------------------
+LightningModules know what device they are on! Construct tensors directly on the device to avoid the CPU-to-device transfer.
+
+.. code-block:: python
+
+    # bad
+    t = torch.rand(2, 2).cuda()
+
+    # good (self is a LightningModule)
+    t = torch.rand(2, 2, device=self.device)
+
+Use DDP not DP
+--------------
+DP performs three GPU transfers for EVERY batch:
+
+1. Copy the model to the device.
+2. Copy the data to the device.
+3. Copy the outputs of each device back to the master.
+
+Whereas DDP only performs one transfer, to sync gradients. Because of this, DDP is MUCH faster than DP.
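+
+As a rough sketch (the 4 GPUs below are only an illustration; use however many your machine has),
+switching between the two is a single `distributed_backend` change:
+
+.. code-block:: python
+
+    from pytorch_lightning import Trainer
+
+    # DP: three transfers (model, data, outputs) for every batch
+    # trainer = Trainer(gpus=4, distributed_backend='dp')
+
+    # DDP: a single transfer per batch to sync gradients
+    trainer = Trainer(gpus=4, distributed_backend='ddp')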
+
+16-bit precision
+----------------
+Use 16-bit precision to decrease your memory consumption (and thus increase your batch size).
+On certain GPUs (V100s, 2080 Tis), 16-bit calculations are also faster.
+However, know that 16-bit together with multi-processing (any DDP) can have issues. Here are some common problems:
+
+1. `CUDA error: an illegal memory access was encountered `_.
+   The solution is likely setting a specific combination of CUDA, cuDNN and PyTorch versions.
+2. `CUDA error: device-side assert triggered`. This is a general catch-all error. To see the actual error, run your script like so:
+
+   .. code-block:: bash
+
+       # won't show you what the error is
+       python main.py
+
+       # will show you the actual error
+       CUDA_LAUNCH_BLOCKING=1 python main.py
+
+We also recommend using the native 16-bit support in PyTorch 1.6. Just install this version and Lightning will automatically use it.
\ No newline at end of file
diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 380e8257c1..d66ed38a56 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -1,6 +1,7 @@
 import platform
 from abc import ABC, abstractmethod
 from typing import Union, List, Tuple, Callable, Optional
+import multiprocessing
 
 import torch.distributed as torch_distrib
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
@@ -106,8 +107,10 @@ class TrainerDataLoadingMixin(ABC):
                     '(this is a bottleneck of Python .spawn() and PyTorch')
 
         elif is_dataloader and dataloader.num_workers <= 2 and not on_windows and not using_spawn:
+            num_cpus = multiprocessing.cpu_count()
             rank_zero_warn(f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
-                           ' Consider increasing the value of the `num_workers` argument`'
+                           ' Consider increasing the value of the `num_workers` argument '
+                           f'(try {num_cpus} which is the number of CPUs on this machine)'
                            ' in the `DataLoader` init to improve performance.')
 
         elif is_dataloader and dataloader.num_workers == 0 and not on_windows and using_spawn:
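
The warning above points users toward a `DataLoader` configured roughly like this (a minimal sketch; the dummy dataset and batch size are placeholders, not part of the patch):

.. code-block:: python

    import multiprocessing

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # dummy dataset, only to illustrate the DataLoader arguments
    dataset = TensorDataset(torch.randn(1000, 32), torch.randint(0, 10, (1000,)))

    loader = DataLoader(
        dataset,
        batch_size=32,
        num_workers=multiprocessing.cpu_count(),  # start at the CPU count, then tune
        pin_memory=True,                          # only helps when training on GPUs
    )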