diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst
index 53cb95bf9f..9140f52aba 100644
--- a/docs/source/training_tricks.rst
+++ b/docs/source/training_tricks.rst
@@ -39,7 +39,7 @@ Auto scaling of batch size
 --------------------------
 Auto scaling of batch size may be enabled to find the largest batch size that fits into memory.
 Larger batch size often yields better estimates of gradients, but may also result in
-longer training time.
+longer training time. Inspired by https://github.com/BlackHC/toma.
 
 .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer`
 
diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py
index eed7a13ca9..19a473640e 100644
--- a/pytorch_lightning/utilities/memory.py
+++ b/pytorch_lightning/utilities/memory.py
@@ -32,24 +32,29 @@ def is_oom_error(exception):
         or is_out_of_cpu_memory(exception)
 
 
+# based on https://github.com/BlackHC/toma/blob/master/toma/torch_cuda_memory.py
 def is_cuda_out_of_memory(exception):
     return isinstance(exception, RuntimeError) \
         and len(exception.args) == 1 \
         and "CUDA out of memory." in exception.args[0]
 
 
+# based on https://github.com/BlackHC/toma/blob/master/toma/torch_cuda_memory.py
 def is_cudnn_snafu(exception):
+    # For/because of https://github.com/pytorch/pytorch/issues/4107
     return isinstance(exception, RuntimeError) \
         and len(exception.args) == 1 \
         and "cuDNN error: CUDNN_STATUS_NOT_SUPPORTED." in exception.args[0]
 
 
+# based on https://github.com/BlackHC/toma/blob/master/toma/cpu_memory.py
 def is_out_of_cpu_memory(exception):
     return isinstance(exception, RuntimeError) \
         and len(exception.args) == 1 \
         and "DefaultCPUAllocator: can't allocate memory" in exception.args[0]
 
 
+# based on https://github.com/BlackHC/toma/blob/master/toma/torch_cuda_memory.py
 def garbage_collection_cuda():
     """Garbage collection Torch (CUDA) memory."""
     gc.collect()
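
For context, here is a minimal sketch (not part of this diff) of the doubling search that the batch-size scaler described in the docs change performs: grow the batch size until a trial step raises a recognized OOM error, then fall back to the last size that fit. `run_trial_step`, `find_max_batch_size`, and the starting size are hypothetical placeholders, not Lightning API; only `is_oom_error` and `garbage_collection_cuda` come from the module patched above.

from pytorch_lightning.utilities.memory import garbage_collection_cuda, is_oom_error


def find_max_batch_size(run_trial_step, init_size=2, max_trials=25):
    """Double the batch size until run_trial_step(size) hits an OOM error.

    run_trial_step is a hypothetical callable that runs one forward/backward
    pass at the given batch size; it is not part of the patched module.
    """
    size = init_size
    for _ in range(max_trials):
        garbage_collection_cuda()  # release cached CUDA memory before each trial
        try:
            run_trial_step(size)
        except RuntimeError as exception:
            if is_oom_error(exception):  # CUDA, cuDNN, or CPU-allocator OOM
                garbage_collection_cuda()  # clean up after the failed trial
                return size // 2  # last size that fit
            raise  # unrelated RuntimeError: propagate
        size *= 2
    return size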