# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from functools import wraps
from typing import Any, Optional, Union

import torch

from pytorch_lightning import _logger as log

if torch.distributed.is_available():
    from torch.distributed import ReduceOp
else:
    # Stub so this module still imports on PyTorch builds compiled without distributed support.
    class ReduceOp:
        SUM = None


def rank_zero_only(fn):
    """Call the decorated function only on global rank 0; on all other ranks it is a no-op returning ``None``."""

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        if rank_zero_only.rank == 0:
            return fn(*args, **kwargs)

    return wrapped_fn


# add the attribute to the function but don't overwrite in case Trainer has already set it
rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))


def _warn(*args, **kwargs):
    warnings.warn(*args, **kwargs)


def _info(*args, **kwargs):
    log.info(*args, **kwargs)


def _debug(*args, **kwargs):
    log.debug(*args, **kwargs)


rank_zero_debug = rank_zero_only(_debug)
rank_zero_info = rank_zero_only(_info)
rank_zero_warn = rank_zero_only(_warn)
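
# Usage sketch (illustrative only, not executed here): any helper decorated with
# ``rank_zero_only`` becomes a no-op on non-zero ranks, which is how the
# ``rank_zero_*`` aliases above avoid duplicated output in DDP runs, e.g.
#
#     @rank_zero_only
#     def log_hyperparams(params):
#         print(params)
#
#     rank_zero_warn("`my_arg` is deprecated", DeprecationWarning)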


def find_free_network_port() -> int:
    """
    Finds a free port on localhost.

    It is useful in single-node training when we don't want to connect to a real master node but
    have to set the `MASTER_PORT` environment variable.
    """
    import socket

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(("", 0))
    s.listen(1)
    port = s.getsockname()[1]
    s.close()
    return port
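
# Illustrative usage (assumption: called before worker processes are spawned, since the
# port is only guaranteed free at the moment it is probed):
#
#     os.environ["MASTER_PORT"] = str(find_free_network_port())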


def gather_all_tensors_if_available(result: Union[torch.Tensor], group: Optional[Any] = None):
    """
    Gathers a tensor from several DDP processes into a list that is broadcast to all processes.

    Args:
        result: the value to sync
        group: the process group to gather results from. Defaults to all processes (world)

    Return:
        gathered_result: list with size equal to the process group where
            gathered_result[i] corresponds to the result tensor from process i
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if group is None:
            group = torch.distributed.group.WORLD

        world_size = torch.distributed.get_world_size(group)

        gathered_result = [torch.zeros_like(result) for _ in range(world_size)]

        # sync and broadcast all
        torch.distributed.barrier(group=group)
        torch.distributed.all_gather(gathered_result, result, group)

        result = gathered_result
    return result
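
# Illustrative usage inside a DDP process (``local_metric`` and ``device`` are placeholder
# names; outside an initialized distributed run the input tensor is returned unchanged):
#
#     local_metric = torch.tensor([0.5], device=device)
#     all_metrics = gather_all_tensors_if_available(local_metric)
#     # all_metrics[i] holds the tensor contributed by rank i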


def sync_ddp_if_available(
    result: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None
) -> torch.Tensor:
    """
    Reduces a tensor across several DDP processes; the reduced value is available on all processes.

    Args:
        result: the value to sync and reduce (typically tensor or number)
        group: the process group to gather results from. Defaults to all processes (world)
        reduce_op: the reduction operation. Defaults to sum.
            Can also be the string 'avg' or 'mean' to compute the mean during reduction.

    Return:
        reduced value
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        divide_by_world_size = False

        if group is None:
            group = torch.distributed.group.WORLD

        if reduce_op is None:
            reduce_op = torch.distributed.ReduceOp.SUM
        elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
            reduce_op = torch.distributed.ReduceOp.SUM
            divide_by_world_size = True

        # sync all processes before reduction
        torch.distributed.barrier(group=group)
        torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False)

        if divide_by_world_size:
            result = result / torch.distributed.get_world_size(group)

    return result
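
# Illustrative usage (a typical metric-sync call; note that ``all_reduce`` modifies the
# input tensor in place, so pass a tensor you are happy to have overwritten):
#
#     loss = sync_ddp_if_available(loss, reduce_op="mean")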