2020-12-21 23:23:33 +00:00
|
|
|
# Copyright The PyTorch Lightning team.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
import functools
|
2021-03-29 17:59:20 +00:00
|
|
|
import os
|
2020-12-21 23:23:33 +00:00
|
|
|
import queue as q
|
|
|
|
import traceback
|
|
|
|
from multiprocessing import Process, Queue
|
2021-08-06 15:01:21 +00:00
|
|
|
from typing import Any, Callable, Union
|
2020-12-21 23:23:33 +00:00
|
|
|
|
2020-12-29 18:02:18 +00:00
|
|
|
from pytorch_lightning.utilities.imports import _XLA_AVAILABLE
|
2020-12-21 23:23:33 +00:00
|
|
|
|
|
|
|
if _XLA_AVAILABLE:
|
|
|
|
import torch_xla.core.xla_model as xm
|
2021-03-29 17:59:20 +00:00
|
|
|
|
2021-05-26 01:44:29 +00:00
|
|
|
# Maximum time, in seconds, to wait for the TPU availability check process to finish
|
|
|
|
TPU_CHECK_TIMEOUT = 60
|
2020-12-21 23:23:33 +00:00
|
|
|
|
|
|
|
|
2021-08-06 15:01:21 +00:00
|
|
|
def inner_f(queue: Queue, func: Callable, *args: Any, **kwargs: Any) -> None:  # pragma: no cover
    """Call ``func`` and publish its outcome on ``queue``.

    Args:
        queue: Receives the return value of ``func``, or ``None`` when an
            exception was raised while calling it or while putting the result.
        func: The callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.
    """
    try:
        outcome = func(*args, **kwargs)
        queue.put(outcome)
    # todo: specify the possible exception
    except Exception:
        # Report the failure on stderr and signal it to the consumer with ``None``.
        traceback.print_exc()
        queue.put(None)
|
|
|
|
|
|
|
|
|
2021-08-06 15:01:21 +00:00
|
|
|
def pl_multi_process(func: Callable) -> Callable:
    """Decorate ``func`` so that each call is executed in a separate process.

    The child process runs ``inner_f``, which puts the outcome of ``func`` on a
    queue. The parent waits at most ``TPU_CHECK_TIMEOUT`` seconds for the child
    to finish.

    Args:
        func: The callable to run in a child process.

    Return:
        A wrapper with the same call signature as ``func``. It returns whatever
        the child put on the queue, or ``False`` if nothing was available once
        the wait ended.
    """

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Union[bool, Any]:
        queue: Queue = Queue()
        proc = Process(target=inner_f, args=(queue, func, *args), kwargs=kwargs)
        proc.start()
        proc.join(TPU_CHECK_TIMEOUT)
        try:
            return queue.get_nowait()
        except q.Empty:
            traceback.print_exc()
            return False
        finally:
            # ``join`` returning does not mean the child exited: on timeout it is
            # still running. Stop it so a hung check does not leak a process.
            # This happens only after the queue has been read, so a result that
            # was already delivered is never lost.
            if proc.is_alive():
                proc.terminate()
                # Bounded wait to reap the child without risking another hang.
                proc.join(1)

    return wrapper
|
|
|
|
|
|
|
|
|
|
|
|
class XLADeviceUtils:
    """Used to detect the type of XLA device."""

    # Class-level cache of the TPU check so the check is not repeated once it succeeded.
    _TPU_AVAILABLE = False

    @staticmethod
    @pl_multi_process
    def _is_device_tpu() -> bool:
        """Check if TPU devices are available.

        Return:
            A boolean value indicating if TPU devices are available
        """
        # For the TPU Pod training process, for example, if we have
        # TPU v3-32 with 4 VMs, the world size would be 4 and as
        # we would have to use `torch_xla.distributed.xla_dist` for
        # multiple VMs and TPU_CONFIG won't be available, running
        # `xm.get_xla_supported_devices("TPU")` won't be possible.
        return (xm.xrt_world_size() > 1) or bool(xm.get_xla_supported_devices("TPU"))

    @staticmethod
    def xla_available() -> bool:
        """Check if XLA library is installed.

        Return:
            A boolean value indicating if a XLA is installed
        """
        return _XLA_AVAILABLE

    @staticmethod
    def tpu_device_exists() -> bool:
        """Runs XLA device check within a separate process.

        The result is cached on the class and, when positive, also recorded in
        the ``PL_TPU_AVAILABLE`` environment variable. If that variable is
        already ``"1"``, the check is skipped.

        Return:
            A boolean value indicating if a TPU device exists on the system
        """
        if os.getenv("PL_TPU_AVAILABLE", "0") == "1":
            XLADeviceUtils._TPU_AVAILABLE = True

        if XLADeviceUtils.xla_available() and not XLADeviceUtils._TPU_AVAILABLE:

            # The decorated check yields ``None`` when the probe raised in the
            # child process; coerce so that the cached and returned value is
            # always a real ``bool``.
            XLADeviceUtils._TPU_AVAILABLE = bool(XLADeviceUtils._is_device_tpu())

            if XLADeviceUtils._TPU_AVAILABLE:
                os.environ["PL_TPU_AVAILABLE"] = "1"
        return XLADeviceUtils._TPU_AVAILABLE
|