From dec23733913e088dc03b470d7e4e5bca2440a28a Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Tue, 25 Oct 2022 12:58:52 +0530 Subject: [PATCH] Better handling connection interruption (#15267) * config fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/lightning_app/utilities/network.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 17801552d0..f8d80af2dc 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -27,10 +27,10 @@ def find_free_network_port() -> int: return port -_CONNECTION_RETRY_TOTAL = 5 +_CONNECTION_RETRY_TOTAL = 2880 _CONNECTION_RETRY_BACKOFF_FACTOR = 0.5 -_DEFAULT_BACKOFF_MAX = 5 * 60 -_DEFAULT_REQUEST_TIMEOUT = 5 +_DEFAULT_BACKOFF_MAX = 5 * 60 # seconds +_DEFAULT_REQUEST_TIMEOUT = 30 # seconds def _configure_session() -> Session: @@ -128,7 +128,7 @@ class LightningClient(GridRestClient, metaclass=_MethodsRetryWrapperMeta): super().__init__(api_client=create_swagger_client()) -class TimeoutHTTPAdapter(HTTPAdapter): +class CustomRetryAdapter(HTTPAdapter): def __init__(self, *args, **kwargs): self.timeout = kwargs.pop("timeout", _DEFAULT_REQUEST_TIMEOUT) super().__init__(*args, **kwargs) @@ -158,13 +158,7 @@ def _http_method_logger_wrapper(func: Callable) -> Callable: class HTTPClient: """A wrapper class around the requests library which handles chores like logging, retries, and timeouts - automatically. - - TODO - exception handling on - 1. Persistent errors after retry (we'll retry for 120 sec) - 2. Other HTTP errors which are not handled by retry (we probably shouldn't handle it) - 3. 
Connection Refused Error (we should retry for ever in this case as well) - """ + automatically.""" def __init__( self, base_url: str, auth_token: Optional[str] = None, log_callback: Optional[Callable] = None @@ -172,6 +166,8 @@ class HTTPClient: self.base_url = base_url retry_strategy = Retry( # wait time between retries increases exponentially according to: backoff_factor * (2 ** (retry - 1)) + # but the maximum wait time is 120 secs. By setting a large value (2880), we'll make sure clients + # are going to be alive for a very long time (~ 4 days) but retries every 120 seconds total=_CONNECTION_RETRY_TOTAL, backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR, status_forcelist=[ @@ -183,7 +179,7 @@ class HTTPClient: 504, # Gateway Timeout ], ) - adapter = TimeoutHTTPAdapter(max_retries=retry_strategy, timeout=_DEFAULT_REQUEST_TIMEOUT) + adapter = CustomRetryAdapter(max_retries=retry_strategy, timeout=_DEFAULT_REQUEST_TIMEOUT) self.session = requests.Session() self.session.hooks = {"response": lambda r, *args, **kwargs: r.raise_for_status()}