[App] Fix idle timeout e2e (#16786)

This commit is contained in:
Ethan Harris 2023-02-17 01:52:46 +00:00 committed by GitHub
parent 57c1138525
commit 6e359dcc86
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 17 deletions

View File

@ -639,7 +639,10 @@ class LightningWork:
return WorkStatus(**status, count=len(timeout_statuses))
def on_exit(self):
"""Override this hook to add your logic when the work is exiting."""
"""Override this hook to add your logic when the work is exiting.
Note: This hook is not guaranteed to be called when running in the cloud.
"""
pass
def stop(self):

View File

@ -2,7 +2,7 @@ import pathlib
from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
from lightning.app.storage.path import _artifacts_path, _filesystem
from lightning.app.utilities.enum import WorkStageStatus, WorkStopReasons
from lightning.app.utilities.enum import WorkStageStatus
class SourceFileWriterWork(LightningWork):
@ -35,22 +35,21 @@ class RootFlow(LightningFlow):
if self.work.counter == 0:
self.work.run()
elif (
self.work.status.stage == WorkStageStatus.STOPPED
and self.work.status.reason == WorkStopReasons.SIGTERM_SIGNAL_HANDLER
and self.make_check
):
succeeded_status = self.work.statuses[-3]
stopped_status_pending = self.work.statuses[-2]
stopped_status_sigterm = self.work.statuses[-1]
assert succeeded_status.stage == WorkStageStatus.SUCCEEDED
assert stopped_status_pending.stage == WorkStageStatus.STOPPED
assert stopped_status_pending.reason == WorkStopReasons.PENDING
assert stopped_status_sigterm.stage == WorkStageStatus.STOPPED
assert stopped_status_sigterm.reason == WorkStopReasons.SIGTERM_SIGNAL_HANDLER
elif self.work.status.stage == WorkStageStatus.STOPPED and self.make_check:
succeeded_statuses = [status for status in self.work.statuses if status.stage == WorkStageStatus.SUCCEEDED]
# Ensure the work succeeded at some point
assert len(succeeded_statuses) > 0
succeeded_status = succeeded_statuses[-1]
stopped_statuses = [status for status in self.work.statuses if status.stage == WorkStageStatus.STOPPED]
# We want to check that the work started shutting down withing the required timeframe, so we take the first
# status that has `stage == STOPPED`.
stopped_status = stopped_statuses[0]
# Note: Account for the controlplane, k8s, SIGTERM handler delays.
assert (stopped_status_pending.timestamp - succeeded_status.timestamp) < 20
assert (stopped_status_sigterm.timestamp - stopped_status_pending.timestamp) < 120
assert (stopped_status.timestamp - succeeded_status.timestamp) < 20
fs = _filesystem()
destination_path = _artifacts_path(self.work) / pathlib.Path(*self.work.path.resolve().parts[1:])
assert fs.exists(destination_path)