From f3b509c913305a9bfe8d70b617c451cd399ec1a3 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Tue, 12 Feb 2019 15:52:23 -0800
Subject: [PATCH] client: sanity-check job runtime limits

A job is assigned a max runtime as:
max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops
The purpose is to eventually abort jobs that are in an infinite loop.

Various problems (e.g. bad GPU peak FLOPS calculations)
can cause this limit to be too small, e.g. one second,
in which case the job is aborted almost immediately.

In this change, if the calculated limit is < 2 minutes,
it's assumed to be in error, a limit of 30 minutes is used instead,
and an event log message is written.

Of course the underlying problem still must be addressed.
But this change will, in some cases, prevent a situation
where thousands of jobs are dispatched and immediately aborted.
---
 client/app.cpp        | 7 +++++++
 client/client_state.h | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/client/app.cpp b/client/app.cpp
index c5dd8d0de6..a7ccbec5db 100644
--- a/client/app.cpp
+++ b/client/app.cpp
@@ -283,6 +283,13 @@ int ACTIVE_TASK::init(RESULT* rp) {
     wup = rp->wup;
     app_version = rp->avp;
     max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops;
+    if (max_elapsed_time < MIN_TIME_BOUND) {
+        msg_printf(wup->project, MSG_INFO,
+            "Elapsed time limit %f < %f; setting to %f",
+            max_elapsed_time, MIN_TIME_BOUND, DEFAULT_TIME_BOUND
+        );
+        max_elapsed_time = DEFAULT_TIME_BOUND;
+    }
     max_disk_usage = rp->wup->rsc_disk_bound;
     max_mem_usage = rp->wup->rsc_memory_bound;
     get_slot_dir(slot, slot_dir, sizeof(slot_dir));
diff --git a/client/client_state.h b/client/client_state.h
index 4e8294762f..b5c79bfcaf 100644
--- a/client/client_state.h
+++ b/client/client_state.h
@@ -624,6 +624,14 @@ extern THREAD throttle_thread;
 #define MAX_STARTUP_TIME 10
     // if app startup takes longer than this, quit loop

+#define MIN_TIME_BOUND 120.
+#define DEFAULT_TIME_BOUND 1800.
+    // if ACTIVE_TASK::max_elapsed_time is < MIN, set it to DEFAULT
+    // This is a sanity check, so that bad values for
+    // wup->rsc_fpops_bound or avp->flops won't cause jobs
+    // to get aborted after a few seconds
+    // The values are a bit arbitrary.
+
 //////// NETWORK

 #define CONNECT_ERROR_PERIOD 600.0