From 5e0d0db47cad2f1f27df4ae69175d5b813c21e90 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 6 Jun 2024 02:52:12 -0700 Subject: [PATCH] Client: don't start jobs that will exceed RAM Fixes #5641 --- client/cpu_sched.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index cf178964a9..5b42c06832 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -1309,26 +1309,33 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { // until it reaches deadline pressure. // So we'll go with 1). - // skip jobs whose working set is too large to fit in available RAM + // skip jobs whose 'expected working set size' (EWSS) + // is too large to fit in available RAM. + // The EWSS is the max of + // 1) the recent average WSS of the job + // 2) if it hasn't run yet, + // the max WSS of other jobs of this app version + // 3) the WU's rsc_memory_bound + // This handles the case of CPDN, where jobs run with + // small WSS for a while and then get big. // - double wss = 0; + double ewss = 0; if (atp) { atp->too_large = false; - wss = atp->procinfo.working_set_size_smoothed; + ewss = atp->procinfo.working_set_size_smoothed; } else { - wss = rp->avp->max_working_set_size; + ewss = rp->avp->max_working_set_size; } - if (wss == 0) { - wss = rp->wup->rsc_memory_bound; - } - if (wss > ram_left) { + ewss = std::max(ewss, rp->wup->rsc_memory_bound); + + if (ewss > ram_left) { if (atp) { atp->too_large = true; } if (log_flags.cpu_sched_debug || log_flags.mem_usage_debug) { msg_printf(rp->project, MSG_INFO, - "[cpu_sched_debug] can't run %s: WS too big %.2fMB > %.2fMB", - rp->name, wss/MEGA, ram_left/MEGA + "[cpu_sched_debug] skipping %s: estimated WSS (%.2fMB) exceeds RAM left (%.2fMB)", + rp->name, ewss/MEGA, ram_left/MEGA ); } continue; @@ -1357,7 +1364,7 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { ncpus_used += rp->avp->avg_ncpus; atp->next_scheduler_state = CPU_SCHED_SCHEDULED; - ram_left -= wss; + ram_left -= ewss; if (have_max_concurrent) { max_concurrent_inc(rp); }