From 306605e6d88134b9cbe53214ba97d2140cfb819a Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 31 Oct 2017 12:14:50 -0700 Subject: [PATCH] scheduler: add config flag to estimate job runtime without stats The scheduler estimates job runtime based on statistics of past jobs for this (host, app version). This doesn't work well if the distribution of runtimes is very wide, as may be the case for universal apps. If this flag is set, runtime estimation is based solely on CPU/GPU peak FLOPS and the job's FLOPs estimate. --- sched/sched_config.cpp | 1 + sched/sched_config.h | 2 ++ sched/sched_version.cpp | 62 ++++++++++++++++++++++++++++------------- version.h | 1 + 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/sched/sched_config.cpp b/sched/sched_config.cpp index 7b7c595fbd..306951695a 100644 --- a/sched/sched_config.cpp +++ b/sched/sched_config.cpp @@ -299,6 +299,7 @@ int SCHED_CONFIG::parse(FILE* f) { if (xp.parse_double("maintenance_delay", maintenance_delay)) continue; if (xp.parse_bool("credit_by_app", credit_by_app)) continue; if (xp.parse_bool("keyword_sched", keyword_sched)) continue; + if (xp.parse_bool("rte_no_stats", rte_no_stats)) continue; //////////// SCHEDULER LOG FLAGS ///////// diff --git a/sched/sched_config.h b/sched/sched_config.h index 75757db866..f898144db3 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -182,6 +182,8 @@ struct SCHED_CONFIG { // store per-app credit info in credit_user and credit_team bool keyword_sched; // score jobs based on keywords + bool rte_no_stats; + // don't use statistics in job runtime estimation // time intervals double maintenance_delay; diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp index 246de89ebc..c082c34abb 100644 --- a/sched/sched_version.cpp +++ b/sched/sched_version.cpp @@ -344,9 +344,32 @@ void estimate_flops_anon_platform() { // This prevents jobs from aborting with "time limit exceeded" // even if the estimate supplied by the plan class function is way off // 
+ +#define RTE_HAV_STATS 1 +#define RTE_AV_STATS 2 +#define RTE_NO_STATS 3 + void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) { - DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id); - if (havp && havp->et.n > MIN_HOST_SAMPLES) { + int mode; + DB_HOST_APP_VERSION* havp = NULL; + + if (config.rte_no_stats) { + mode = RTE_NO_STATS; + } else { + havp = gavid_to_havp(av.id); + if (havp && havp->et.n > MIN_HOST_SAMPLES) { + mode = RTE_HAV_STATS; + } else { + if (av.pfc.n > MIN_VERSION_SAMPLES) { + mode = RTE_AV_STATS; + } else { + mode = RTE_NO_STATS; + } + } + } + + switch (mode) { + case RTE_HAV_STATS: double new_flops; if (config.estimate_flops_from_hav_pfc) { new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18); @@ -387,24 +410,25 @@ void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) { 1e-9/havp->et.get_avg() ); } - } else { - if (av.pfc.n > MIN_VERSION_SAMPLES) { - hu.projected_flops = hu.peak_flops/av.pfc.get_avg(); - if (config.debug_version_select) { - log_messages.printf(MSG_NORMAL, - "[version] [AV#%lu] (%s) adjusting projected flops based on PFC avg: %.2fG\n", - av.id, av.plan_class, hu.projected_flops/1e9 - ); - } - } else { - hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage); - if (config.debug_version_select) { - log_messages.printf(MSG_NORMAL, - "[version] [AV#%lu] (%s) using conservative projected flops: %.2fG\n", - av.id, av.plan_class, hu.projected_flops/1e9 - ); - } + break; + case RTE_AV_STATS: + hu.projected_flops = hu.peak_flops/av.pfc.get_avg(); + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] [AV#%lu] (%s) adjusting projected flops based on PFC avg: %.2fG\n", + av.id, av.plan_class, hu.projected_flops/1e9 + ); } + break; + case RTE_NO_STATS: + hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage); + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] [AV#%lu] (%s) using conservative projected flops: 
%.2fG\n", + av.id, av.plan_class, hu.projected_flops/1e9 + ); + } + break; } } diff --git a/version.h b/version.h index dd03f9455e..9dbed3bb53 100644 --- a/version.h +++ b/version.h @@ -46,3 +46,4 @@ #endif /* #if (defined(_WIN32) || defined(__APPLE__)) */ #endif /* #ifndef BOINC_VERSION_H */ +