scheduler: add <rte_no_stats> config flag to estimate job runtime without stats

The scheduler estimates job runtime based on statistics
of past jobs for this (host, app version).
This doesn't work well if the distribution of runtimes is very wide,
as may be the case for universal apps.

If this flag is set, runtime estimation is based solely on
CPU/GPU peak FLOPS and the job's FLOPs estimate.
This commit is contained in:
David Anderson 2017-10-31 12:14:50 -07:00
parent 28cc28b4a0
commit 306605e6d8
4 changed files with 47 additions and 19 deletions

View File

@ -299,6 +299,7 @@ int SCHED_CONFIG::parse(FILE* f) {
if (xp.parse_double("maintenance_delay", maintenance_delay)) continue; if (xp.parse_double("maintenance_delay", maintenance_delay)) continue;
if (xp.parse_bool("credit_by_app", credit_by_app)) continue; if (xp.parse_bool("credit_by_app", credit_by_app)) continue;
if (xp.parse_bool("keyword_sched", keyword_sched)) continue; if (xp.parse_bool("keyword_sched", keyword_sched)) continue;
if (xp.parse_bool("rte_no_stats", rte_no_stats)) continue;
//////////// SCHEDULER LOG FLAGS ///////// //////////// SCHEDULER LOG FLAGS /////////

View File

@ -182,6 +182,8 @@ struct SCHED_CONFIG {
// store per-app credit info in credit_user and credit_team // store per-app credit info in credit_user and credit_team
bool keyword_sched; bool keyword_sched;
// score jobs based on keywords // score jobs based on keywords
bool rte_no_stats;
// don't use statistics in job runtime estimation
// time intervals // time intervals
double maintenance_delay; double maintenance_delay;

View File

@ -344,9 +344,32 @@ void estimate_flops_anon_platform() {
// This prevents jobs from aborting with "time limit exceeded" // This prevents jobs from aborting with "time limit exceeded"
// even if the estimate supplied by the plan class function is way off // even if the estimate supplied by the plan class function is way off
// //
#define RTE_HAV_STATS 1
#define RTE_AV_STATS 2
#define RTE_NO_STATS 3
void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) { void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id); int mode;
if (havp && havp->et.n > MIN_HOST_SAMPLES) { DB_HOST_APP_VERSION* havp = NULL;
if (config.rte_no_stats) {
mode = RTE_NO_STATS;
} else {
havp = gavid_to_havp(av.id);
if (havp && havp->et.n > MIN_HOST_SAMPLES) {
mode = RTE_HAV_STATS;
} else {
if (av.pfc.n > MIN_VERSION_SAMPLES) {
mode = RTE_AV_STATS;
} else {
mode = RTE_NO_STATS;
}
}
}
switch (mode) {
case RTE_HAV_STATS:
double new_flops; double new_flops;
if (config.estimate_flops_from_hav_pfc) { if (config.estimate_flops_from_hav_pfc) {
new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18); new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18);
@ -387,24 +410,25 @@ void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
1e-9/havp->et.get_avg() 1e-9/havp->et.get_avg()
); );
} }
} else { break;
if (av.pfc.n > MIN_VERSION_SAMPLES) { case RTE_AV_STATS:
hu.projected_flops = hu.peak_flops/av.pfc.get_avg(); hu.projected_flops = hu.peak_flops/av.pfc.get_avg();
if (config.debug_version_select) { if (config.debug_version_select) {
log_messages.printf(MSG_NORMAL, log_messages.printf(MSG_NORMAL,
"[version] [AV#%lu] (%s) adjusting projected flops based on PFC avg: %.2fG\n", "[version] [AV#%lu] (%s) adjusting projected flops based on PFC avg: %.2fG\n",
av.id, av.plan_class, hu.projected_flops/1e9 av.id, av.plan_class, hu.projected_flops/1e9
); );
}
} else {
hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage);
if (config.debug_version_select) {
log_messages.printf(MSG_NORMAL,
"[version] [AV#%lu] (%s) using conservative projected flops: %.2fG\n",
av.id, av.plan_class, hu.projected_flops/1e9
);
}
} }
break;
case RTE_NO_STATS:
hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage);
if (config.debug_version_select) {
log_messages.printf(MSG_NORMAL,
"[version] [AV#%lu] (%s) using conservative projected flops: %.2fG\n",
av.id, av.plan_class, hu.projected_flops/1e9
);
}
break;
} }
} }

View File

@ -46,3 +46,4 @@
#endif /* #if (defined(_WIN32) || defined(__APPLE__)) */ #endif /* #if (defined(_WIN32) || defined(__APPLE__)) */
#endif /* #ifndef BOINC_VERSION_H */ #endif /* #ifndef BOINC_VERSION_H */