diff --git a/sched/sched_config.cpp b/sched/sched_config.cpp index 89cb9c102d..224b18028b 100644 --- a/sched/sched_config.cpp +++ b/sched/sched_config.cpp @@ -171,6 +171,7 @@ int SCHED_CONFIG::parse(FILE* f) { if (xp.parse_bool("enable_assignment_multi", enable_assignment_multi)) continue; if (xp.parse_bool("job_size_matching", job_size_matching)) continue; if (xp.parse_bool("dont_send_jobs", dont_send_jobs)) continue; + if (xp.parse_bool("estimate_flops_from_hav_pfc", estimate_flops_from_hav_pfc)) continue; //////////// STUFF RELEVANT ONLY TO SCHEDULER STARTS HERE /////// diff --git a/sched/sched_config.h b/sched/sched_config.h index 4ef5ed5967..63979a4f34 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -176,6 +176,9 @@ struct SCHED_CONFIG { // DEPRECATED: use assignment instead bool workload_sim; // Do workload simulation in deciding whether to send a result + bool estimate_flops_from_hav_pfc; + // Use host_app_version peak flop count rather than elapsed time + // to calculate projected_flops when choosing version. // scheduler log flags // diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp index 7c430ec76f..62d6af9551 100644 --- a/sched/sched_version.cpp +++ b/sched/sched_version.cpp @@ -322,8 +322,10 @@ void estimate_flops_anon_platform() { } // compute HOST_USAGE::projected_flops as best we can: -// 1) if we have statistics for (host, app version) elapsed time, -// use those. +// +// 1) if we have statistics for (host, app version) and +// config option estimate_flops_from_hav_pfc is not set, use elapsed time, +// otherwise use pfc_avg. // 2) if we have statistics for app version elapsed time, use those. 
// 3) else use a conservative estimate (p_fpops*(cpus+gpus)) // This prevents jobs from aborting with "time limit exceeded" @@ -332,7 +334,12 @@ void estimate_flops_anon_platform() { void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) { DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id); if (havp && havp->et.n > MIN_HOST_SAMPLES) { - double new_flops = 1./havp->et.get_avg(); + double new_flops; + if (config.estimate_flops_from_hav_pfc) { + new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18); + } else { + new_flops = 1./havp->et.get_avg(); + } // cap this at ET_RATIO_LIMIT*projected, // in case we've had a bunch of short jobs recently // @@ -350,9 +357,21 @@ void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) { hu.projected_flops = new_flops; if (config.debug_version_select) { + if (config.estimate_flops_from_hav_pfc) { + log_messages.printf(MSG_NORMAL, + "[version] [AV#%d] (%s) setting projected flops based on host_app_version pfc: %.2fG\n", + av.id, av.plan_class, hu.projected_flops/1e9 + ); + } else { + log_messages.printf(MSG_NORMAL, + "[version] [AV#%d] (%s) setting projected flops based on host elapsed time avg: %.2fG\n", + av.id, av.plan_class, hu.projected_flops/1e9 + ); + } log_messages.printf(MSG_NORMAL, - "[version] [AV#%d] (%s) setting projected flops based on host elapsed time avg: %.2fG\n", - av.id, av.plan_class, hu.projected_flops/1e9 + "[version] [AV#%d] (%s) comparison pfc: %.2fG et: %.2fG\n", + av.id, av.plan_class, hu.peak_flops/(havp->pfc.get_avg()+1e-18)/1e+9, + 1e-9/havp->et.get_avg() ); } } else {