- Added scheduler config boolean option <estimate_flops_from_hav_pfc>, which
  allows projected_flops to be calculated from host_app_version pfc rather
  than elapsed time.  This is valuable if result elapsed times are highly
  variable and dependent on input.
This commit is contained in:
Eric J Korpela 2013-04-30 16:30:27 -07:00
parent abb72ddc9e
commit fd5c8c6e82
3 changed files with 28 additions and 5 deletions

View File

@ -171,6 +171,7 @@ int SCHED_CONFIG::parse(FILE* f) {
if (xp.parse_bool("enable_assignment_multi", enable_assignment_multi)) continue;
if (xp.parse_bool("job_size_matching", job_size_matching)) continue;
if (xp.parse_bool("dont_send_jobs", dont_send_jobs)) continue;
if (xp.parse_bool("estimate_flops_from_hav_pfc", estimate_flops_from_hav_pfc)) continue;
//////////// STUFF RELEVANT ONLY TO SCHEDULER STARTS HERE ///////

View File

@ -176,6 +176,9 @@ struct SCHED_CONFIG {
// DEPRECATED: use assignment instead
bool workload_sim;
// Do workload simulation in deciding whether to send a result
bool estimate_flops_from_hav_pfc;
// Use host_app_version peak flop count rather than elapsed time
// to calculate projected_flops when choosing version.
// scheduler log flags
//

View File

@ -322,8 +322,10 @@ void estimate_flops_anon_platform() {
}
// compute HOST_USAGE::projected_flops as best we can:
// 1) if we have statistics for (host, app version) elapsed time,
// use those.
//
// 1) if we have statistics for (host, app version):
// use pfc_avg if <estimate_flops_from_hav_pfc> is set,
// otherwise use elapsed time.
// 2) if we have statistics for app version elapsed time, use those.
// 3) else use a conservative estimate (p_fpops*(cpus+gpus))
// This prevents jobs from aborting with "time limit exceeded"
@ -332,7 +334,12 @@ void estimate_flops_anon_platform() {
void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id);
if (havp && havp->et.n > MIN_HOST_SAMPLES) {
double new_flops = 1./havp->et.get_avg();
double new_flops;
if (config.estimate_flops_from_hav_pfc) {
new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18);
} else {
new_flops = 1./havp->et.get_avg();
}
// cap this at ET_RATIO_LIMIT*projected,
// in case we've had a bunch of short jobs recently
//
@ -350,11 +357,23 @@ void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
hu.projected_flops = new_flops;
if (config.debug_version_select) {
if (config.estimate_flops_from_hav_pfc) {
log_messages.printf(MSG_NORMAL,
"[version] [AV#%d] (%s) setting projected flops based on host_app_version pfc: %.2fG\n",
av.id, av.plan_class, hu.projected_flops/1e9
);
} else {
log_messages.printf(MSG_NORMAL,
"[version] [AV#%d] (%s) setting projected flops based on host elapsed time avg: %.2fG\n",
av.id, av.plan_class, hu.projected_flops/1e9
);
}
log_messages.printf(MSG_NORMAL,
"[version] [AV#%d] (%s) comparison pfc: %.2fG et: %.2fG\n",
av.id, av.plan_class, hu.peak_flops/(havp->pfc.get_avg()+1e-18)/1e+9,
1e-9/havp->et.get_avg()
);
}
} else {
if (av.pfc.n > MIN_VERSION_SAMPLES) {
hu.projected_flops = hu.peak_flops/av.pfc.get_avg();