server: use gpu_active_frac in scheduling decisions

On some hosts, gpu_active_frac may be much less than active_frac
(i.e., the GPUs may be available for much less of the time than the CPUs).
Use gpu_active_frac in the following places:

- scheduler: in estimating the elapsed time of jobs,
    to decide whether they can meet their deadlines
- scheduler: in computing the effective speed of a (host, app version) pair,
    when deciding which size class it belongs to
- size_census: in computing the effective speed of (host, app version) pairs

(Previously, active_frac was used in all of these cases; see the sketch
below for the effect on duration estimates.)
David Anderson 2014-03-06 21:23:02 -08:00
parent 8ba022760b
commit 5381def663
5 changed files with 68 additions and 27 deletions
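To make the effect concrete, here is a minimal standalone sketch of the duration-estimate change. The host statistics, projected_flops, and rsc_fpops_est values are invented, and HostStats is only an illustrative stand-in for the fields the scheduler reads from g_reply->host and BEST_APP_VERSION:

    #include <cstdio>

    // Hypothetical host statistics as reported by the client.
    struct HostStats {
        double on_frac;          // fraction of time the client is running
        double active_frac;      // of that, fraction the CPUs may be used
        double gpu_active_frac;  // of that, fraction the GPUs may be used
    };

    int main() {
        HostStats h = {0.95, 0.90, 0.30};   // GPU available far less than CPU
        double projected_flops = 2.0e10;    // speed of a GPU app version (made up)
        double rsc_fpops_est   = 5.0e13;    // size of a workunit (made up)

        double unscaled = rsc_fpops_est / projected_flops;            // 2500 s
        double old_est  = unscaled / (h.on_frac * h.active_frac);     // ~2924 s
        double new_est  = unscaled / (h.on_frac * h.gpu_active_frac); // ~8772 s

        // With a two-hour delay bound, the old estimate says the job fits;
        // the GPU-aware estimate correctly says it does not.
        printf("unscaled %.0f s, old %.0f s, new %.0f s\n", unscaled, old_est, new_est);
        return 0;
    }

On such a host the scaled estimate for a GPU job roughly triples, which is exactly the case where the old active_frac-based estimate let the scheduler send work that could not finish by its deadline.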


@@ -131,4 +131,14 @@ function drand() {
return ((double)rand())/getrandmax();
}
+// kludge
+//
+function is_gpu($plan_class) {
+    if (strstr($plan_class, "ati")) return true;
+    if (strstr($plan_class, "cuda")) return true;
+    if (strstr($plan_class, "nvidia")) return true;
+    if (strstr($plan_class, "intel_gpu")) return true;
+    return false;
+}
?>


@@ -37,7 +37,7 @@ function do_app($app) {
// joined to the host
$db = BoincDb::get();
$query = "select et_avg, host.on_frac, host.active_frac " .
$query = "select et_avg, host.on_frac, host.active_frac, host.gpu_active_frac, app_version.plan_class " .
" from DBNAME.host_app_version, DBNAME.host, DBNAME.app_version " .
" where host_app_version.app_version_id = app_version.id " .
" and app_version.appid = $app->id " .
@@ -46,7 +46,17 @@ function do_app($app) {
$result = $db->do_query($query);
$a = array();
while ($x = mysql_fetch_object($result)) {
-        $a[] = (1/$x->et_avg) * $x->on_frac * $x->active_frac;
+        if (is_gpu($x->plan_class)) {
+            $av = $x->on_frac;
+            if ($x->gpu_active_frac) {
+                $av *= $x->gpu_active_frac;
+            } else {
+                $av *= $x->active_frac;
+            }
+        } else {
+            $av = $x->on_frac * $x->active_frac;
+        }
+        $a[] = (1/$x->et_avg) * $av;
}
mysql_free_result($result);
sort($a);


@@ -99,7 +99,7 @@ bool JOB::get_score(WU_RESULT& wu_result) {
}
if (app->n_size_classes > 1) {
-        double effective_speed = bavp->host_usage.projected_flops * g_reply->host.on_frac * g_reply->host.active_frac;
+        double effective_speed = bavp->host_usage.projected_flops * available_frac(*bavp);
int target_size = get_size_class(*app, effective_speed);
if (config.debug_send) {
log_messages.printf(MSG_NORMAL,

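The hunk above changes the size-class decision: effective speed is now projected_flops scaled by available_frac(*bavp), so a host whose GPU is rarely available is evaluated at a lower effective speed for GPU work. A rough sketch of how that might shift the chosen class, assuming size classes are defined by ascending speed bounds (the helper and the numbers here are hypothetical, not the actual get_size_class()):

    #include <cstdio>
    #include <vector>

    // Hypothetical mapping from effective speed to a size class;
    // the real bounds come from the size census, these are invented.
    static int pick_size_class(double effective_speed, const std::vector<double>& bounds) {
        for (size_t i = 0; i < bounds.size(); i++) {
            if (effective_speed <= bounds[i]) return (int)i;
        }
        return (int)bounds.size();   // faster than every bound: largest class
    }

    int main() {
        std::vector<double> bounds = {1e9, 1e10};   // three classes (made up)
        double projected_flops = 2e10;              // GPU app version (made up)

        // on_frac*gpu_active_frac = 0.95*0.30 vs. on_frac*active_frac = 0.95*0.90
        double gpu_aware = projected_flops * 0.285;  // 5.7e9   -> class 1
        double old_style = projected_flops * 0.855;  // 1.71e10 -> class 2
        printf("gpu-aware class %d, old class %d\n",
            pick_size_class(gpu_aware, bounds), pick_size_class(old_style, bounds));
        return 0;
    }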

@@ -409,28 +409,46 @@ static double estimate_duration_unscaled(WORKUNIT& wu, BEST_APP_VERSION& bav) {
return rsc_fpops_est/bav.host_usage.projected_flops;
}
-static inline void get_running_frac() {
-    double rf;
-    if (g_request->core_client_version<=41900) {
-        rf = g_reply->host.on_frac;
-    } else {
-        rf = g_reply->host.active_frac * g_reply->host.on_frac;
+// Compute cpu_available_frac and gpu_available_frac.
+// These are based on client-supplied data, so do sanity checks
+//
+#define FRAC_MIN 0.1
+static inline void clamp_frac(double& frac, const char* name) {
+    if (frac > 1) {
+        if (config.debug_send) {
+            log_messages.printf(MSG_NORMAL,
+                "[send] %s=%f; setting to 1\n", name, frac
+            );
+        }
+        frac = 1;
+    } else if (frac < FRAC_MIN) {
+        if (config.debug_send) {
+            log_messages.printf(MSG_NORMAL,
+                "[send] %s=%f; setting to %f\n", name, frac, FRAC_MIN
+            );
+        }
+        frac = .01;
+    }
+}
-    // clamp running_frac to a reasonable range
-    //
-    if (rf > 1) {
-        if (config.debug_send) {
-            log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 1\n", rf);
-        }
-        rf = 1;
-    } else if (rf < .1) {
-        if (config.debug_send) {
-            log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 0.1\n", rf);
-        }
-        rf = .1;
+static inline void get_available_fracs() {
+    if (g_request->core_client_version<=41900) {
+        g_wreq->cpu_available_frac = g_reply->host.on_frac;
+        g_wreq->gpu_available_frac = g_reply->host.on_frac; // irrelevant
+    } else {
+        g_wreq->cpu_available_frac = g_reply->host.active_frac * g_reply->host.on_frac;
+        g_wreq->gpu_available_frac = g_reply->host.gpu_active_frac * g_reply->host.on_frac;
+    }
+    clamp_frac(g_wreq->cpu_available_frac, "CPU available fraction");
+    clamp_frac(g_wreq->gpu_available_frac, "GPU available fraction");
+}
+double available_frac(BEST_APP_VERSION& bav) {
+    if (bav.host_usage.uses_gpu()) {
+        return g_wreq->gpu_available_frac;
+    } else {
+        return g_wreq->cpu_available_frac;
+    }
-    g_wreq->running_frac = rf;
}
// estimate the amount of real time to complete this WU,
@@ -440,7 +458,7 @@ static inline void get_running_frac() {
//
double estimate_duration(WORKUNIT& wu, BEST_APP_VERSION& bav) {
double edu = estimate_duration_unscaled(wu, bav);
-    double ed = edu/g_wreq->running_frac;
+    double ed = edu/available_frac(bav);
if (config.debug_send) {
log_messages.printf(MSG_NORMAL,
"[send] est. duration for WU %d: unscaled %.2f scaled %.2f\n",
@@ -1376,7 +1394,7 @@ void send_work_setup() {
g_wreq->disk_available = max_allowable_disk();
get_mem_sizes();
-    get_running_frac();
+    get_available_fracs();
g_wreq->get_job_limits();
// do sanity checking on GPU scheduling parameters
@@ -1464,9 +1482,10 @@ void send_work_setup() {
(int)g_request->global_prefs.work_buf_min()
);
log_messages.printf(MSG_NORMAL,
"[send] active_frac %f on_frac %f\n",
"[send] on_frac %f active_frac %f gpu_active_frac %f\n",
g_reply->host.on_frac,
g_reply->host.active_frac,
g_reply->host.on_frac
g_reply->host.gpu_active_frac
);
if (g_wreq->anonymous_platform) {
log_messages.printf(MSG_NORMAL,

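Because the availability fractions are computed from client-reported values, the patch runs them through clamp_frac() before use. Below is a simplified standalone version of that sanity check; it clamps into [FRAC_MIN, 1] and omits the logging, so it is an approximation of the patch's behavior rather than a copy of it:

    #include <algorithm>
    #include <cstdio>

    // Simplified stand-in for the clamp applied to client-reported fractions.
    // FRAC_MIN matches the value in the patch; logging is omitted here.
    static const double FRAC_MIN = 0.1;

    static double clamp_frac(double frac) {
        return std::max(FRAC_MIN, std::min(frac, 1.0));
    }

    int main() {
        // A host reporting gpu_active_frac = 0 would otherwise make the GPU
        // duration scale factor infinite; clamping bounds it at 1/FRAC_MIN.
        double gpu_available = clamp_frac(0.0 * 0.95);   // on_frac * gpu_active_frac
        double cpu_available = clamp_frac(1.3);          // bogus value above 1
        printf("gpu_available %.2f (max scale %.0fx), cpu_available %.2f\n",
            gpu_available, 1.0 / gpu_available, cpu_available);
        return 0;
    }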

@@ -415,7 +415,8 @@ struct WORK_REQ_BASE {
double disk_available;
double ram, usable_ram;
-    double running_frac;
+    double cpu_available_frac;
+    double gpu_available_frac;
int njobs_sent;
// The following keep track of the "easiest" job that was rejected
@@ -548,4 +549,5 @@ inline bool is_64b_platform(const char* name) {
return (strstr(name, "64") != NULL);
}
+extern double available_frac(BEST_APP_VERSION&);
#endif