mirror of https://github.com/BOINC/boinc.git
server: use gpu_active_frac in scheduling decisions
On some hosts, gpu_active_frac may be much less than active_frac (i.e., GPUs may be available for a much smaller fraction of the time than CPUs). Use gpu_active_frac in the following places:
- scheduler: in estimating the elapsed time of jobs, to decide whether they can meet their deadline
- scheduler: in computing the effective speed of a (host, app version) pair, when deciding what size class it belongs to
- size_census: in computing the effective speed of (host, app version) pairs
(Previously, we were just using active_frac in all these cases.)
This commit is contained in:
parent
8ba022760b
commit
5381def663
|
@ -131,4 +131,14 @@ function drand() {
|
|||
return ((double)rand())/getrandmax();
|
||||
}
|
||||
|
||||
// Kludge: guess whether a plan class belongs to a GPU app version
// by looking for known GPU-related substrings in its name.
// Returns true if $plan_class contains any of the markers, else false.
// This is a plain substring match, so any plan class whose name
// happens to contain a marker (e.g. "ati") is reported as GPU.
//
function is_gpu($plan_class) {
    $gpu_markers = array("ati", "cuda", "nvidia", "intel_gpu");
    foreach ($gpu_markers as $marker) {
        if (strpos($plan_class, $marker) !== false) {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
?>
|
||||
|
|
|
@ -37,7 +37,7 @@ function do_app($app) {
|
|||
// joined to the host
|
||||
|
||||
$db = BoincDb::get();
|
||||
$query = "select et_avg, host.on_frac, host.active_frac " .
|
||||
$query = "select et_avg, host.on_frac, host.active_frac, host.gpu_active_frac, app_version.plan_class " .
|
||||
" from DBNAME.host_app_version, DBNAME.host, DBNAME.app_version " .
|
||||
" where host_app_version.app_version_id = app_version.id " .
|
||||
" and app_version.appid = $app->id " .
|
||||
|
@ -46,7 +46,17 @@ function do_app($app) {
|
|||
$result = $db->do_query($query);
|
||||
$a = array();
|
||||
while ($x = mysql_fetch_object($result)) {
|
||||
$a[] = (1/$x->et_avg) * $x->on_frac * $x->active_frac;
|
||||
if (is_gpu($x->plan_class)) {
|
||||
$av = $x->on_frac;
|
||||
if ($x->gpu_active_frac) {
|
||||
$av *= $x->gpu_active_frac;
|
||||
} else {
|
||||
$av *= $x->active_frac;
|
||||
}
|
||||
} else {
|
||||
$av = $x->on_frac * $x->active_frac;
|
||||
}
|
||||
$a[] = (1/$x->et_avg) * $av;
|
||||
}
|
||||
mysql_free_result($result);
|
||||
sort($a);
|
||||
|
|
|
@ -99,7 +99,7 @@ bool JOB::get_score(WU_RESULT& wu_result) {
|
|||
}
|
||||
|
||||
if (app->n_size_classes > 1) {
|
||||
double effective_speed = bavp->host_usage.projected_flops * g_reply->host.on_frac * g_reply->host.active_frac;
|
||||
double effective_speed = bavp->host_usage.projected_flops * available_frac(*bavp);
|
||||
int target_size = get_size_class(*app, effective_speed);
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
|
|
|
@ -409,28 +409,46 @@ static double estimate_duration_unscaled(WORKUNIT& wu, BEST_APP_VERSION& bav) {
|
|||
return rsc_fpops_est/bav.host_usage.projected_flops;
|
||||
}
|
||||
|
||||
static inline void get_running_frac() {
|
||||
double rf;
|
||||
if (g_request->core_client_version<=41900) {
|
||||
rf = g_reply->host.on_frac;
|
||||
} else {
|
||||
rf = g_reply->host.active_frac * g_reply->host.on_frac;
|
||||
// Compute cpu_available_frac and gpu_available_frac.
|
||||
// These are based on client-supplied data, so do sanity checks
|
||||
//
|
||||
#define FRAC_MIN 0.1
|
||||
static inline void clamp_frac(double& frac, const char* name) {
|
||||
if (frac > 1) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] %s=%f; setting to 1\n", name, frac
|
||||
);
|
||||
}
|
||||
frac = 1;
|
||||
} else if (frac < FRAC_MIN) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] %s=%f; setting to %f\n", name, frac, FRAC_MIN
|
||||
);
|
||||
}
|
||||
frac = .01;
|
||||
}
|
||||
}
|
||||
|
||||
// clamp running_frac to a reasonable range
|
||||
//
|
||||
if (rf > 1) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 1\n", rf);
|
||||
}
|
||||
rf = 1;
|
||||
} else if (rf < .1) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 0.1\n", rf);
|
||||
}
|
||||
rf = .1;
|
||||
static inline void get_available_fracs() {
|
||||
if (g_request->core_client_version<=41900) {
|
||||
g_wreq->cpu_available_frac = g_reply->host.on_frac;
|
||||
g_wreq->gpu_available_frac = g_reply->host.on_frac; // irrelevant
|
||||
} else {
|
||||
g_wreq->cpu_available_frac = g_reply->host.active_frac * g_reply->host.on_frac;
|
||||
g_wreq->gpu_available_frac = g_reply->host.gpu_active_frac * g_reply->host.on_frac;
|
||||
}
|
||||
clamp_frac(g_wreq->cpu_available_frac, "CPU available fraction");
|
||||
clamp_frac(g_wreq->gpu_available_frac, "GPU available fraction");
|
||||
}
|
||||
|
||||
// Return the available fraction that applies to the given app version:
// g_wreq->gpu_available_frac if bav.host_usage.uses_gpu() is true,
// otherwise g_wreq->cpu_available_frac.
//
double available_frac(BEST_APP_VERSION& bav) {
    return bav.host_usage.uses_gpu()
        ? g_wreq->gpu_available_frac
        : g_wreq->cpu_available_frac;
}
|
||||
|
||||
// estimate the amount of real time to complete this WU,
|
||||
|
@ -440,7 +458,7 @@ static inline void get_running_frac() {
|
|||
//
|
||||
double estimate_duration(WORKUNIT& wu, BEST_APP_VERSION& bav) {
|
||||
double edu = estimate_duration_unscaled(wu, bav);
|
||||
double ed = edu/g_wreq->running_frac;
|
||||
double ed = edu/available_frac(bav);
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] est. duration for WU %d: unscaled %.2f scaled %.2f\n",
|
||||
|
@ -1376,7 +1394,7 @@ void send_work_setup() {
|
|||
|
||||
g_wreq->disk_available = max_allowable_disk();
|
||||
get_mem_sizes();
|
||||
get_running_frac();
|
||||
get_available_fracs();
|
||||
g_wreq->get_job_limits();
|
||||
|
||||
// do sanity checking on GPU scheduling parameters
|
||||
|
@ -1464,9 +1482,10 @@ void send_work_setup() {
|
|||
(int)g_request->global_prefs.work_buf_min()
|
||||
);
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] active_frac %f on_frac %f\n",
|
||||
"[send] on_frac %f active_frac %f gpu_active_frac %f\n",
|
||||
g_reply->host.on_frac,
|
||||
g_reply->host.active_frac,
|
||||
g_reply->host.on_frac
|
||||
g_reply->host.gpu_active_frac
|
||||
);
|
||||
if (g_wreq->anonymous_platform) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
|
|
|
@ -415,7 +415,8 @@ struct WORK_REQ_BASE {
|
|||
|
||||
double disk_available;
|
||||
double ram, usable_ram;
|
||||
double running_frac;
|
||||
double cpu_available_frac;
|
||||
double gpu_available_frac;
|
||||
int njobs_sent;
|
||||
|
||||
// The following keep track of the "easiest" job that was rejected
|
||||
|
@ -548,4 +549,5 @@ inline bool is_64b_platform(const char* name) {
|
|||
return (strstr(name, "64") != NULL);
|
||||
}
|
||||
|
||||
extern double available_frac(BEST_APP_VERSION&);
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue