server: use gpu_active_frac in scheduling decisions

On some hosts, gpu_active_frac may be much less than active_frac
(i.e., the GPUs may be available for much less of the time than the CPUs).
Use gpu_active_frac in the following places:

- scheduler: in estimating the elapsed time of jobs,
    to decide whether they can meet their deadlines
- scheduler: in computing the effective speed of a (host, app version) pair,
    when deciding which size class it belongs to
- size_census: in computing the effective speed of (host, app version) pairs

(Previously, active_frac was used in all of these cases; see the sketch
below for the effect on duration estimates.)
David Anderson 2014-03-06 21:23:02 -08:00
parent 8ba022760b
commit 5381def663
5 changed files with 68 additions and 27 deletions
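To make the effect concrete, here is a minimal standalone sketch of the duration-estimate change. The host statistics, projected_flops, and rsc_fpops_est values are invented, and HostStats is only an illustrative stand-in for the fields the scheduler reads from g_reply->host and BEST_APP_VERSION:

    #include <cstdio>

    // Hypothetical host statistics as reported by the client.
    struct HostStats {
        double on_frac;          // fraction of time the client is running
        double active_frac;      // of that, fraction the CPUs may be used
        double gpu_active_frac;  // of that, fraction the GPUs may be used
    };

    int main() {
        HostStats h = {0.95, 0.90, 0.30};   // GPU available far less than CPU
        double projected_flops = 2.0e10;    // speed of a GPU app version (made up)
        double rsc_fpops_est   = 5.0e13;    // size of a workunit (made up)

        double unscaled = rsc_fpops_est / projected_flops;            // 2500 s
        double old_est  = unscaled / (h.on_frac * h.active_frac);     // ~2924 s
        double new_est  = unscaled / (h.on_frac * h.gpu_active_frac); // ~8772 s

        // With a two-hour delay bound, the old estimate says the job fits;
        // the GPU-aware estimate correctly says it does not.
        printf("unscaled %.0f s, old %.0f s, new %.0f s\n", unscaled, old_est, new_est);
        return 0;
    }

On such a host the scaled estimate for a GPU job roughly triples, which is exactly the case where the old active_frac-based estimate let the scheduler send work that could not finish by its deadline.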


@@ -131,4 +131,14 @@ function drand() {
return ((double)rand())/getrandmax();
}
+// kludge
+//
+function is_gpu($plan_class) {
+    if (strstr($plan_class, "ati")) return true;
+    if (strstr($plan_class, "cuda")) return true;
+    if (strstr($plan_class, "nvidia")) return true;
+    if (strstr($plan_class, "intel_gpu")) return true;
+    return false;
+}
?>


@@ -37,7 +37,7 @@ function do_app($app) {
// joined to the host
$db = BoincDb::get();
$query = "select et_avg, host.on_frac, host.active_frac " .
$query = "select et_avg, host.on_frac, host.active_frac, host.gpu_active_frac, app_version.plan_class " .
" from DBNAME.host_app_version, DBNAME.host, DBNAME.app_version " .
" where host_app_version.app_version_id = app_version.id " .
" and app_version.appid = $app->id " .
@@ -46,7 +46,17 @@ function do_app($app) {
$result = $db->do_query($query);
$a = array();
while ($x = mysql_fetch_object($result)) {
-        $a[] = (1/$x->et_avg) * $x->on_frac * $x->active_frac;
+        if (is_gpu($x->plan_class)) {
+            $av = $x->on_frac;
+            if ($x->gpu_active_frac) {
+                $av *= $x->gpu_active_frac;
+            } else {
+                $av *= $x->active_frac;
+            }
+        } else {
+            $av = $x->on_frac * $x->active_frac;
+        }
+        $a[] = (1/$x->et_avg) * $av;
}
mysql_free_result($result);
sort($a);


@@ -99,7 +99,7 @@ bool JOB::get_score(WU_RESULT& wu_result) {
}
if (app->n_size_classes > 1) {
-        double effective_speed = bavp->host_usage.projected_flops * g_reply->host.on_frac * g_reply->host.active_frac;
+        double effective_speed = bavp->host_usage.projected_flops * available_frac(*bavp);
int target_size = get_size_class(*app, effective_speed);
if (config.debug_send) {
log_messages.printf(MSG_NORMAL,

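The hunk above changes the size-class decision: effective speed is now projected_flops scaled by available_frac(*bavp), so a host whose GPU is rarely available is evaluated at a lower effective speed for GPU work. A rough sketch of how that might shift the chosen class, assuming size classes are defined by ascending speed bounds (the helper and the numbers here are hypothetical, not the actual get_size_class()):

    #include <cstdio>
    #include <vector>

    // Hypothetical mapping from effective speed to a size class;
    // the real bounds come from the size census, these are invented.
    static int pick_size_class(double effective_speed, const std::vector<double>& bounds) {
        for (size_t i = 0; i < bounds.size(); i++) {
            if (effective_speed <= bounds[i]) return (int)i;
        }
        return (int)bounds.size();   // faster than every bound: largest class
    }

    int main() {
        std::vector<double> bounds = {1e9, 1e10};   // three classes (made up)
        double projected_flops = 2e10;              // GPU app version (made up)

        // on_frac*gpu_active_frac = 0.95*0.30 vs. on_frac*active_frac = 0.95*0.90
        double gpu_aware = projected_flops * 0.285;  // 5.7e9   -> class 1
        double old_style = projected_flops * 0.855;  // 1.71e10 -> class 2
        printf("gpu-aware class %d, old class %d\n",
            pick_size_class(gpu_aware, bounds), pick_size_class(old_style, bounds));
        return 0;
    }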

@@ -409,28 +409,46 @@ static double estimate_duration_unscaled(WORKUNIT& wu, BEST_APP_VERSION& bav) {
return rsc_fpops_est/bav.host_usage.projected_flops;
}
-static inline void get_running_frac() {
-    double rf;
-    if (g_request->core_client_version<=41900) {
-        rf = g_reply->host.on_frac;
-    } else {
-        rf = g_reply->host.active_frac * g_reply->host.on_frac;
+// Compute cpu_available_frac and gpu_available_frac.
+// These are based on client-supplied data, so do sanity checks
+//
+#define FRAC_MIN 0.1
+static inline void clamp_frac(double& frac, const char* name) {
+    if (frac > 1) {
+        if (config.debug_send) {
+            log_messages.printf(MSG_NORMAL,
+                "[send] %s=%f; setting to 1\n", name, frac
+            );
+        }
+        frac = 1;
+    } else if (frac < FRAC_MIN) {
+        if (config.debug_send) {
+            log_messages.printf(MSG_NORMAL,
+                "[send] %s=%f; setting to %f\n", name, frac, FRAC_MIN
+            );
+        }
+        frac = .01;
+    }
+}
-    // clamp running_frac to a reasonable range
-    //
-    if (rf > 1) {
-        if (config.debug_send) {
-            log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 1\n", rf);
-        }
-        rf = 1;
-    } else if (rf < .1) {
-        if (config.debug_send) {
-            log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 0.1\n", rf);
-        }
-        rf = .1;
+static inline void get_available_fracs() {
+    if (g_request->core_client_version<=41900) {
+        g_wreq->cpu_available_frac = g_reply->host.on_frac;
+        g_wreq->gpu_available_frac = g_reply->host.on_frac; // irrelevant
+    } else {
+        g_wreq->cpu_available_frac = g_reply->host.active_frac * g_reply->host.on_frac;
+        g_wreq->gpu_available_frac = g_reply->host.gpu_active_frac * g_reply->host.on_frac;
+    }
+    clamp_frac(g_wreq->cpu_available_frac, "CPU available fraction");
+    clamp_frac(g_wreq->gpu_available_frac, "GPU available fraction");
+}
+double available_frac(BEST_APP_VERSION& bav) {
+    if (bav.host_usage.uses_gpu()) {
+        return g_wreq->gpu_available_frac;
+    } else {
+        return g_wreq->cpu_available_frac;
+    }
-    g_wreq->running_frac = rf;
}
// estimate the amount of real time to complete this WU,
@@ -440,7 +458,7 @@ static inline void get_running_frac() {
//
double estimate_duration(WORKUNIT& wu, BEST_APP_VERSION& bav) {
double edu = estimate_duration_unscaled(wu, bav);
-    double ed = edu/g_wreq->running_frac;
+    double ed = edu/available_frac(bav);
if (config.debug_send) {
log_messages.printf(MSG_NORMAL,
"[send] est. duration for WU %d: unscaled %.2f scaled %.2f\n",
@@ -1376,7 +1394,7 @@ void send_work_setup() {
g_wreq->disk_available = max_allowable_disk();
get_mem_sizes();
-    get_running_frac();
+    get_available_fracs();
g_wreq->get_job_limits();
// do sanity checking on GPU scheduling parameters
@@ -1464,9 +1482,10 @@ void send_work_setup() {
(int)g_request->global_prefs.work_buf_min()
);
log_messages.printf(MSG_NORMAL,
"[send] active_frac %f on_frac %f\n",
"[send] on_frac %f active_frac %f gpu_active_frac %f\n",
g_reply->host.on_frac,
g_reply->host.active_frac,
g_reply->host.on_frac
g_reply->host.gpu_active_frac
);
if (g_wreq->anonymous_platform) {
log_messages.printf(MSG_NORMAL,

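Because the availability fractions are computed from client-reported values, the patch runs them through clamp_frac() before use. Below is a simplified standalone version of that sanity check; it clamps into [FRAC_MIN, 1] and omits the logging, so it is an approximation of the patch's behavior rather than a copy of it:

    #include <algorithm>
    #include <cstdio>

    // Simplified stand-in for the clamp applied to client-reported fractions.
    // FRAC_MIN matches the value in the patch; logging is omitted here.
    static const double FRAC_MIN = 0.1;

    static double clamp_frac(double frac) {
        return std::max(FRAC_MIN, std::min(frac, 1.0));
    }

    int main() {
        // A host reporting gpu_active_frac = 0 would otherwise make the GPU
        // duration scale factor infinite; clamping bounds it at 1/FRAC_MIN.
        double gpu_available = clamp_frac(0.0 * 0.95);   // on_frac * gpu_active_frac
        double cpu_available = clamp_frac(1.3);          // bogus value above 1
        printf("gpu_available %.2f (max scale %.0fx), cpu_available %.2f\n",
            gpu_available, 1.0 / gpu_available, cpu_available);
        return 0;
    }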

@@ -415,7 +415,8 @@ struct WORK_REQ_BASE {
double disk_available;
double ram, usable_ram;
-    double running_frac;
+    double cpu_available_frac;
+    double gpu_available_frac;
int njobs_sent;
// The following keep track of the "easiest" job that was rejected
@@ -548,4 +549,5 @@ inline bool is_64b_platform(const char* name) {
return (strstr(name, "64") != NULL);
}
+extern double available_frac(BEST_APP_VERSION&);
#endif