mirror of https://github.com/BOINC/boinc.git
- client: change work fetch policy to avoid starving GPUs in situations where GPU exclusions are used.
- client: fix bug in round-robin simulation when GPU exclusions are used.
Note: this fixes a major problem (starvation) with project-level GPU exclusion. However, project-level GPU exclusion interferes with most of the client's scheduling policies. E.g., round-robin simulation doesn't take GPU exclusion into account, and the resulting completion estimates and device shortfalls can be wrong by an order of magnitude. The only way I can see to fix this would be to model each GPU instance as a separate resource, and to associate each job with a particular GPU instance. This would be a sweeping change in both client and server.
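For context, project-level GPU exclusion of the kind discussed above is configured in the client's cc_config.xml. A minimal sketch (the project URL and device number are placeholders); because no <app> element is given, the exclusion applies to all of that project's applications on the device:

    <cc_config>
        <options>
            <exclude_gpu>
                <url>http://example-project.org/</url>
                <device_num>0</device_num>
            </exclude_gpu>
        </options>
    </cc_config>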
parent 02213ba568
commit 777f1f11e8

@@ -6274,3 +6274,29 @@ Rom 19 Oct 2012
         uc2_graphics.vcproj
         vboxwrapper.vcproj
         wrapper.vcproj
+
+David 19 Oct 2012
+    - client: change work fetch policy to avoid starving GPUs
+        in situations where GPU exclusions are used.
+    - client: fix bug in round-robin simulation when GPU exclusions are used.
+
+    Note: this fixes a major problem (starvation)
+    with project-level GPU exclusion.
+    However, project-level GPU exclusion interferes with most of
+    the client's scheduling policies.
+    E.g., round-robin simulation doesn't take GPU exclusion into account,
+    and the resulting completion estimates and device shortfalls
+    can be wrong by an order of magnitude.
+
+    The only way I can see to fix this would be to model each
+    GPU instance as a separate resource,
+    and to associate each job with a particular GPU instance.
+    This would be a sweeping change in both client and server.
+
+    client/
+        log_flags.cpp
+        project.cpp,h
+        rr_sim.cpp
+        work_fetch.cpp,h
+    lib/
+        coproc.h

@@ -531,6 +531,7 @@ void process_gpu_exclusions() {
         for (int k=1; k<coprocs.n_rsc; k++) {
             int n=0;
             COPROC& cp = coprocs.coprocs[k];
+            p->rsc_pwf[k].non_excluded_instances = (1<<cp.count)-1;  // all 1's
             for (j=0; j<config.exclude_gpus.size(); j++) {
                 EXCLUDE_GPU& eg = config.exclude_gpus[j];
                 if (strcmp(eg.url.c_str(), p->master_url)) continue;

@@ -539,14 +540,16 @@ void process_gpu_exclusions() {
                 if (eg.device_num >= 0) {
                     // exclusion may refer to nonexistent GPU
                     //
-                    if (cp.device_num_exists(eg.device_num)) {
+                    int ind = cp.device_num_index(eg.device_num);
+                    if (ind >= 0) {
                         n++;
+                        p->rsc_pwf[k].non_excluded_instances &= ~(1<<ind);
                     }
                 } else {
                     n = cp.count;
                 }
             }
-            p->ncoprocs_excluded[k] = n;
+            p->rsc_pwf[k].ncoprocs_excluded = n;
         }
     }
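
A self-contained sketch (illustrative values, not BOINC code) of the bookkeeping the two hunks above add to process_gpu_exclusions(): start with every instance allowed, clear one bit per excluded device, and count the exclusions:

    #include <cstdio>

    // Illustrative only: mirrors the bitmask bookkeeping in process_gpu_exclusions().
    int main() {
        int gpu_count = 3;                       // cp.count
        int excluded_devices[] = {0, 2};         // device indices excluded for one project

        int non_excluded_instances = (1 << gpu_count) - 1;   // all 1's: 0b111
        int ncoprocs_excluded = 0;
        for (int ind : excluded_devices) {
            non_excluded_instances &= ~(1 << ind);           // clear the excluded instance's bit
            ncoprocs_excluded++;
        }
        // Prints: non_excluded_instances 0x2 ncoprocs_excluded 2
        printf("non_excluded_instances 0x%x ncoprocs_excluded %d\n",
            non_excluded_instances, ncoprocs_excluded
        );
        return 0;
    }

The resulting per-project bitmap (non_excluded_instances) and count (ncoprocs_excluded) are what the round-robin simulation and work fetch code below consult.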

@@ -419,7 +419,7 @@ int PROJECT::write_state(MIOFILE& out, bool gui_rpc) {
         if (no_rsc_pref[j]) {
             out.printf("    <no_rsc_pref>%s</no_rsc_pref>\n", rsc_name(j));
         }
-        if (j>0 && gui_rpc && (ncoprocs_excluded[j] == rsc_work_fetch[j].ninstances)) {
+        if (j>0 && gui_rpc && (rsc_pwf[j].ncoprocs_excluded == rsc_work_fetch[j].ninstances)) {
             out.printf("    <no_rsc_config>%s</no_rsc_config>\n", rsc_name(j));
         }
     }

@@ -246,8 +246,6 @@ struct PROJECT : PROJ_AM {
         //
     double rr_sim_cpu_share;
     bool rr_sim_active;
-    int ncoprocs_excluded[MAX_RSC];
-        // number of excluded instances per processor type
     bool operator<(const PROJECT& p) {
         return sched_priority > p.sched_priority;
     }

@@ -74,6 +74,7 @@ struct RR_SIM {
         int rt = rp->avp->gpu_usage.rsc_type;
         if (rt) {
             rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage;
+            rsc_work_fetch[rt].sim_used_instances |= p->rsc_pwf[rt].non_excluded_instances;
             p->rsc_pwf[rt].sim_nused += rp->avp->gpu_usage.usage;
         }
     }

@@ -169,10 +170,14 @@ void RR_SIM::init_pending_lists() {
     }
 }

-// pick jobs to run; put them in "active" list.
+// Pick jobs to run, putting them in "active" list.
 // Simulate what the job scheduler would do:
 // pick a job from the project P with highest scheduling priority,
-// then adjust P's scheduling priority
+// then adjust P's scheduling priority.
+//
+// This is called at the start of the simulation,
+// and again each time a job finishes.
+// In the latter case, some resources may be saturated.
 //
 void RR_SIM::pick_jobs_to_run(double reltime) {
     active.clear();
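
The comment block above describes the selection policy; a rough sketch of that heap-based pattern, using made-up types and a simplified priority adjustment (the real code operates on PROJECT pointers and per-resource saturation checks):

    #include <algorithm>
    #include <vector>

    // Illustrative only: the priority-heap idiom described above, with made-up types.
    struct Proj {
        double sched_priority;
        int runnable_jobs;
        // max-heap: highest scheduling priority at the front
        bool operator<(const Proj& p) const { return sched_priority < p.sched_priority; }
    };

    void pick_jobs(std::vector<Proj>& heap, int free_slots) {
        std::make_heap(heap.begin(), heap.end());
        while (free_slots > 0 && !heap.empty()) {
            Proj& p = heap.front();              // project with highest scheduling priority
            if (p.runnable_jobs == 0) {          // nothing left: drop it from the heap
                std::pop_heap(heap.begin(), heap.end());
                heap.pop_back();
                continue;
            }
            p.runnable_jobs--;                   // "schedule" one of its jobs
            free_slots--;
            p.sched_priority -= 0.1;             // stand-in for the real priority adjustment
            std::make_heap(heap.begin(), heap.end());   // reshuffle after the change
        }
    }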

@@ -241,7 +246,18 @@ void RR_SIM::pick_jobs_to_run(double reltime) {
         // check whether resource is saturated
         //
         if (rt) {
-            if (rsc_work_fetch[rt].sim_nused >= coprocs.coprocs[rt].count - p->ncoprocs_excluded[rt]) break;
+            if (rsc_work_fetch[rt].sim_nused >= coprocs.coprocs[rt].count) {
+                break;
+            }
+
+            // if a GPU isn't saturated but this project is using
+            // its max given exclusions, remove it from project heap
+            //
+            if (rsc_pwf.sim_nused >= coprocs.coprocs[rt].count - p->rsc_pwf[rt].ncoprocs_excluded) {
+                pop_heap(project_heap.begin(), project_heap.end());
+                project_heap.pop_back();
+                continue;
+            }
         } else {
             if (rsc_work_fetch[rt].sim_nused >= gstate.ncpus) break;
         }
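
A worked example, with made-up numbers, of the two checks just added: a 2-GPU host and a project excluded from one GPU. The device as a whole is not saturated, but the project has hit its own limit, so it is removed from the project heap:

    #include <cstdio>

    // Made-up numbers for the two saturation checks above.
    int main() {
        double count = 2;                 // coprocs.coprocs[rt].count
        int project_excluded = 1;         // p->rsc_pwf[rt].ncoprocs_excluded
        double device_used = 1;           // rsc_work_fetch[rt].sim_nused
        double project_used = 1;          // p->rsc_pwf[rt].sim_nused

        bool device_saturated = device_used >= count;                      // false: 1 < 2
        bool project_at_limit = project_used >= count - project_excluded;  // true:  1 >= 1

        // Prints: device_saturated 0 project_at_limit 1
        // i.e. the GPU still has a free instance, but this project can't use it,
        // so it is popped from the project heap and another project gets a chance.
        printf("device_saturated %d project_at_limit %d\n",
            device_saturated, project_at_limit
        );
        return 0;
    }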

@@ -255,7 +271,7 @@ void RR_SIM::pick_jobs_to_run(double reltime) {
             pop_heap(project_heap.begin(), project_heap.end());
             project_heap.pop_back();
         } else if (!rp->rrsim_done) {
-            // Otherwise reshuffle the heap
+            // Otherwise reshuffle the project heap
             //
             make_heap(project_heap.begin(), project_heap.end());
         }

@@ -401,7 +417,9 @@ void RR_SIM::simulate() {
                 }
             }
         }
-        // adjust FLOPS left
+
+        // adjust FLOPS left of other active jobs
+        //
         for (unsigned int i=0; i<active.size(); i++) {
             rp = active[i];
             rp->rrsim_flops_left -= rp->rrsim_flops*delta_t;

@@ -464,6 +482,19 @@ void RR_SIM::simulate() {
         sim_now += delta_t;
     }

+    // identify GPU instances starved because of exclusions
+    //
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        RSC_WORK_FETCH& rwf = rsc_work_fetch[i];
+        COPROC& cp = coprocs.coprocs[i];
+        int mask = (1<<cp.count)-1;
+        rwf.sim_excluded_instances = ~(rwf.sim_used_instances) & mask;
+        msg_printf(0, MSG_INFO,
+            "rsc %d: sim_used_inst %d mask %d sim_excluded_instances %d",
+            i, rwf.sim_used_instances, mask, rwf.sim_excluded_instances
+        );
+    }
+
     // if simulation ends before end of buffer, take the tail into account
     //
     if (sim_now < buf_end) {
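
A small sketch (made-up values) of how that loop derives the starved-instance bitmap: any instance whose bit was never set in sim_used_instances during the simulation is considered starved by exclusions:

    #include <cstdio>

    // Illustrative only. Suppose a 3-GPU host where the simulation only ever
    // placed jobs on instances 0 and 1 (e.g. every project excludes device 2).
    int main() {
        int count = 3;                        // cp.count
        int sim_used_instances = 0x3;         // bits 0 and 1 set during RR_SIM

        int mask = (1 << count) - 1;          // 0x7: all existing instances
        int sim_excluded_instances = ~sim_used_instances & mask;

        // Prints: sim_excluded_instances 0x4  -> instance 2 is starved by exclusions
        printf("sim_excluded_instances 0x%x\n", sim_excluded_instances);
        return 0;
    }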

@@ -187,6 +187,7 @@ void RSC_WORK_FETCH::rr_init() {
     deadline_missed_instances = 0;
     saturated_time = 0;
     busy_time_estimator.reset();
+    sim_used_instances = 0;
 }

 void RSC_WORK_FETCH::accumulate_shortfall(double d_time) {

@@ -204,6 +205,7 @@ void RSC_WORK_FETCH::accumulate_shortfall(double d_time) {

 void RSC_WORK_FETCH::update_saturated_time(double dt) {
     double idle = ninstances - sim_nused;
+    //msg_printf(0, MSG_INFO, "update_saturated rsc %d idle %f dt %f", rsc_type, idle, dt);
     if (idle < 1e-6) {
         saturated_time = dt;
     }

@@ -222,6 +224,10 @@ static bool wacky_dcf(PROJECT* p) {
 // If this resource is below min buffer level,
 // return the highest-priority project that may have jobs for it.
 //
+// If the resource has instances starved because of exclusions,
+// return the highest-priority project that may have jobs
+// and doesn't exclude those instances.
+//
 // If strict is true, enforce hysteresis and backoff rules
 // (which are there to limit rate of scheduler RPCs).
 // Otherwise, we're going to do a scheduler RPC anyway

@@ -230,12 +236,21 @@ static bool wacky_dcf(PROJECT* p) {
 //
 PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {
     PROJECT* pbest = NULL;
+    bool buffer_low = true;
     if (strict) {
-        if (saturated_time > gstate.work_buf_min()) return NULL;
+        if (saturated_time > gstate.work_buf_min()) buffer_low = false;
     } else {
-        if (saturated_time > gstate.work_buf_total()) return NULL;
+        if (saturated_time > gstate.work_buf_total()) buffer_low = false;
     }
-    if (saturated_time > gstate.work_buf_total()) return NULL;
+
+    if (log_flags.work_fetch_debug) {
+        msg_printf(0, MSG_INFO,
+            "[work_fetch] buffer_low: %s; sim_excluded_instances %d\n",
+            buffer_low?"yes":"no", sim_excluded_instances
+        );
+    }
+
+    if (!buffer_low && !sim_excluded_instances) return NULL;

     for (unsigned i=0; i<gstate.projects.size(); i++) {
         PROJECT* p = gstate.projects[i];

@@ -270,11 +285,11 @@ PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {
         // computing shortfall etc. on a per-project basis
         //
         if (rsc_type) {
-            int n_not_excluded = ninstances - p->ncoprocs_excluded[rsc_type];
+            int n_not_excluded = ninstances - p->rsc_pwf[rsc_type].ncoprocs_excluded;
             if (n_not_excluded == 0) {
                 continue;
             }
-            if (p->ncoprocs_excluded[rsc_type]
+            if (p->rsc_pwf[rsc_type].ncoprocs_excluded
                 && p->rsc_pwf[rsc_type].n_runnable_jobs > n_not_excluded
             ) {
                 continue;

@@ -283,6 +298,16 @@ PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {

         RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
         if (rpwf.anon_skip) continue;
+
+        // if we're sending work only because of exclusion starvation,
+        // make sure this project can use the starved instances
+        //
+        if (!buffer_low) {
+            if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) {
+                continue;
+            }
+        }
+
         if (pbest) {
             if (pbest->sched_priority > p->sched_priority) {
                 continue;

@@ -292,7 +317,11 @@ PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {
     }
     if (!pbest) return NULL;
     work_fetch.clear_request();
-    work_fetch.set_all_requests_hyst(pbest, rsc_type);
+    if (buffer_low) {
+        work_fetch.set_all_requests_hyst(pbest, rsc_type);
+    } else {
+        set_request_excluded(pbest);
+    }
     return pbest;
 }
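
Taken together, the changes to choose_project_hyst() amount to the following decision, shown here as a condensed sketch with illustrative names rather than the real member variables:

    // Condensed restatement of the new choose_project_hyst() logic (illustrative).
    // buffer_low:             the resource is below its hysteresis threshold
    // sim_excluded_instances: bitmap of instances RR_SIM found starved by exclusions
    // non_excluded_instances: bitmap of instances a given project may use
    bool should_consider_project(
        bool buffer_low, int sim_excluded_instances, int non_excluded_instances
    ) {
        if (!buffer_low && !sim_excluded_instances) {
            return false;   // buffer is fine and nothing is starved: no fetch at all
        }
        if (!buffer_low) {
            // fetching only because of starvation: the project must be able
            // to run on at least one of the starved instances
            return (sim_excluded_instances & non_excluded_instances) != 0;
        }
        return true;        // buffer is low: normal work fetch rules apply
    }

A project chosen only because of starvation then has its request sized by set_request_excluded() (next hunk) instead of set_all_requests_hyst().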

@@ -442,6 +471,29 @@ void RSC_WORK_FETCH::set_request(PROJECT* p) {
     }
 }

+// We're fetching work because some instances are starved because
+// of exclusions.
+// See how many N of these instances are not excluded for this project.
+// Ask for N instances and for N*work_buf_min seconds.
+//
+void RSC_WORK_FETCH::set_request_excluded(PROJECT* p) {
+    RSC_PROJECT_WORK_FETCH& pwf = project_state(p);
+
+    int inst_mask = sim_excluded_instances & pwf.non_excluded_instances;
+    int n = 0;
+    for (int i=0; i<ninstances; i++) {
+        if ((1<<i) & inst_mask) {
+            n++;
+        }
+    }
+    req_instances = n;
+    if (p->resource_share == 0 || config.fetch_minimal_work) {
+        req_secs = 1;
+    } else {
+        req_secs = n*gstate.work_buf_total();
+    }
+}
+
 void RSC_WORK_FETCH::print_state(const char* name) {
     msg_printf(0, MSG_INFO,
         "[work_fetch] %s: shortfall %.2f nidle %.2f saturated %.2f busy %.2f",
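
A worked example (made-up numbers) of how set_request_excluded() sizes the request: intersect the starved-instance bitmap with the project's allowed-instance bitmap, count the bits, and ask for that many instances' worth of work:

    #include <cstdio>

    // Made-up numbers: 3 instances; instances 1 and 2 are starved (0x6),
    // and the project is allowed on instances 0 and 1 (0x3).
    int main() {
        int ninstances = 3;
        int sim_excluded_instances = 0x6;      // starved: bits 1 and 2
        int non_excluded_instances = 0x3;      // project may use: bits 0 and 1

        int inst_mask = sim_excluded_instances & non_excluded_instances;  // 0x2
        int n = 0;
        for (int i = 0; i < ninstances; i++) {
            if ((1 << i) & inst_mask) n++;     // count usable starved instances
        }
        double work_buf_total = 864;           // stand-in for gstate.work_buf_total()

        // Prints: req_instances 1 req_secs 864
        printf("req_instances %d req_secs %.0f\n", n, n * work_buf_total);
        return 0;
    }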

@@ -877,7 +929,7 @@ void WORK_FETCH::set_initial_work_request(PROJECT* p) {
         rsc_work_fetch[i].req_secs = 1;
         if (i) {
             RSC_WORK_FETCH& rwf = rsc_work_fetch[i];
-            if (rwf.ninstances == p->ncoprocs_excluded[i]) {
+            if (rwf.ninstances == p->rsc_pwf[i].ncoprocs_excluded) {
                 rsc_work_fetch[i].req_secs = 0;
             }
         }

@@ -100,6 +100,11 @@ struct RSC_PROJECT_WORK_FETCH {
     int n_runnable_jobs;
     double sim_nused;
     double nused_total;     // sum of instances over all runnable jobs
+    int ncoprocs_excluded;
+        // number of excluded instances
+    int non_excluded_instances;
+        // bitmap of non-excluded instances
+        // (i.e. instances this project's jobs can run on)
     int deadlines_missed;
     int deadlines_missed_copy;
         // copy of the above used during schedule_cpus()

@@ -116,6 +121,8 @@ struct RSC_PROJECT_WORK_FETCH {
         n_runnable_jobs = 0;
         sim_nused = 0;
         nused_total = 0;
+        ncoprocs_excluded = 0;
+        non_excluded_instances = 0;
         deadlines_missed = 0;
         deadlines_missed_copy = 0;
     }

@@ -201,6 +208,11 @@ struct RSC_WORK_FETCH {
         // seconds of idle instances between now and now+work_buf_total()
     double nidle_now;
     double sim_nused;
+    int sim_used_instances;
+        // bitmap of instances used in simulation,
+        // taking into account GPU exclusions
+    int sim_excluded_instances;
+        // bitmap of instances not used (i.e. starved because of exclusion)
     double total_fetchable_share;
         // total RS of projects from which we could fetch jobs for this device
     double saturated_time;

@@ -241,6 +253,7 @@ struct RSC_WORK_FETCH {
     void print_state(const char*);
     void clear_request();
     void set_request(PROJECT*);
+    void set_request_excluded(PROJECT*);
     bool may_have_work(PROJECT*);
     RSC_WORK_FETCH() {
         rsc_type = 0;

@@ -339,6 +339,7 @@ function log_flag_boxes() {
     return "
         <input type=checkbox name=cpu_sched_debug> CPU scheduling debug
         <br> <input type=checkbox name=rr_simulation> Round-robin simulation info
+        <br> <input type=checkbox name=rrsim_detail> Round-robin simulation details
         <br> <input type=checkbox name=work_fetch_debug> Work fetch debug
     ";
 }

@@ -440,6 +441,9 @@ function simulation_action() {
     if (post_str("rr_simulation", true)) {
         $x .= "<rr_simulation/>\n";
     }
+    if (post_str("rrsim_detail", true)) {
+        $x .= "<rrsim_detail/>\n";
+    }
     if (post_str("work_fetch_debug", true)) {
         $x .= "<work_fetch_debug/>\n";
     }

@@ -264,11 +264,11 @@ struct COPROC {
     COPROC() {
         clear();
     }
-    bool device_num_exists(int n) {
+    int device_num_index(int n) {
         for (int i=0; i<count; i++) {
-            if (device_nums[i] == n) return true;
+            if (device_nums[i] == n) return i;
         }
-        return false;
+        return -1;
     }
     void merge_opencl(
         std::vector<OPENCL_DEVICE_PROP> &opencls,