From 777f1f11e8cdf9f61113e138ef5b935dbb234dec Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Sat, 20 Oct 2012 00:43:44 -0700
Subject: [PATCH] - client: change work fetch policy to avoid starving GPUs
 in situations where GPU exclusions are used.
 - client: fix bug in round-robin simulation when GPU exclusions are used.

Note: this fixes a major problem (starvation)
with project-level GPU exclusion.
However, project-level GPU exclusion interferes with most of
the client's scheduling policies.
E.g., round-robin simulation doesn't take GPU exclusion into account,
and the resulting completion estimates and device shortfalls
can be wrong by an order of magnitude.

The only way I can see to fix this would be to model each
GPU instance as a separate resource,
and to associate each job with a particular GPU instance.
This would be a sweeping change in both client and server.
---
 checkin_notes         | 26 +++++++++++++++++
 client/log_flags.cpp  |  7 +++--
 client/project.cpp    |  2 +-
 client/project.h      |  2 --
 client/rr_sim.cpp     | 41 +++++++++++++++++++++++----
 client/work_fetch.cpp | 66 ++++++++++++++++++++++++++++++++++++++-----
 client/work_fetch.h   | 13 +++++++++
 doc/sim/sim_web.php   |  4 +++
 lib/coproc.h          |  6 ++--
 9 files changed, 147 insertions(+), 20 deletions(-)

diff --git a/checkin_notes b/checkin_notes
index c5799a7870..8378c30ae2 100644
--- a/checkin_notes
+++ b/checkin_notes
@@ -6274,3 +6274,29 @@ Rom 19 Oct 2012
         uc2_graphics.vcproj
         vboxwrapper.vcproj
         wrapper.vcproj
+
+David  19 Oct 2012
+    - client: change work fetch policy to avoid starving GPUs
+      in situations where GPU exclusions are used.
+    - client: fix bug in round-robin simulation when GPU exclusions are used.
+
+    Note: this fixes a major problem (starvation)
+    with project-level GPU exclusion.
+    However, project-level GPU exclusion interferes with most of
+    the client's scheduling policies.
+    E.g., round-robin simulation doesn't take GPU exclusion into account,
+    and the resulting completion estimates and device shortfalls
+    can be wrong by an order of magnitude.
+
+    The only way I can see to fix this would be to model each
+    GPU instance as a separate resource,
+    and to associate each job with a particular GPU instance.
+    This would be a sweeping change in both client and server.
+
+    client/
+        log_flags.cpp
+        project.cpp,h
+        rr_sim.cpp
+        work_fetch.cpp,h
+    lib/
+        coproc.h
diff --git a/client/log_flags.cpp b/client/log_flags.cpp
index b4ed996861..c548b2be16 100644
--- a/client/log_flags.cpp
+++ b/client/log_flags.cpp
@@ -531,6 +531,7 @@ void process_gpu_exclusions() {
         for (int k=1; krsc_pwf[k].non_excluded_instances = (1<master_url)) continue;
@@ -539,14 +540,16 @@ void process_gpu_exclusions() {
             if (eg.device_num >= 0) {
                 // exclusion may refer to nonexistent GPU
                 //
-                if (cp.device_num_exists(eg.device_num)) {
+                int ind = cp.device_num_index(eg.device_num);
+                if (ind >= 0) {
                     n++;
+                    p->rsc_pwf[k].non_excluded_instances &= ~(1<ncoprocs_excluded[k] = n;
+        p->rsc_pwf[k].ncoprocs_excluded = n;
     }
 }
diff --git a/client/project.cpp b/client/project.cpp
index 69f90d44ca..b86c8b5aee 100644
--- a/client/project.cpp
+++ b/client/project.cpp
@@ -419,7 +419,7 @@ int PROJECT::write_state(MIOFILE& out, bool gui_rpc) {
         if (no_rsc_pref[j]) {
             out.printf(" %s\n", rsc_name(j));
         }
-        if (j>0 && gui_rpc && (ncoprocs_excluded[j] == rsc_work_fetch[j].ninstances)) {
+        if (j>0 && gui_rpc && (rsc_pwf[j].ncoprocs_excluded == rsc_work_fetch[j].ninstances)) {
             out.printf(" %s\n", rsc_name(j));
         }
     }
diff --git a/client/project.h b/client/project.h
index 74dacb8805..ffc30e1af3 100644
--- a/client/project.h
+++ b/client/project.h
@@ -246,8 +246,6 @@ struct PROJECT : PROJ_AM {
         //
     double rr_sim_cpu_share;
     bool rr_sim_active;
-    int ncoprocs_excluded[MAX_RSC];
-        // number of excluded instances per processor type
     bool operator<(const PROJECT& p) {
         return sched_priority > p.sched_priority;
     }
diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp
index 552a8d7e90..4f0bf3adf3 100644
--- a/client/rr_sim.cpp
+++ b/client/rr_sim.cpp
@@ -74,6 +74,7 @@ struct RR_SIM {
         int rt = rp->avp->gpu_usage.rsc_type;
         if (rt) {
             rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage;
+            rsc_work_fetch[rt].sim_used_instances |= p->rsc_pwf[rt].non_excluded_instances;
             p->rsc_pwf[rt].sim_nused += rp->avp->gpu_usage.usage;
         }
     }
@@ -169,10 +170,14 @@ void RR_SIM::init_pending_lists() {
     }
 }
 
-// pick jobs to run; put them in "active" list.
+// Pick jobs to run, putting them in "active" list.
 // Simulate what the job scheduler would do:
 // pick a job from the project P with highest scheduling priority,
-// then adjust P's scheduling priority
+// then adjust P's scheduling priority.
+//
+// This is called at the start of the simulation,
+// and again each time a job finishes.
+// In the latter case, some resources may be saturated.
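
The bookkeeping added in process_gpu_exclusions() above boils down to two per-project values: a bitmap of the GPU instances the project may use, and a count of the instances it may not. The following is a minimal, self-contained sketch of that derivation in plain C++ -- not BOINC source; ProjectGpuState, apply_exclusions, and gpu_count are hypothetical stand-ins for RSC_PROJECT_WORK_FETCH and the per-project loop over config.exclude_gpus.

    // Minimal sketch, assuming a single GPU type with gpu_count instances.
    #include <cstdio>
    #include <vector>

    struct ProjectGpuState {          // stand-in for RSC_PROJECT_WORK_FETCH
        int non_excluded_instances;   // bitmap: bit i set => this project may use instance i
        int ncoprocs_excluded;        // how many instances are excluded for this project
    };

    ProjectGpuState apply_exclusions(int gpu_count, const std::vector<int>& excluded_instances) {
        ProjectGpuState s;
        s.non_excluded_instances = (1 << gpu_count) - 1;    // start with all instances usable
        s.ncoprocs_excluded = 0;
        for (size_t j = 0; j < excluded_instances.size(); j++) {
            int ind = excluded_instances[j];
            if (ind < 0 || ind >= gpu_count) continue;      // exclusion may refer to a nonexistent GPU
            if (s.non_excluded_instances & (1 << ind)) {
                s.non_excluded_instances &= ~(1 << ind);    // clear the bit for the excluded instance
                s.ncoprocs_excluded++;
            }
        }
        return s;
    }

    int main() {
        // three GPUs; this project is excluded from instance 1
        ProjectGpuState s = apply_exclusions(3, std::vector<int>(1, 1));
        printf("non_excluded_instances=0x%x ncoprocs_excluded=%d\n",
            s.non_excluded_instances, s.ncoprocs_excluded);  // prints 0x5 and 1
        return 0;
    }

Keeping the exclusions as a per-project bitmap is what lets the later checks in rr_sim.cpp and work_fetch.cpp be single AND/OR operations instead of rescans of the exclusion list.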
 //
 void RR_SIM::pick_jobs_to_run(double reltime) {
     active.clear();
@@ -241,7 +246,18 @@ void RR_SIM::pick_jobs_to_run(double reltime) {
         // check whether resource is saturated
         //
         if (rt) {
-            if (rsc_work_fetch[rt].sim_nused >= coprocs.coprocs[rt].count - p->ncoprocs_excluded[rt]) break;
+            if (rsc_work_fetch[rt].sim_nused >= coprocs.coprocs[rt].count) {
+                break;
+            }
+
+            // if a GPU isn't saturated but this project is using
+            // its max given exclusions, remove it from project heap
+            //
+            if (p->rsc_pwf[rt].sim_nused >= coprocs.coprocs[rt].count - p->rsc_pwf[rt].ncoprocs_excluded) {
+                pop_heap(project_heap.begin(), project_heap.end());
+                project_heap.pop_back();
+                continue;
+            }
         } else {
             if (rsc_work_fetch[rt].sim_nused >= gstate.ncpus) break;
         }
@@ -255,7 +271,7 @@ void RR_SIM::pick_jobs_to_run(double reltime) {
             pop_heap(project_heap.begin(), project_heap.end());
             project_heap.pop_back();
         } else if (!rp->rrsim_done) {
-            // Otherwise reshuffle the heap
+            // Otherwise reshuffle the project heap
             //
             make_heap(project_heap.begin(), project_heap.end());
         }
@@ -401,7 +417,9 @@ void RR_SIM::simulate() {
                 }
             }
         }
-        // adjust FLOPS left
+
+        // adjust FLOPS left of other active jobs
+        //
         for (unsigned int i=0; irrsim_flops_left -= rp->rrsim_flops*delta_t;
@@ -464,6 +482,19 @@ void RR_SIM::simulate() {
         sim_now += delta_t;
     }
 
+    // identify GPU instances starved because of exclusions
+    //
+    for (int i=1; i gstate.work_buf_min()) return NULL;
+        if (saturated_time > gstate.work_buf_min()) buffer_low = false;
     } else {
-        if (saturated_time > gstate.work_buf_total()) return NULL;
+        if (saturated_time > gstate.work_buf_total()) buffer_low = false;
     }
-    if (saturated_time > gstate.work_buf_total()) return NULL;
+
+    if (log_flags.work_fetch_debug) {
+        msg_printf(0, MSG_INFO,
+            "[work_fetch] buffer_low: %s; sim_excluded_instances %d\n",
+            buffer_low?"yes":"no", sim_excluded_instances
+        );
+    }
+
+    if (!buffer_low && !sim_excluded_instances) return NULL;
     for (unsigned i=0; incoprocs_excluded[rsc_type];
+        int n_not_excluded = ninstances - p->rsc_pwf[rsc_type].ncoprocs_excluded;
         if (n_not_excluded == 0) {
             continue;
         }
-        if (p->ncoprocs_excluded[rsc_type]
+        if (p->rsc_pwf[rsc_type].ncoprocs_excluded
             && p->rsc_pwf[rsc_type].n_runnable_jobs > n_not_excluded
         ) {
             continue;
@@ -283,6 +298,16 @@ PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {
         RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
         if (rpwf.anon_skip) continue;
+
+        // if we're sending work only because of exclusion starvation,
+        // make sure this project can use the starved instances
+        //
+        if (!buffer_low) {
+            if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) {
+                continue;
+            }
+        }
+
         if (pbest) {
             if (pbest->sched_priority > p->sched_priority) {
                 continue;
@@ -292,7 +317,11 @@ PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool strict) {
     }
     if (!pbest) return NULL;
     work_fetch.clear_request();
-    work_fetch.set_all_requests_hyst(pbest, rsc_type);
+    if (buffer_low) {
+        work_fetch.set_all_requests_hyst(pbest, rsc_type);
+    } else {
+        set_request_excluded(pbest);
+    }
     return pbest;
 }
@@ -442,6 +471,29 @@ void RSC_WORK_FETCH::set_request(PROJECT* p) {
     }
 }
 
+// We're fetching work because some instances are starved because
+// of exclusions.
+// See how many N of these instances are not excluded for this project.
+// Ask for N instances and for N*work_buf_min seconds.
+//
+void RSC_WORK_FETCH::set_request_excluded(PROJECT* p) {
+    RSC_PROJECT_WORK_FETCH& pwf = project_state(p);
+
+    int inst_mask = sim_excluded_instances & pwf.non_excluded_instances;
+    int n = 0;
+    for (int i=0; iresource_share == 0 || config.fetch_minimal_work) {
+        req_secs = 1;
+    } else {
+        req_secs = n*gstate.work_buf_total();
+    }
+}
+
 void RSC_WORK_FETCH::print_state(const char* name) {
     msg_printf(0, MSG_INFO,
         "[work_fetch] %s: shortfall %.2f nidle %.2f saturated %.2f busy %.2f",
@@ -877,7 +929,7 @@ void WORK_FETCH::set_initial_work_request(PROJECT* p) {
         rsc_work_fetch[i].req_secs = 1;
         if (i) {
             RSC_WORK_FETCH& rwf = rsc_work_fetch[i];
-            if (rwf.ninstances == p->ncoprocs_excluded[i]) {
+            if (rwf.ninstances == p->rsc_pwf[i].ncoprocs_excluded) {
                 rsc_work_fetch[i].req_secs = 0;
             }
         }
diff --git a/client/work_fetch.h b/client/work_fetch.h
index 975d14503c..4f1a1d371d 100644
--- a/client/work_fetch.h
+++ b/client/work_fetch.h
@@ -100,6 +100,11 @@ struct RSC_PROJECT_WORK_FETCH {
     int n_runnable_jobs;
     double sim_nused;
     double nused_total;     // sum of instances over all runnable jobs
+    int ncoprocs_excluded;
+        // number of excluded instances
+    int non_excluded_instances;
+        // bitmap of non-excluded instances
+        // (i.e. instances this project's jobs can run on)
     int deadlines_missed;
     int deadlines_missed_copy;
         // copy of the above used during schedule_cpus()
@@ -116,6 +121,8 @@ struct RSC_PROJECT_WORK_FETCH {
         n_runnable_jobs = 0;
         sim_nused = 0;
         nused_total = 0;
+        ncoprocs_excluded = 0;
+        non_excluded_instances = 0;
         deadlines_missed = 0;
         deadlines_missed_copy = 0;
     }
@@ -201,6 +208,11 @@ struct RSC_WORK_FETCH {
         // seconds of idle instances between now and now+work_buf_total()
     double nidle_now;
     double sim_nused;
+    int sim_used_instances;
+        // bitmap of instances used in simulation,
+        // taking into account GPU exclusions
+    int sim_excluded_instances;
+        // bitmap of instances not used (i.e. starved because of exclusion)
     double total_fetchable_share;
         // total RS of projects from which we could fetch jobs for this device
     double saturated_time;
@@ -241,6 +253,7 @@ struct RSC_WORK_FETCH {
     void print_state(const char*);
     void clear_request();
     void set_request(PROJECT*);
+    void set_request_excluded(PROJECT*);
     bool may_have_work(PROJECT*);
     RSC_WORK_FETCH() {
         rsc_type = 0;
diff --git a/doc/sim/sim_web.php b/doc/sim/sim_web.php
index 73d184ff3d..8a91e025d0 100644
--- a/doc/sim/sim_web.php
+++ b/doc/sim/sim_web.php
@@ -339,6 +339,7 @@ function log_flag_boxes() {
     return "
         CPU scheduling debug
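
Taken together, the work_fetch.cpp and work_fetch.h changes reduce "starved by exclusion" to bitmap arithmetic: the round-robin simulation records which instances could receive work (sim_used_instances), the complement is the starved set (sim_excluded_instances), choose_project_hyst() only keeps a project whose non_excluded_instances overlaps that set, and set_request_excluded() sizes the request from the overlap. Below is a compact, self-contained sketch of that arithmetic with assumed, simplified names (work_buf_secs stands in for gstate.work_buf_total()); it is an illustration, not the client's code.

    #include <cstdio>

    // Count how many of the starved instances this project may actually use,
    // and size a work request accordingly (n instances, n buffers worth of seconds).
    void request_for_starvation(
        int ninstances, int sim_used_instances, int project_non_excluded,
        double work_buf_secs, double& req_instances, double& req_secs
    ) {
        int all_instances = (1 << ninstances) - 1;
        int starved = all_instances & ~sim_used_instances;   // sim_excluded_instances
        int usable  = starved & project_non_excluded;        // starved instances this project can fill
        int n = 0;
        for (int i = 0; i < ninstances; i++) {
            if (usable & (1 << i)) n++;
        }
        req_instances = n;
        req_secs = n * work_buf_secs;
    }

    int main() {
        double req_inst, req_secs;
        // 4 GPUs; the simulation only placed work on instances 0 and 1;
        // this project is excluded from instance 0 only.
        request_for_starvation(4, 0x3, 0xE, 86400, req_inst, req_secs);
        printf("request %.0f instances, %.0f seconds\n", req_inst, req_secs);  // 2 and 172800
        return 0;
    }

If the overlap is empty the project is skipped entirely, which is exactly the gate added to choose_project_hyst() above.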
        Round-robin simulation info
+       Round-robin simulation details
        Work fetch debug
    ";
}
@@ -440,6 +441,9 @@ function simulation_action() {
     if (post_str("rr_simulation", true)) {
         $x .= "\n";
     }
+    if (post_str("rrsim_detail", true)) {
+        $x .= "\n";
+    }
     if (post_str("work_fetch_debug", true)) {
         $x .= "\n";
     }
diff --git a/lib/coproc.h b/lib/coproc.h
index 1b9095a759..5971dddeea 100644
--- a/lib/coproc.h
+++ b/lib/coproc.h
@@ -264,11 +264,11 @@ struct COPROC {
     COPROC() {
         clear();
     }
-    bool device_num_exists(int n) {
+    int device_num_index(int n) {
         for (int i=0; i &opencls,
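
The lib/coproc.h change replaces the boolean device_num_exists() with device_num_index(), which returns the instance index (or -1), so that process_gpu_exclusions() can clear the matching bit in a project's non_excluded_instances. A small sketch of that lookup and its use follows; Gpu is a hypothetical, simplified stand-in for COPROC, and the eight device slots are an arbitrary choice for the example.

    #include <cstdio>

    struct Gpu {                      // simplified stand-in for COPROC
        int count;                    // number of usable instances of this GPU type
        int device_nums[8];           // OS-level device number of each instance

        // Return the instance index for an OS device number, or -1 if absent.
        int device_num_index(int n) const {
            for (int i = 0; i < count; i++) {
                if (device_nums[i] == n) return i;
            }
            return -1;
        }
    };

    int main() {
        Gpu gpu;
        gpu.count = 2;
        gpu.device_nums[0] = 0;
        gpu.device_nums[1] = 2;       // e.g. device 1 was filtered out earlier

        int non_excluded_instances = (1 << gpu.count) - 1;
        int ind = gpu.device_num_index(2);          // an exclusion names OS device 2
        if (ind >= 0) {
            non_excluded_instances &= ~(1 << ind);  // clear the bit for instance 1
        }
        printf("index=%d mask=0x%x\n", ind, non_excluded_instances);  // index=1 mask=0x1
        return 0;
    }

Returning the index rather than a yes/no answer matters because OS device numbers and instance indices can differ once some devices have been filtered out.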