From 17e44af601da218df6da0d90f065029f86b9de65 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Tue, 11 Feb 2014 12:33:13 -0800
Subject: [PATCH] Client: fix job scheduling bug that could starve CPUs

Job scheduling has 2 phases:
  make_run_list(): build a sorted list of runnable jobs
  enforce_run_list(): go through the list and run jobs

The run list in general contains more jobs than can actually be run.
This is intentional. There are lots of reasons why enforce_run_list()
might not be able to run a particular job, and we don't know these
during make_run_list(). So we need to give enforce_run_list() a
surplus of choices.

The problem: make_run_list() was accounting the RAM usage of jobs in
the list, and stopping when this exceeded physical RAM. This led to a
situation where we added a bunch of GPU jobs to the list - more than
could actually be run - and this caused too few CPU jobs to be put in
the list.

Oddly, the comment at the start of cpu_sched.cpp said that RAM usage
was ignored by make_run_list(); this was not the case.

Anyway, I removed RAM accounting from make_run_list().
---
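
Notes: below is a minimal, compilable sketch of the two-phase pattern
described in the commit message. Everything in it (Job, the sort keys,
both function bodies) is a simplified stand-in for illustration, not
the client's actual RESULT/ACTIVE_TASK types or the real logic in
client/cpu_sched.cpp.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Job {                 // stand-in for the client's RESULT
        const char* name;
        bool uses_gpu;
        bool deadline_danger;
        double avg_ncpus;        // CPUs the job would occupy
    };

    // Phase 1: build an ordered run list. It deliberately contains
    // more jobs than can run; per-job feasibility (RAM, coproc
    // assignment, ...) is NOT checked here - that's phase 2's job.
    std::vector<Job> make_run_list(std::vector<Job> jobs) {
        std::sort(jobs.begin(), jobs.end(), [](const Job& a, const Job& b) {
            if (a.uses_gpu != b.uses_gpu) return a.uses_gpu;  // GPU jobs first
            return a.deadline_danger && !b.deadline_danger;   // then deadline danger
        });
        return jobs;
    }

    // Phase 2: walk the list in priority order and run what fits,
    // skipping (not aborting the scan on) jobs that can't run.
    std::vector<Job> enforce_run_list(const std::vector<Job>& run_list, double ncpus) {
        std::vector<Job> running;
        double used = 0;
        for (const Job& j : run_list) {
            if (used + j.avg_ncpus > ncpus) continue;  // can't run this one; try the next
            used += j.avg_ncpus;
            running.push_back(j);
        }
        return running;
    }

    int main() {
        std::vector<Job> jobs = {
            {"cpu1", false, false, 1}, {"gpu1", true, false, 0.5},
            {"cpu2", false, true, 1},  {"gpu2", true, false, 0.5},
        };
        // With 2 CPUs, the surplus list still fills the machine: the
        // GPU jobs and the deadline-danger CPU job run; cpu1 waits.
        for (const Job& j : enforce_run_list(make_run_list(jobs), 2)) {
            std::printf("running %s\n", j.name);
        }
    }

The point of the fix, in these terms: when phase 1 stopped early on a
global RAM total, the surplus could end up being all GPU jobs, leaving
phase 2 with no CPU jobs to choose from. Dropping RAM accounting from
phase 1 restores the surplus; per-job constraints such as RAM belong
in phase 2, which simply skips jobs it can't run.
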
 client/cpu_sched.cpp | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp
index e6164647b1..029ca32558 100644
--- a/client/cpu_sched.cpp
+++ b/client/cpu_sched.cpp
@@ -17,7 +17,7 @@
 // CPU scheduling logic.
 //
-// - create an ordered "run list" (schedule_cpus).
+// - create an ordered "run list" (make_run_list()).
 //   The ordering is roughly as follows:
 //   - GPU jobs first, then CPU jobs
 //   - for a given resource, jobs in deadline danger first
@@ -39,8 +39,8 @@
 // - sort the list according to "more_important()"
 // - shuffle the list to avoid starving multi-thread jobs
 //
-// - scan through the resulting list,
-//   running the jobs and preempting other jobs.
+// - scan through the resulting list, running the jobs and preempting
+//   other jobs (enforce_run_list).
 //   Don't run a job if
 //   - its GPUs can't be assigned (possible if need >1 GPU)
 //   - it's a multi-thread job, and CPU usage would be #CPUs+1 or more
@@ -99,7 +99,6 @@ struct PROC_RESOURCES {
     double ncpus_used_st;   // #CPUs of GPU or single-thread jobs
     double ncpus_used_mt;   // #CPUs of multi-thread jobs
     COPROCS pr_coprocs;
-    double ram_left;

     void init() {
         ncpus = gstate.ncpus;
@@ -107,7 +106,6 @@ struct PROC_RESOURCES {
         ncpus_used_mt = 0;
         pr_coprocs.clone(coprocs, false);
         pr_coprocs.clear_usage();
-        ram_left = gstate.available_ram();
         if (have_max_concurrent) {
             max_concurrent_init();
         }
@@ -116,7 +114,11 @@ struct PROC_RESOURCES {
     // should we stop scanning jobs?
     //
     inline bool stop_scan_cpu() {
-        return ncpus_used_st >= ncpus;
+        if (ncpus_used_st >= ncpus) return true;
+        if (ncpus_used_mt >= 2*ncpus) return true;
+        // kind of arbitrary, but need to have some limit
+        // in case there are only MT jobs, and lots of them
+        return false;
     }

     inline bool stop_scan_coproc(int rsc_type) {
@@ -131,7 +133,6 @@ struct PROC_RESOURCES {
     // (i.e add it to the runnable list; not actually run it)
     //
     bool can_schedule(RESULT* rp, ACTIVE_TASK* atp) {
-        double wss;
         if (max_concurrent_exceeded(rp)) return false;
         if (atp) {
             // don't schedule if something's pending
@@ -154,11 +155,7 @@ struct PROC_RESOURCES {
                 }
                 atp->needs_shmem = false;
             }
-            wss = atp->procinfo.working_set_size_smoothed;
-        } else {
-            wss = rp->avp->max_working_set_size;
         }
-        if (wss > ram_left) return false;
         if (rp->schedule_backoff > gstate.now) return false;
         if (rp->uses_coprocs()) {
             if (gpu_suspend_reason) return false;
@@ -211,13 +208,6 @@ struct PROC_RESOURCES {
         } else {
             ncpus_used_st += rp->avp->avg_ncpus;
         }
-        double wss;
-        if (atp) {
-            wss = atp->procinfo.working_set_size_smoothed;
-        } else {
-            wss = rp->avp->max_working_set_size;
-        }
-        ram_left -= wss;

         adjust_rec_sched(rp);
         max_concurrent_inc(rp);
@@ -1093,7 +1083,7 @@ void CLIENT_STATE::append_unfinished_time_slice(vector<RESULT*> &run_list) {
 //   That's the only kind of suspended GPU job.
 // CORPOC::usage[]: for each instance, its usage
 //
-// enforce_schedule() calls assign_coprocs(),
+// enforce_run_list() calls assign_coprocs(),
 // which assigns coproc instances to scheduled jobs,
 // and prunes jobs for which we can't make an assignment
 // (the job list is in order of decreasing priority)
@@ -1536,7 +1526,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
 #endif

     if (log_flags.cpu_sched_debug) {
-        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule(): start");
+        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list(): start");
         msg_printf(0, MSG_INFO, "[cpu_sched_debug] preliminary job list:");
         print_job_list(run_list);
     }
@@ -1901,7 +1891,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
         set_client_state_dirty("enforce_cpu_schedule");
     }
     if (log_flags.cpu_sched_debug) {
-        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule: end");
+        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list: end");
     }
     if (coproc_start_deferred) {
         if (log_flags.cpu_sched_debug) {
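
A note on the new stop_scan_cpu() cutoff above: the 2*ncpus bound on
multi-thread CPU usage is arbitrary (as the in-code comment says), but
without some bound a host with only MT jobs, and lots of them, would
keep growing the run list without limit. A standalone sketch of the
cutoff arithmetic, with plain parameters standing in for the
PROC_RESOURCES members:

    #include <cstdio>

    // Stop building the run list once single-thread jobs can fill the
    // CPUs, or once multi-thread jobs account for twice the CPU count.
    bool stop_scan_cpu(double ncpus, double ncpus_used_st, double ncpus_used_mt) {
        if (ncpus_used_st >= ncpus) return true;
        if (ncpus_used_mt >= 2 * ncpus) return true;
        return false;
    }

    int main() {
        // On a 4-CPU host with only MT jobs in the list:
        std::printf("%d\n", stop_scan_cpu(4, 0, 7));  // 0: keep scanning
        std::printf("%d\n", stop_scan_cpu(4, 0, 8));  // 1: stop (8 = 2*4)
    }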