Client: fix job scheduling bug that could starve CPUs

Job scheduling has 2 phases:
    make_run_list(): build a sorted list of runnable jobs
    enforce_run_list(): go through the list and run the jobs
The run list in general contains more jobs than can actually be run.
This is intentional.
There are lots of reasons why enforce_run_list() might not be able
to run a particular job, and we don't know these during make_run_list().
So we need to give enforce_run_list() a surplus of choices.

The problem: make_run_list() was accounting RAM usage of jobs in the list,
and stopping when this exceeded physical RAM.
This led to a situation where we added a bunch of GPU jobs to the list -
more than could actually be run -
and this caused too few CPU jobs to be put in the list.

Oddly, the comment at the start of cpu_sched.cpp said that RAM usage
was ignored by make_run_list(); this was not the case.

Anyway, I removed RAM accounting from make_run_list().
This commit is contained in:
David Anderson 2014-02-11 12:33:13 -08:00
parent 2a84d20eae
commit 17e44af601
1 changed file with 11 additions and 21 deletions

View File

@ -17,7 +17,7 @@
// CPU scheduling logic.
//
// - create an ordered "run list" (schedule_cpus).
// - create an ordered "run list" (make_run_list()).
// The ordering is roughly as follows:
// - GPU jobs first, then CPU jobs
// - for a given resource, jobs in deadline danger first
@ -39,8 +39,8 @@
// - sort the list according to "more_important()"
// - shuffle the list to avoid starving multi-thread jobs
//
// - scan through the resulting list,
// running the jobs and preempting other jobs.
// - scan through the resulting list, running the jobs and preempting
// other jobs (enforce_run_list).
// Don't run a job if
// - its GPUs can't be assigned (possible if need >1 GPU)
// - it's a multi-thread job, and CPU usage would be #CPUs+1 or more
@ -99,7 +99,6 @@ struct PROC_RESOURCES {
double ncpus_used_st; // #CPUs of GPU or single-thread jobs
double ncpus_used_mt; // #CPUs of multi-thread jobs
COPROCS pr_coprocs;
double ram_left;
void init() {
ncpus = gstate.ncpus;
@ -107,7 +106,6 @@ struct PROC_RESOURCES {
ncpus_used_mt = 0;
pr_coprocs.clone(coprocs, false);
pr_coprocs.clear_usage();
ram_left = gstate.available_ram();
if (have_max_concurrent) {
max_concurrent_init();
}
@ -116,7 +114,11 @@ struct PROC_RESOURCES {
// should we stop scanning jobs?
//
inline bool stop_scan_cpu() {
return ncpus_used_st >= ncpus;
if (ncpus_used_st >= ncpus) return true;
if (ncpus_used_mt >= 2*ncpus) return true;
// kind of arbitrary, but need to have some limit
// in case there are only MT jobs, and lots of them
return false;
}
inline bool stop_scan_coproc(int rsc_type) {
@ -131,7 +133,6 @@ struct PROC_RESOURCES {
// (i.e add it to the runnable list; not actually run it)
//
bool can_schedule(RESULT* rp, ACTIVE_TASK* atp) {
double wss;
if (max_concurrent_exceeded(rp)) return false;
if (atp) {
// don't schedule if something's pending
@ -154,11 +155,7 @@ struct PROC_RESOURCES {
}
atp->needs_shmem = false;
}
wss = atp->procinfo.working_set_size_smoothed;
} else {
wss = rp->avp->max_working_set_size;
}
if (wss > ram_left) return false;
if (rp->schedule_backoff > gstate.now) return false;
if (rp->uses_coprocs()) {
if (gpu_suspend_reason) return false;
@ -211,13 +208,6 @@ struct PROC_RESOURCES {
} else {
ncpus_used_st += rp->avp->avg_ncpus;
}
double wss;
if (atp) {
wss = atp->procinfo.working_set_size_smoothed;
} else {
wss = rp->avp->max_working_set_size;
}
ram_left -= wss;
adjust_rec_sched(rp);
max_concurrent_inc(rp);
@ -1093,7 +1083,7 @@ void CLIENT_STATE::append_unfinished_time_slice(vector<RESULT*> &run_list) {
// That's the only kind of suspended GPU job.
// COPROC::usage[]: for each instance, its usage
//
// enforce_schedule() calls assign_coprocs(),
// enforce_run_list() calls assign_coprocs(),
// which assigns coproc instances to scheduled jobs,
// and prunes jobs for which we can't make an assignment
// (the job list is in order of decreasing priority)
@ -1536,7 +1526,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
#endif
if (log_flags.cpu_sched_debug) {
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule(): start");
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list(): start");
msg_printf(0, MSG_INFO, "[cpu_sched_debug] preliminary job list:");
print_job_list(run_list);
}
@ -1901,7 +1891,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
set_client_state_dirty("enforce_cpu_schedule");
}
if (log_flags.cpu_sched_debug) {
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule: end");
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list: end");
}
if (coproc_start_deferred) {
if (log_flags.cpu_sched_debug) {