From ba6a479fb460cdd56467fa0766c4ccd83d7ea328 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Fri, 27 Jan 2006 22:55:05 +0000
Subject: [PATCH] CPU scheduler

svn path=/trunk/boinc/; revision=9336
---
 checkin_notes         |  11 +++++
 client/client_state.C |  10 ++++
 client/client_state.h |  13 +++--
 client/client_types.C |  32 ++++++++++++
 client/client_types.h |  15 ++++++
 client/cs_prefs.C     |   1 +
 client/cs_scheduler.C | 110 ++++++++++++++++++++++++++++++------------
 client/cs_statefile.C |   5 +-
 8 files changed, 163 insertions(+), 34 deletions(-)

diff --git a/checkin_notes b/checkin_notes
index 84a8f87f7d..a1be246f71 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -1095,3 +1095,14 @@ David 27 Jan 2006
 
     client/
         cs_apps.C
+
+David 27 Jan 2006
+    - core client: various scheduler fixes
+      (from John McLeod)
+
+    client/
+        client_state.C,h
+        client_types.C,h
+        cs_prefs.C
+        cs_scheduler.C
+        cs_statefile.C
diff --git a/client/client_state.C b/client/client_state.C
index 9960aa7ac2..7944711de7 100644
--- a/client/client_state.C
+++ b/client/client_state.C
@@ -125,6 +125,8 @@ CLIENT_STATE::CLIENT_STATE() {
     have_tentative_project = false;
     new_version_check_time = 0;
     detach_console = false;
+
+    rr_results_fail_count = rr_last_results_fail_count = 0;
 }
 
 #if 0
@@ -1445,5 +1447,13 @@ double calculate_exponential_backoff( int n, double MIN, double MAX) {
     return rand_range(MIN, rmax);
 }
 
+bool CLIENT_STATE::network_is_intermittent() const
+{
+    if (global_prefs.net_start_hour != global_prefs.net_end_hour) return true;
+    if (global_prefs.confirm_before_connecting) return true;
+    if ((now - network_last_unsuspended) < SECONDS_PER_DAY * 7) return true;
+    if ((now - network_last_unsuspended) < global_prefs.work_buf_min_days * SECONDS_PER_DAY) return true;
+    return false;
+}
 
 const char *BOINC_RCSID_e836980ee1 = "$Id$";
diff --git a/client/client_state.h b/client/client_state.h
index e1f34ab493..5be603b0d6 100644
--- a/client/client_state.h
+++ b/client/client_state.h
@@ -81,6 +81,7 @@ public:
     std::vector<APP_VERSION*> app_versions;
     std::vector<WORKUNIT*> workunits;
     std::vector<RESULT*> results;
+    std::vector<CPU> cpus;
 
     NET_XFER_SET* net_xfers;
     PERS_FILE_XFER_SET* pers_file_xfers;
@@ -150,6 +151,7 @@ public:
         // Don't do CPU.  See check_suspend_activities for logic
     bool network_suspended;
         // Don't do network.  See check_suspend_network for logic
+    double network_last_unsuspended;
     bool executing_as_daemon;
         // true if --daemon is on the commandline
         // this means we are running as a daemon on unix,
@@ -186,6 +188,8 @@ private:
     double total_cpu_time_this_period;
     bool work_fetch_no_new_work;
    bool cpu_earliest_deadline_first;
+    long rr_last_results_fail_count;
+    long rr_results_fail_count;
 
 // --------------- acct_mgr.C:
 public:
@@ -231,6 +235,7 @@ public:
     bool want_network();
     void network_available();
     bool no_gui_rpc;
+    bool network_id_intermittent() const;
 private:
     int link_app(PROJECT*, APP*);
     int link_file_info(PROJECT*, FILE_INFO*);
@@ -288,8 +293,8 @@ private:
     int choose_version_num(char*, SCHEDULER_REPLY&);
     int app_finished(ACTIVE_TASK&);
     void assign_results_to_projects();
-    bool schedule_largest_debt_project(double expected_pay_off);
-    bool schedule_earliest_deadline_result();
+    bool schedule_largest_debt_project(double expected_pay_off, int cpu_index);
+    bool schedule_earliest_deadline_result(int cpu_index);
     bool start_apps();
     bool schedule_cpus();
     bool handle_finished_apps();
@@ -338,11 +343,13 @@ private:
 // --------------- cs_scheduler.C:
 public:
     double work_needed_secs();
+    void force_reschedule_all_cpus();
     PROJECT* next_project_master_pending();
     PROJECT* next_project_need_work();
     int make_scheduler_request(PROJECT*);
     int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults);
     int compute_work_requests();
+    bool network_is_intermittent() const;
     SCHEDULER_OP* scheduler_op;
     void scale_duration_correction_factors(double);
 private:
@@ -359,7 +366,7 @@ private:
     bool no_work_for_a_cpu();
     int proj_min_results(PROJECT*, double);
     bool round_robin_misses_deadline(double, double);
-    bool rr_misses_deadline(double, double);
+    bool rr_misses_deadline(double, double, bool);
     bool edf_misses_deadline(double);
     void set_scheduler_modes();
     void generate_new_host_cpid();
diff --git a/client/client_types.C b/client/client_types.C
index 6c3927fe90..e3d1e61cfc 100644
--- a/client/client_types.C
+++ b/client/client_types.C
@@ -99,6 +99,7 @@ void PROJECT::init() {
     work_request = 0;
     work_request_urgency = WORK_FETCH_DONT_NEED;
     duration_correction_factor = 1;
+    deadline_problem_count = 0;
 }
 
 // parse project fields from client_state.xml
@@ -1476,6 +1477,18 @@ double RESULT::estimated_cpu_time_remaining() {
     return estimated_cpu_time();
 }
 
+double RESULT::computation_deadline() {
+    double compute_deadline = report_deadline -
+        (gstate.global_prefs.cpu_scheduling_period_minutes*60 +
+            // the number of seconds between task switches (some people are setting this to be a couple of days).
+        SECONDS_PER_DAY);   // one day of slack
+    if (::gstate.network_is_intermittent()) {
+        // only needed if the network connection is unreliable.
+        compute_deadline -= gstate.global_prefs.work_buf_min_days*SECONDS_PER_DAY;   // the number of seconds possible between INet connections
+    }
+    return compute_deadline;
+}
+
 // The given result has just completed successfully.
 // Update the correction factor used to predict
 // completion time for this project's results
@@ -1534,4 +1547,23 @@ FILE_INFO* RESULT::lookup_file_logical(const char* lname) {
     return 0;
 }
 
+CPU::CPU()
+:
+sched_last_time(0),
+must_schedule(true),    // a new one MUST be scheduled right now.
+result(NULL)
+{
+}
+
+CPU::~CPU()
+{
+    gstate.request_schedule_cpus("Reduction in CPUs");
+}
+
+void CPU::schedule_result(RESULT * r)
+{
+    result = r;
+    sched_last_time = gstate.now;
+}
+
 const char *BOINC_RCSID_b81ff9a584 = "$Id$";
diff --git a/client/client_types.h b/client/client_types.h
index cf543e005c..071553a523 100644
--- a/client/client_types.h
+++ b/client/client_types.h
@@ -241,6 +241,7 @@ public:
     // fields used by CPU scheduler and work fetch
     // everything from here on applies only to CPU intensive projects
 
+    int deadline_problem_count;
     bool contactable();
         // not suspended and not deferred and not no more work
     bool runnable();
@@ -399,6 +400,7 @@ struct RESULT {
     char name[256];
     char wu_name[256];
     double report_deadline;
+    double computation_deadline();
     std::vector<FILE_REF> output_files;
     bool ready_to_report;
         // we're ready to report this result to the server;
@@ -466,7 +468,20 @@ struct RESULT {
     bool already_selected;
         // used to keep cpu scheduler from scheduling a result twice
         // transient; used only within schedule_cpus()
+    bool deadline_problem;
+};
+struct CPU {
+public:
+    CPU();
+    ~CPU();
+
+    void schedule_result(RESULT * result);
+
+    double sched_last_time;
+    bool must_schedule;
+    RESULT *result;
+private:
 };
 
 #endif
diff --git a/client/cs_prefs.C b/client/cs_prefs.C
index 3ff005807c..32acf23375 100644
--- a/client/cs_prefs.C
+++ b/client/cs_prefs.C
@@ -237,6 +237,7 @@ int CLIENT_STATE::suspend_network(int reason) {
 
 int CLIENT_STATE::resume_network() {
     msg_printf(NULL, MSG_INFO, "Resuming network activity");
+    network_last_unsuspended = now;
     return 0;
 }
 
diff --git a/client/cs_scheduler.C b/client/cs_scheduler.C
index b2ff9bbb5c..fc6348bad8 100644
--- a/client/cs_scheduler.C
+++ b/client/cs_scheduler.C
@@ -442,7 +442,7 @@ PROJECT* CLIENT_STATE::find_project_with_overdue_results() {
         if (have_sporadic_connection) {
             return p;
         }
-        if (gstate.now > r->report_deadline - REPORT_DEADLINE_CUSHION) {
+        if (gstate.now > r->computation_deadline()) {
             return p;
         }
         if (gstate.now > r->completed_time + global_prefs.work_buf_min_days*SECONDS_PER_DAY) {
@@ -537,7 +537,9 @@ int CLIENT_STATE::compute_work_requests() {
         overall_work_fetch_urgency = WORK_FETCH_NEED_IMMEDIATELY;
     } else if (global_work_need > 0) {
         scope_messages.printf("compute_work_requests(): global work needed is greater than zero\n");
-        overall_work_fetch_urgency = WORK_FETCH_NEED;
+        // disconnected hosts need to keep the queue full at all times that they are connected.
+        // always connected hosts do not need to be as aggressive at keeping the queue full.
+        overall_work_fetch_urgency = network_is_intermittent() ? WORK_FETCH_NEED_IMMEDIATELY : WORK_FETCH_NEED;
     } else {
         overall_work_fetch_urgency = WORK_FETCH_OK;
     }
@@ -615,7 +617,7 @@ int CLIENT_STATE::compute_work_requests() {
             p->work_request = max(0.0,
                 //(2*work_min_period - estimated_time_to_starvation)
                 (work_min_period - estimated_time_to_starvation)
-                * ncpus
+                * avg_proc_rate()
                 * p->resource_share / prrs
             );
         } else if (overall_work_fetch_urgency > WORK_FETCH_OK) {
@@ -634,6 +636,13 @@ int CLIENT_STATE::compute_work_requests() {
         "CLIENT_STATE::compute_work_requests(): client work need: %f sec, urgency %d\n",
         global_work_need, overall_work_fetch_urgency
     );
+
+    // Make certain that there is only one project with a positive work request.
+    PROJECT * next_proj = next_project_need_work();
+    for (i = 0; i < projects.size(); ++i)
+    {
+        if (projects[i] != next_proj) projects[i]->work_request = 0.0;
+    }
 
     return 0;
 }
@@ -1052,8 +1061,8 @@ bool CLIENT_STATE::should_get_work() {
 
     // if the CPU started this time period overloaded,
     // let it process for a while to get out of the CPU overload state.
-    //
-    if (!work_fetch_no_new_work) {
+    // unless there is an override of some sort.
+    if (!work_fetch_no_new_work || must_schedule_cpus) {
         set_scheduler_modes();
     }
 
@@ -1102,13 +1111,14 @@ bool CLIENT_STATE::no_work_for_a_cpu() {
 
 // return true if round-robin scheduling will miss a deadline
 //
-bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
+bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs, bool current_work_only) {
     PROJECT* p, *pbest;
     RESULT* rp, *rpbest;
     vector<RESULT*> active;
     unsigned int i;
     double x;
     vector<RESULT*>::iterator it;
+    bool deadline_missed = false;
 
     SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_SCHED_CPU);
 
@@ -1121,12 +1131,16 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
         p->pending.clear();
     }
 
+    rr_results_fail_count = 0;
+
     for (i=0; i<results.size(); i++) {
         rp = results[i];
         if (rp->aborted_via_gui) continue;
         if (!rp->runnable()) continue;
         if (rp->aborted_via_gui) continue;
         if (rp->project->non_cpu_intensive) continue;
         rp->rrsim_cpu_left = rp->estimated_cpu_time_remaining();
+        rp->deadline_problem = false;
         p = rp->project;
         if (p->active.size() < (unsigned int)ncpus) {
             active.push_back(rp);
@@ -1160,12 +1174,17 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
 
         // "rpbest" is first result to finish.  Does it miss its deadline?
         //
-        double diff = sim_now + rpbest->rrsim_finish_delay - rpbest->report_deadline;
+        double diff = sim_now + rpbest->rrsim_finish_delay - rpbest->computation_deadline();
         if (diff > 0) {
-            scope_messages.printf(
-                "rr_sim: result %s misses deadline by %f\n", rpbest->name, diff
-            );
-            return true;
+            if (current_work_only) {
+                scope_messages.printf(
+                    "rr_sim: result %s misses deadline by %f\n", rpbest->name, diff
+                );
+                rpbest->deadline_problem = true;
+                deadline_missed = true;
+                ++rr_results_fail_count;
+            }
+            else return true;
         }
 
         // remove *rpbest from active set,
@@ -1219,8 +1238,8 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
         sim_now += rpbest->rrsim_finish_delay;
     }
 
-    scope_messages.printf( "rr_sim: deadlines met\n");
-    return false;
+    if (!deadline_missed) scope_messages.printf( "rr_sim: deadlines met\n");
+    return deadline_missed;
 }
 
 #if 0
@@ -1332,20 +1351,19 @@ bool CLIENT_STATE::edf_misses_deadline(double per_cpu_proc_rate) {
 // and print a message if we're changing their value
 //
 void CLIENT_STATE::set_scheduler_modes() {
-    RESULT* rp;
     unsigned int i;
     bool should_not_fetch_work = false;
-    bool use_earliest_deadline_first = false;
+    bool result_has_deadline_problem = false;
     double total_proc_rate = avg_proc_rate();
     double per_cpu_proc_rate = total_proc_rate/ncpus;
 
     SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_SCHED_CPU);
 
     double rrs = runnable_resource_share();
-    if (rr_misses_deadline(per_cpu_proc_rate, rrs)) {
+    if (rr_misses_deadline(per_cpu_proc_rate, rrs, true)) {
         // if round robin would miss a deadline, use EDF
         //
-        use_earliest_deadline_first = true;
+        result_has_deadline_problem = true;
        if (!no_work_for_a_cpu()) {
             should_not_fetch_work = true;
         }
@@ -1356,12 +1374,13 @@ void CLIENT_STATE::set_scheduler_modes() {
         PROJECT* p = next_project_need_work();
         if (p && !p->runnable()) {
             rrs += p->resource_share;
-            if (rr_misses_deadline(per_cpu_proc_rate, rrs)) {
+            if (rr_misses_deadline(per_cpu_proc_rate, rrs, false)) {
                 should_not_fetch_work = true;
             }
         }
     }
 
+#if 0
     for (i=0; i<results.size(); i++) {
         rp = results[i];
         if (rp->computing_done()) continue;
@@ -1370,7 +1389,8 @@ void CLIENT_STATE::set_scheduler_modes() {
         // Is the nearest deadline within a day?
         //
         if (rp->report_deadline - gstate.now < 60 * 60 * 24) {
-            use_earliest_deadline_first = true;
+            result_has_deadline_problem = true;
+            rp->deadline_problem = true;
             scope_messages.printf(
                 "set_scheduler_modes(): Less than 1 day until deadline.\n"
             );
@@ -1379,13 +1399,20 @@ void CLIENT_STATE::set_scheduler_modes() {
         // is there a deadline < twice the users connect period?
         //
         if (rp->report_deadline - gstate.now < global_prefs.work_buf_min_days * SECONDS_PER_DAY * 2) {
-            use_earliest_deadline_first = true;
+            result_has_deadline_problem = true;
+            rp->deadline_problem = true;
             scope_messages.printf(
                 "set_scheduler_modes(): Deadline is before reconnect time.\n"
             );
         }
     }
+#endif
+    for (i = 0; i < projects.size(); ++i) projects[i]->deadline_problem_count = 0;
+
+    for (i = 0; i < results.size(); ++i) {
+        if (results[i]->deadline_problem) ++results[i]->project->deadline_problem_count;
+    }
 
     // display only when the policy changes to avoid once per second
     //
     if (work_fetch_no_new_work && !should_not_fetch_work) {
@@ -1400,32 +1427,48 @@ void CLIENT_STATE::set_scheduler_modes() {
         msg_printf(NULL, MSG_INFO,
         );
     }
 
-    if (cpu_earliest_deadline_first && !use_earliest_deadline_first) {
+    if (cpu_earliest_deadline_first && !result_has_deadline_problem) {
         msg_printf(NULL, MSG_INFO,
             "Resuming round-robin CPU scheduling."
         );
     }
-    if (!cpu_earliest_deadline_first && use_earliest_deadline_first) {
+    if (!cpu_earliest_deadline_first && result_has_deadline_problem) {
         msg_printf(NULL, MSG_INFO,
-            "Using earliest-deadline-first scheduling because computer is overcommitted."
+            "Using critical-deadline-first scheduling because computer is overcommitted."
         );
     }
+    if (rr_results_fail_count > rr_last_results_fail_count) {
+        force_reschedule_all_cpus();
+    }
+    rr_last_results_fail_count = rr_results_fail_count;
+
 
     work_fetch_no_new_work = should_not_fetch_work;
-    cpu_earliest_deadline_first = use_earliest_deadline_first;
+    cpu_earliest_deadline_first = result_has_deadline_problem;
 }
 
 double CLIENT_STATE::work_needed_secs() {
-    double total_work = 0;
+    // Note that one CPDN result with 30 days remaining will not keep 4 CPUs busy today.
+    std::vector<double> work_for_cpu;
+    work_for_cpu.resize(ncpus);
+    for( int i = 0; i < ncpus; ++i) work_for_cpu[i] = 0;
     for( unsigned int i = 0; i < results.size(); ++i) {
         if (results[i]->project->non_cpu_intensive) continue;
-        total_work += results[i]->estimated_cpu_time_remaining();
+        int best_cpu = 0;
+        for (int j = 1; j < ncpus; ++j)
+        {
+            if (work_for_cpu[j] < work_for_cpu[best_cpu])
+                best_cpu = j;
+        }
+        work_for_cpu[best_cpu] += results[i]->estimated_cpu_time_remaining();
     }
-    double x = global_prefs.work_buf_min_days*SECONDS_PER_DAY*avg_proc_rate() - total_work;
-    if (x < 0) {
-        return 0;
+    double total_work_need = 0;
+    for (int i =0; i < ncpus; ++i) {
+        double x = global_prefs.work_buf_min_days*SECONDS_PER_DAY*avg_proc_rate()/ncpus - work_for_cpu[i];
+        if (x < 0) total_work_need += x;
     }
-    return x;
+    return total_work_need;
 }
 
 void CLIENT_STATE::scale_duration_correction_factors(double factor) {
@@ -1436,6 +1479,13 @@ void CLIENT_STATE::scale_duration_correction_factors(double factor) {
     }
 }
 
+void CLIENT_STATE::force_reschedule_all_cpus()
+{
+    for (std::vector<CPU>::iterator it = cpus.begin(); it != cpus.end(); ++it) {
+        (*it).must_schedule = true;
+    }
+}
+
 // Choose a new host CPID.
 // Do scheduler RPCs to all projects to propagate the CPID
 //
diff --git a/client/cs_statefile.C b/client/cs_statefile.C
index 4e762b56fe..a0548e9ed8 100644
--- a/client/cs_statefile.C
+++ b/client/cs_statefile.C
@@ -309,6 +309,7 @@ int CLIENT_STATE::parse_state_file() {
                 continue;
             } else if (parse_int(buf, "", user_network_request)) {
                 continue;
+            } else if (parse_double(buf, "", network_last_unsuspended)) {
             } else if (parse_int(buf, "", old_major_version)) {
             } else if (parse_int(buf, "", old_minor_version)) {
             } else if (parse_int(buf, "", old_release)) {
@@ -425,7 +426,8 @@ int CLIENT_STATE::write_state(MIOFILE& f) {
         "%d\n"
         "%d\n"
         "%d\n"
-        "%d\n"
+        "%f\n"
+        "%d\n"
         "%s"
         "%f\n",
         platform_name,
@@ -434,6 +436,7 @@ int CLIENT_STATE::write_state(MIOFILE& f) {
         core_client_release,
         user_run_request,
         user_network_request,
+        network_last_unsuspended,
         cpu_benchmarks_pending?"\n":"",
         new_version_check_time
     );
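
The deadline handling above boils down to one rule: a result should be finished one CPU-scheduling period plus a day of slack before its report deadline, and a full work buffer earlier than that when the host's network connection is intermittent (time-restricted, confirm-before-connect, or not recently up). The sketch below illustrates that calculation in isolation; it is not the client's code. The free functions and plain parameters are stand-ins for the GLOBAL_PREFS and CLIENT_STATE members used in the patch, and the numbers in main() are made up for illustration.

// Standalone sketch of the deadline-slack idea in this patch.
// All names and parameters here are hypothetical stand-ins, not the BOINC client API.
#include <cstdio>

static const double SECONDS_PER_DAY = 86400;

// Roughly mirrors the intent of CLIENT_STATE::network_is_intermittent():
// treat the connection as intermittent if it is time-restricted, needs user
// confirmation, or has not been continuously available for a while.
bool network_is_intermittent(
    double net_start_hour, double net_end_hour,
    bool confirm_before_connecting,
    double now, double network_last_unsuspended,
    double work_buf_min_days
) {
    if (net_start_hour != net_end_hour) return true;
    if (confirm_before_connecting) return true;
    if ((now - network_last_unsuspended) < SECONDS_PER_DAY * 7) return true;
    if ((now - network_last_unsuspended) < work_buf_min_days * SECONDS_PER_DAY) return true;
    return false;
}

// Roughly mirrors the intent of RESULT::computation_deadline(): back off from
// the report deadline by one scheduling period, one day of slack, and (for an
// unreliable connection) one full work buffer.
double computation_deadline(
    double report_deadline,
    double cpu_scheduling_period_minutes,
    double work_buf_min_days,
    bool intermittent_network
) {
    double d = report_deadline - (cpu_scheduling_period_minutes * 60 + SECONDS_PER_DAY);
    if (intermittent_network) {
        d -= work_buf_min_days * SECONDS_PER_DAY;
    }
    return d;
}

int main() {
    // Hypothetical host: 60-minute scheduling period, 2-day work buffer,
    // network last resumed 3 days ago, report deadline 10 days out.
    double now = 0;
    double last_unsuspended = now - 3 * SECONDS_PER_DAY;
    bool intermittent = network_is_intermittent(0, 0, false, now, last_unsuspended, 2.0);
    double report_deadline = now + 10 * SECONDS_PER_DAY;
    double compute_by = computation_deadline(report_deadline, 60, 2.0, intermittent);
    std::printf("intermittent=%d, finish %.1f days before the report deadline\n",
        intermittent, (report_deadline - compute_by) / SECONDS_PER_DAY);
    return 0;
}

With these example inputs the cushion comes out to roughly three days, which is why find_project_with_overdue_results() and rr_misses_deadline() in the patch now react well before the raw report deadline rather than using REPORT_DEADLINE_CUSHION.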