diff --git a/checkin_notes b/checkin_notes index 84a8f87f7d..a1be246f71 100755 --- a/checkin_notes +++ b/checkin_notes @@ -1095,3 +1095,14 @@ David 27 Jan 2006 client/ cs_apps.C + +David 27 Jan 2006 + - core client: various scheduler fixes + (from John McLeod) + + client/ + client_state.C,h + client_types.C,h + cs_prefs.C + cs_scheduler.C + cs_statefile.C diff --git a/client/client_state.C b/client/client_state.C index 9960aa7ac2..7944711de7 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -125,6 +125,8 @@ CLIENT_STATE::CLIENT_STATE() { have_tentative_project = false; new_version_check_time = 0; detach_console = false; + + rr_results_fail_count = rr_last_results_fail_count = 0; } #if 0 @@ -1445,5 +1447,13 @@ double calculate_exponential_backoff( int n, double MIN, double MAX) { return rand_range(MIN, rmax); } +bool CLIENT_STATE::network_is_intermittent() const +{ + if (global_prefs.net_start_hour != global_prefs.net_end_hour) return true; + if (global_prefs.confirm_before_connecting) return true; + if ((now - network_last_unsuspended) < SECONDS_PER_DAY * 7) return true; + if ((now - network_last_unsuspended) < global_prefs.work_buf_min_days * SECONDS_PER_DAY) return true; + return false; +} const char *BOINC_RCSID_e836980ee1 = "$Id$"; diff --git a/client/client_state.h b/client/client_state.h index e1f34ab493..5be603b0d6 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -81,6 +81,7 @@ public: std::vector app_versions; std::vector workunits; std::vector results; + std::vector cpus; NET_XFER_SET* net_xfers; PERS_FILE_XFER_SET* pers_file_xfers; @@ -150,6 +151,7 @@ public: // Don't do CPU. See check_suspend_activities for logic bool network_suspended; // Don't do network. See check_suspend_network for logic + double network_last_unsuspended; bool executing_as_daemon; // true if --daemon is on the commandline // this means we are running as a daemon on unix, @@ -186,6 +188,8 @@ private: double total_cpu_time_this_period; bool work_fetch_no_new_work; bool cpu_earliest_deadline_first; + long rr_last_results_fail_count; + long rr_results_fail_count; // --------------- acct_mgr.C: public: @@ -231,6 +235,7 @@ public: bool want_network(); void network_available(); bool no_gui_rpc; + bool network_id_intermittent() const; private: int link_app(PROJECT*, APP*); int link_file_info(PROJECT*, FILE_INFO*); @@ -288,8 +293,8 @@ private: int choose_version_num(char*, SCHEDULER_REPLY&); int app_finished(ACTIVE_TASK&); void assign_results_to_projects(); - bool schedule_largest_debt_project(double expected_pay_off); - bool schedule_earliest_deadline_result(); + bool schedule_largest_debt_project(double expected_pay_off, int cpu_index); + bool schedule_earliest_deadline_result(int cpu_index); bool start_apps(); bool schedule_cpus(); bool handle_finished_apps(); @@ -338,11 +343,13 @@ private: // --------------- cs_scheduler.C: public: double work_needed_secs(); + void force_reschedule_all_cpus(); PROJECT* next_project_master_pending(); PROJECT* next_project_need_work(); int make_scheduler_request(PROJECT*); int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults); int compute_work_requests(); + bool network_is_intermittent() const; SCHEDULER_OP* scheduler_op; void scale_duration_correction_factors(double); private: @@ -359,7 +366,7 @@ private: bool no_work_for_a_cpu(); int proj_min_results(PROJECT*, double); bool round_robin_misses_deadline(double, double); - bool rr_misses_deadline(double, double); + bool rr_misses_deadline(double, double, bool); bool edf_misses_deadline(double); void set_scheduler_modes(); void generate_new_host_cpid(); diff --git a/client/client_types.C b/client/client_types.C index 6c3927fe90..e3d1e61cfc 100644 --- a/client/client_types.C +++ b/client/client_types.C @@ -99,6 +99,7 @@ void PROJECT::init() { work_request = 0; work_request_urgency = WORK_FETCH_DONT_NEED; duration_correction_factor = 1; + deadline_problem_count = 0; } // parse project fields from client_state.xml @@ -1476,6 +1477,18 @@ double RESULT::estimated_cpu_time_remaining() { return estimated_cpu_time(); } +double RESULT::computation_deadline() { + double compute_deadline = report_deadline - + (gstate.global_prefs.cpu_scheduling_period_minutes*60 + // the number of seconds between task switches (some people are setting this to be a couple of days). + SECONDS_PER_DAY); // one day of slack + if (::gstate.network_is_intermittent()) { + // only needed if the network connection is unreliable. + compute_deadline -= gstate.global_prefs.work_buf_min_days*SECONDS_PER_DAY; // the number of seconds possible between INet connections + } + return compute_deadline; +} + + // The given result has just completed successfully. // Update the correction factor used to predict // completion time for this project's results @@ -1534,4 +1547,23 @@ FILE_INFO* RESULT::lookup_file_logical(const char* lname) { return 0; } +CPU::CPU() +: +sched_last_time(0), +must_schedule(true), // a new one MUST be scheduled right now. +result(NULL) +{ +} + +CPU::~CPU() +{ + gstate.request_schedule_cpus("Reduction in CPUs"); +} + +void CPU::schedule_result(RESULT * r) +{ + result = r; + sched_last_time = gstate.now; +} + const char *BOINC_RCSID_b81ff9a584 = "$Id$"; diff --git a/client/client_types.h b/client/client_types.h index cf543e005c..071553a523 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -241,6 +241,7 @@ public: // fields used by CPU scheduler and work fetch // everything from here on applies only to CPU intensive projects + int deadline_problem_count; bool contactable(); // not suspended and not deferred and not no more work bool runnable(); @@ -399,6 +400,7 @@ struct RESULT { char name[256]; char wu_name[256]; double report_deadline; + double computation_deadline(); std::vector output_files; bool ready_to_report; // we're ready to report this result to the server; @@ -466,7 +468,20 @@ struct RESULT { bool already_selected; // used to keep cpu scheduler from scheduling a result twice // transient; used only within schedule_cpus() + bool deadline_problem; +}; +struct CPU { +public: + CPU(); + ~CPU(); + + void schedule_result(RESULT * result); + + double sched_last_time; + bool must_schedule; + RESULT *result; +private: }; #endif diff --git a/client/cs_prefs.C b/client/cs_prefs.C index 3ff005807c..32acf23375 100644 --- a/client/cs_prefs.C +++ b/client/cs_prefs.C @@ -237,6 +237,7 @@ int CLIENT_STATE::suspend_network(int reason) { int CLIENT_STATE::resume_network() { msg_printf(NULL, MSG_INFO, "Resuming network activity"); + network_last_unsuspended = now; return 0; } diff --git a/client/cs_scheduler.C b/client/cs_scheduler.C index b2ff9bbb5c..fc6348bad8 100644 --- a/client/cs_scheduler.C +++ b/client/cs_scheduler.C @@ -442,7 +442,7 @@ PROJECT* CLIENT_STATE::find_project_with_overdue_results() { if (have_sporadic_connection) { return p; } - if (gstate.now > r->report_deadline - REPORT_DEADLINE_CUSHION) { + if (gstate.now > r->computation_deadline()) { return p; } if (gstate.now > r->completed_time + global_prefs.work_buf_min_days*SECONDS_PER_DAY) { @@ -537,7 +537,9 @@ int CLIENT_STATE::compute_work_requests() { overall_work_fetch_urgency = WORK_FETCH_NEED_IMMEDIATELY; } else if (global_work_need > 0) { scope_messages.printf("compute_work_requests(): global work needed is greater than zero\n"); - overall_work_fetch_urgency = WORK_FETCH_NEED; + // disconnected hosts need to keep the queue full at all times that they are connected. + // always connected hosts do not need to be as agressive at keeping the queue full. + overall_work_fetch_urgency = network_is_intermittent() ? WORK_FETCH_NEED_IMMEDIATELY : WORK_FETCH_NEED; } else { overall_work_fetch_urgency = WORK_FETCH_OK; } @@ -615,7 +617,7 @@ int CLIENT_STATE::compute_work_requests() { p->work_request = max(0.0, //(2*work_min_period - estimated_time_to_starvation) (work_min_period - estimated_time_to_starvation) - * ncpus + * avg_proc_rate() * p->resource_share / prrs ); } else if (overall_work_fetch_urgency > WORK_FETCH_OK) { @@ -634,6 +636,13 @@ int CLIENT_STATE::compute_work_requests() { "CLIENT_STATE::compute_work_requests(): client work need: %f sec, urgency %d\n", global_work_need, overall_work_fetch_urgency ); + + // Make certain that there is only one project with a positive work request. + PROJECT * next_proj = next_project_need_work(); + for (i = 0; i < projects.size(); ++i) + { + if (projects[i] != next_proj) projects[i]->work_request = 0.0; + } return 0; } @@ -1052,8 +1061,8 @@ bool CLIENT_STATE::should_get_work() { // if the CPU started this time period overloaded, // let it process for a while to get out of the CPU overload state. - // - if (!work_fetch_no_new_work) { + // unless there is an override of some sort. + if (!work_fetch_no_new_work || must_schedule_cpus) { set_scheduler_modes(); } @@ -1102,13 +1111,14 @@ bool CLIENT_STATE::no_work_for_a_cpu() { // return true if round-robin scheduling will miss a deadline // -bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { +bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs, bool current_work_only) { PROJECT* p, *pbest; RESULT* rp, *rpbest; vector active; unsigned int i; double x; vector::iterator it; + bool deadline_missed = false; SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_SCHED_CPU); @@ -1121,12 +1131,16 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { p->pending.clear(); } + rr_results_fail_count = 0; + for (i=0; iaborted_via_gui) continue; if (!rp->runnable()) continue; if (rp->aborted_via_gui) continue; if (rp->project->non_cpu_intensive) continue; rp->rrsim_cpu_left = rp->estimated_cpu_time_remaining(); + rp->deadline_problem = false; p = rp->project; if (p->active.size() < (unsigned int)ncpus) { active.push_back(rp); @@ -1160,12 +1174,17 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { // "rpbest" is first result to finish. Does it miss its deadline? // - double diff = sim_now + rpbest->rrsim_finish_delay - rpbest->report_deadline; + double diff = sim_now + rpbest->rrsim_finish_delay - rpbest->computation_deadline(); if (diff > 0) { - scope_messages.printf( - "rr_sim: result %s misses deadline by %f\n", rpbest->name, diff - ); - return true; + if (current_work_only) { + scope_messages.printf( + "rr_sim: result %s misses deadline by %f\n", rpbest->name, diff + ); + rpbest->deadline_problem = true; + deadline_missed = true; + ++rr_results_fail_count; + } + else return true; } // remove *rpbest from active set, @@ -1219,8 +1238,8 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { sim_now += rpbest->rrsim_finish_delay; } - scope_messages.printf( "rr_sim: deadlines met\n"); - return false; + if (!deadline_missed) scope_messages.printf( "rr_sim: deadlines met\n"); + return deadline_missed; } #if 0 @@ -1332,20 +1351,19 @@ bool CLIENT_STATE::edf_misses_deadline(double per_cpu_proc_rate) { // and print a message if we're changing their value // void CLIENT_STATE::set_scheduler_modes() { - RESULT* rp; unsigned int i; bool should_not_fetch_work = false; - bool use_earliest_deadline_first = false; + bool result_has_deadline_problem = false; double total_proc_rate = avg_proc_rate(); double per_cpu_proc_rate = total_proc_rate/ncpus; SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_SCHED_CPU); double rrs = runnable_resource_share(); - if (rr_misses_deadline(per_cpu_proc_rate, rrs)) { + if (rr_misses_deadline(per_cpu_proc_rate, rrs, true)) { // if round robin would miss a deadline, use EDF // - use_earliest_deadline_first = true; + result_has_deadline_problem = true; if (!no_work_for_a_cpu()) { should_not_fetch_work = true; } @@ -1356,12 +1374,13 @@ void CLIENT_STATE::set_scheduler_modes() { PROJECT* p = next_project_need_work(); if (p && !p->runnable()) { rrs += p->resource_share; - if (rr_misses_deadline(per_cpu_proc_rate, rrs)) { + if (rr_misses_deadline(per_cpu_proc_rate, rrs, false)) { should_not_fetch_work = true; } } } +#if 0 for (i=0; icomputing_done()) continue; @@ -1370,7 +1389,8 @@ void CLIENT_STATE::set_scheduler_modes() { // Is the nearest deadline within a day? // if (rp->report_deadline - gstate.now < 60 * 60 * 24) { - use_earliest_deadline_first = true; + result_has_deadline_problem = true; + rp->deadline_problem = true; scope_messages.printf( "set_scheduler_modes(): Less than 1 day until deadline.\n" ); @@ -1379,13 +1399,20 @@ void CLIENT_STATE::set_scheduler_modes() { // is there a deadline < twice the users connect period? // if (rp->report_deadline - gstate.now < global_prefs.work_buf_min_days * SECONDS_PER_DAY * 2) { - use_earliest_deadline_first = true; + result_has_deadline_problem = true; + rp->deadline_problem = true; scope_messages.printf( "set_scheduler_modes(): Deadline is before reconnect time.\n" ); } } +#endif + for (i = 0; i < projects.size(); ++i) projects[i]->deadline_problem_count = 0; + + for (i = 0; i < results.size(); ++i) { + if (results[i]->deadline_problem) ++results[i]->project->deadline_problem_count; + } // display only when the policy changes to avoid once per second // if (work_fetch_no_new_work && !should_not_fetch_work) { @@ -1400,32 +1427,48 @@ void CLIENT_STATE::set_scheduler_modes() { ); } - if (cpu_earliest_deadline_first && !use_earliest_deadline_first) { + if (cpu_earliest_deadline_first && !result_has_deadline_problem) { msg_printf(NULL, MSG_INFO, "Resuming round-robin CPU scheduling." ); } - if (!cpu_earliest_deadline_first && use_earliest_deadline_first) { + if (!cpu_earliest_deadline_first && result_has_deadline_problem) { msg_printf(NULL, MSG_INFO, - "Using earliest-deadline-first scheduling because computer is overcommitted." + "Using critical-deadline-first scheduling because computer is overcommitted." ); } + if (rr_results_fail_count > rr_last_results_fail_count) { + force_reschedule_all_cpus(); + } + rr_last_results_fail_count = rr_results_fail_count; + + work_fetch_no_new_work = should_not_fetch_work; - cpu_earliest_deadline_first = use_earliest_deadline_first; + cpu_earliest_deadline_first = result_has_deadline_problem; } double CLIENT_STATE::work_needed_secs() { - double total_work = 0; + // Note that one CPDN result with 30 days remaining will not keep 4 CPUs busy today. + std::vector work_for_cpu; + work_for_cpu.resize(ncpus); + for( int i = 0; i < ncpus; ++i) work_for_cpu[i] = 0; for( unsigned int i = 0; i < results.size(); ++i) { if (results[i]->project->non_cpu_intensive) continue; - total_work += results[i]->estimated_cpu_time_remaining(); + int best_cpu = 0; + for (int j = 1; j < ncpus; ++j) + { + if (work_for_cpu[j] < work_for_cpu[best_cpu]) + best_cpu = j; + } + work_for_cpu[best_cpu] += results[i]->estimated_cpu_time_remaining(); } - double x = global_prefs.work_buf_min_days*SECONDS_PER_DAY*avg_proc_rate() - total_work; - if (x < 0) { - return 0; + double total_work_need = 0; + for (int i =0; i < ncpus; ++i) { + double x = global_prefs.work_buf_min_days*SECONDS_PER_DAY*avg_proc_rate()/ncpus - work_for_cpu[i]; + if (x < 0) total_work_need += x; } - return x; + return total_work_need; } void CLIENT_STATE::scale_duration_correction_factors(double factor) { @@ -1436,6 +1479,13 @@ void CLIENT_STATE::scale_duration_correction_factors(double factor) { } } +void CLIENT_STATE::force_reschedule_all_cpus() +{ + for (std::vector::iterator it = cpus.begin(); it != cpus.end(); ++it) { + (*it).must_schedule = true; + } +} + // Choose a new host CPID. // Do scheduler RPCs to all projects to propagate the CPID // diff --git a/client/cs_statefile.C b/client/cs_statefile.C index 4e762b56fe..a0548e9ed8 100644 --- a/client/cs_statefile.C +++ b/client/cs_statefile.C @@ -309,6 +309,7 @@ int CLIENT_STATE::parse_state_file() { continue; } else if (parse_int(buf, "", user_network_request)) { continue; + } else if (parse_double(buf, "", network_last_unsuspended)) { } else if (parse_int(buf, "", old_major_version)) { } else if (parse_int(buf, "", old_minor_version)) { } else if (parse_int(buf, "", old_release)) { @@ -425,7 +426,8 @@ int CLIENT_STATE::write_state(MIOFILE& f) { "%d\n" "%d\n" "%d\n" - "%d\n" + "%f\n" + "%d\n" "%s" "%f\n", platform_name, @@ -434,6 +436,7 @@ int CLIENT_STATE::write_state(MIOFILE& f) { core_client_release, user_run_request, user_network_request, + network_last_unsuspended, cpu_benchmarks_pending?"\n":"", new_version_check_time );