diff --git a/checkin_notes b/checkin_notes index 7da3674e98..3ce4de52db 100755 --- a/checkin_notes +++ b/checkin_notes @@ -27123,3 +27123,15 @@ Rom 12 April 2005 BOINCTaskCtrl.cpp lib/ gui_rpc_client.C + +David 13 April 2005 + client/ + app_control.C + client_state.C,h + client_types.C,h + cs_apps.C + cs_scheduler.C + scheduler_op.C,h + lib/ + boinc_win.h + prefs.C,h diff --git a/client/app_control.C b/client/app_control.C index 9a2ea5c99c..9cc514432f 100644 --- a/client/app_control.C +++ b/client/app_control.C @@ -673,6 +673,7 @@ int ACTIVE_TASK_SET::abort_project(PROJECT* project) { task_iter++; } } + project->long_term_debt = 0; return 0; } diff --git a/client/client_state.C b/client/client_state.C index 12a8e6bb6b..4c5720477b 100644 Binary files a/client/client_state.C and b/client/client_state.C differ diff --git a/client/client_state.h b/client/client_state.h index dea5c67a80..2953409e4e 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -147,6 +147,8 @@ private: // double cpu_sched_last_time; double cpu_sched_work_done_this_period; + bool work_fetch_no_new_work; + bool cpu_crunch_nearest_first; // --------------- client_state.C: public: @@ -219,6 +221,7 @@ private: int app_finished(ACTIVE_TASK&); void assign_results_to_projects(); bool schedule_largest_debt_project(double expected_pay_off); + bool schedule_nearest_deadline_project(double expected_pay_off); bool start_apps(); bool schedule_cpus(double); bool handle_finished_apps(double); @@ -282,6 +285,9 @@ private: bool scheduler_rpc_poll(double); double ettprc(PROJECT*, int); double avg_proc_rate(PROJECT*); + bool should_get_work(); + bool no_work_for_a_cpu(); + void set_cpu_scheduler_modes(); // --------------- cs_statefile.C: public: diff --git a/client/client_types.C b/client/client_types.C index 6adaf425c0..739af87fd8 100644 --- a/client/client_types.C +++ b/client/client_types.C @@ -86,6 +86,7 @@ void PROJECT::init() { anonymous_platform = false; non_cpu_intensive = false; debt = 0; + long_term_debt = 0; send_file_list = false; suspended_via_gui = false; dont_request_more_work = false; @@ -176,6 +177,7 @@ int PROJECT::parse_state(MIOFILE& in) { else if (match_tag(buf, "")) deletion_policy_expire = true; #endif else if (parse_double(buf, "", debt)) continue; + else if (parse_double(buf, "", long_term_debt)) continue; else if (parse_double(buf, "", x)) continue; // not authoritative else scope_messages.printf("PROJECT::parse_state(): unrecognized: %s\n", buf); } @@ -221,6 +223,7 @@ int PROJECT::write_state(MIOFILE& out, bool gui_rpc) { " %d\n" " %f\n" " %f\n" + " %f\n" " %f\n" "%s%s%s%s%s%s", master_url, @@ -247,6 +250,7 @@ int PROJECT::write_state(MIOFILE& out, bool gui_rpc) { master_fetch_failures, min_rpc_time, debt, + long_term_debt, resource_share, master_url_fetch_pending?" \n":"", sched_rpc_pending?" \n":"", @@ -314,6 +318,7 @@ void PROJECT::copy_state_fields(PROJECT& p) { sched_rpc_pending = p.sched_rpc_pending; safe_strcpy(code_sign_key, p.code_sign_key); debt = p.debt; + long_term_debt = p.long_term_debt; send_file_list = p.send_file_list; non_cpu_intensive = p.non_cpu_intensive; suspended_via_gui = p.suspended_via_gui; @@ -657,7 +662,7 @@ int FILE_INFO::write_gui(MIOFILE& out) { " %f\n" " %f\n" " %d\n", - project->master_url, + project->master_url, project->project_name, name, nbytes, diff --git a/client/client_types.h b/client/client_types.h index 356660ff00..3d762fe42d 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -232,6 +232,7 @@ public: struct RESULT *next_runnable_result; // the next result to run for this project // the following used by work-fetch algorithm + double long_term_debt; // how much CPU time we owe this project in the long term (secs) double work_request; // the unit is "normalized CPU seconds", // i.e. the work should take 1 CPU on this host diff --git a/client/cs_apps.C b/client/cs_apps.C index 6082812d09..a3fb5c5fce 100644 --- a/client/cs_apps.C +++ b/client/cs_apps.C @@ -347,6 +347,36 @@ bool CLIENT_STATE::schedule_largest_debt_project(double expected_pay_off) { best_project->next_runnable_result = 0; return true; } +// The CPU scheduler is in panic mode. +// Schedule the active task with the earliest deadline +// Return true iff a task was scheduled. +// +bool CLIENT_STATE::schedule_nearest_deadline_project(double expected_pay_off) { + PROJECT *best_project = NULL; + RESULT *best_result = NULL; + double earliest_deadline; + bool first = true; + unsigned int i; + + for (i=0; i < results.size(); ++i) { + RESULT *r = results[i]; + if (RESULT_FILES_DOWNLOADED != r->state) continue; + if (r->project->non_cpu_intensive) continue; + if (r->already_selected) continue; + if (first || r->report_deadline < earliest_deadline) { + first = false; + best_project = r->project; + best_result = r; + earliest_deadline = r->report_deadline; + } + } + if (!best_result) return false; + + schedule_result(best_result); + best_project->anticipated_debt -= expected_pay_off; + best_project->next_runnable_result = 0; + return true; +} // Schedule active tasks to be run and preempted. // @@ -399,6 +429,8 @@ bool CLIENT_STATE::schedule_cpus(double now) { results[i]->already_selected = false; } + set_cpu_scheduler_modes(); + // do work accounting for active tasks // for (i=0; inon_cpu_intensive) continue; + count_cpu_intensive++; + double debt_inc = + (p->resource_share/local_total_resource_share) + * cpu_sched_work_done_this_period + - p->work_done_this_period; + p->long_term_debt += debt_inc; + total_long_term_debt += p->long_term_debt; if (!p->next_runnable_result) { p->debt = 0; p->anticipated_debt = 0; - } else { - p->debt += - (p->resource_share/local_total_resource_share) - * cpu_sched_work_done_this_period - - p->work_done_this_period; + } else { + p->debt += debt_inc; if (first) { first = false; min_debt = p->debt; @@ -452,6 +490,8 @@ bool CLIENT_STATE::schedule_cpus(double now) { ); } + double avg_long_term_debt = total_long_term_debt / count_cpu_intensive; + // Normalize debts to zero // for (i=0; idebt); p->next_runnable_result = NULL; } + p->long_term_debt -= avg_long_term_debt; } // schedule tasks for projects in order of decreasing anticipated debt @@ -477,7 +518,11 @@ bool CLIENT_STATE::schedule_cpus(double now) { expected_pay_off = cpu_sched_work_done_this_period / ncpus; for (j=0; j #include #include +#include +#include #endif #include "crypt.h" @@ -68,13 +70,17 @@ const int SECONDS_BEFORE_REPORTING_MIN_RPC_TIME_AGAIN = 60*60; // #define REPORT_DEADLINE_CUSHION SECONDS_PER_DAY +// try to maintain no more than this load factor on the CPU. +// +#define MAX_CPU_LOAD_FACTOR 0.8 + static int proj_min_results(PROJECT* p, int ncpus) { return (int)(ceil(ncpus*p->resource_share/trs)); } void PROJECT::set_min_rpc_time(double future_time) { - if (future_time > min_rpc_time) { - min_rpc_time = future_time; - } + if (future_time > min_rpc_time) { + min_rpc_time = future_time; + } min_report_min_rpc_time = 0; } @@ -139,7 +145,7 @@ PROJECT* CLIENT_STATE::next_project_sched_rpc_pending() { // 4) has dont_request_more_work == false // PROJECT* CLIENT_STATE::next_project_need_work(PROJECT *old) { - PROJECT *p; + PROJECT *p, *p_prospect = NULL; double now = dtime(); unsigned int i; bool found_old = (old == 0); @@ -153,11 +159,13 @@ PROJECT* CLIENT_STATE::next_project_need_work(PROJECT *old) { if (p->waiting_until_min_rpc_time(now)) continue; if (p->suspended_via_gui) continue; if (p->dont_request_more_work) continue; + if (p->long_term_debt < 0 && !no_work_for_a_cpu()) continue; + if (p_prospect && p->long_term_debt < p_prospect->long_term_debt && !p->non_cpu_intensive) continue; if (found_old && p->work_request > 0) { - return p; + p_prospect = p; } } - return 0; + return p_prospect; } // Write a scheduler request to a disk file @@ -194,22 +202,22 @@ int CLIENT_STATE::make_scheduler_request(PROJECT* p, double work_req) { p->authenticator, p->hostid, p->rpc_seqno, - p->anonymous_platform?"anonymous":platform_name, + p->anonymous_platform?"anonymous":platform_name, core_client_major_version, core_client_minor_version, work_req, p->resource_share / trs, ettprc(p, proj_min_results(p, ncpus)-1) ); - if (p->anonymous_platform) { - fprintf(f, " \n"); - for (i=0; iproject != p) continue; - avp->write(mf); - } - fprintf(f, " \n"); - } + if (p->anonymous_platform) { + fprintf(f, " \n"); + for (i=0; iproject != p) continue; + avp->write(mf); + } + fprintf(f, " \n"); + } #if 0 anything_free(free); fprintf(f, " %f\n", free); @@ -352,7 +360,7 @@ double CLIENT_STATE::avg_proc_rate(PROJECT *p) { double running_frac = time_stats.on_frac * time_stats.active_frac; if (running_frac < 0.1) running_frac = 0.1; if (running_frac > 1) running_frac = 1; - return (p->resource_share / trs) * ncpus * running_frac; + return (p ? (p->resource_share / trs) : 1) * ncpus * running_frac; } // "estimated time to project result count" @@ -411,6 +419,7 @@ int CLIENT_STATE::compute_work_requests() { if (p->min_rpc_time >= now) continue; if (p->dont_request_more_work) continue; if (p->suspended_via_gui) continue; + if (p->long_term_debt < 0 && !no_work_for_a_cpu()) continue; int min_results = proj_min_results(p, ncpus); double estimated_time_to_starvation = ettprc(p, min_results-1); @@ -471,7 +480,9 @@ bool CLIENT_STATE::scheduler_rpc_poll(double now) { switch(scheduler_op->state) { case SCHEDULER_OP_STATE_IDLE: if (network_suspended) break; - urgency = compute_work_requests(); + if (should_get_work()) { + urgency = compute_work_requests(); + } // highest priority is to report overdue results // @@ -486,14 +497,11 @@ bool CLIENT_STATE::scheduler_rpc_poll(double now) { global_prefs.work_buf_min_days ); } else if (urgency == NEED_WORK_IMMEDIATELY) { - msg_printf(NULL, MSG_INFO, - "Insufficient work; requesting more" - ); - } - scheduler_op->init_get_work(); +} + scheduler_op->init_get_work(false); action = true; } else if ((p=next_project_master_pending())) { - scheduler_op->init_get_work(); + scheduler_op->init_get_work(true); action = true; } else if ((p=next_project_sched_rpc_pending())) { scheduler_op->init_return_results(p); @@ -559,7 +567,7 @@ int CLIENT_STATE::handle_scheduler_reply( // see if we have a new venue from this project // - if (strlen(sr.host_venue) && strcmp(project->host_venue, sr.host_venue)) { + if (strlen(sr.host_venue) && strcmp(project->host_venue, sr.host_venue)) { safe_strcpy(project->host_venue, sr.host_venue); msg_printf(project, MSG_INFO, "New host venue: %s", sr.host_venue); update_project_prefs = true; @@ -596,10 +604,10 @@ int CLIENT_STATE::handle_scheduler_reply( sr.global_prefs_xml ); fclose(f); - update_global_prefs = true; - } + update_global_prefs = true; + } - if (update_global_prefs) { + if (update_global_prefs) { bool found_venue; retval = global_prefs.parse_file( GLOBAL_PREFS_FILE_NAME, project->host_venue, found_venue @@ -813,4 +821,142 @@ int CLIENT_STATE::handle_scheduler_reply( return 0; } +// Prevents work from being downloaded if there may be too much if more is downloaded. +bool CLIENT_STATE::should_get_work() { + // if there are fewer wus available then CPUS, then we need more work. + if (no_work_for_a_cpu()) return true; + + double tot_cpu_time_remaining = 0; + for (unsigned int i = 0; i < results.size();++i) { + tot_cpu_time_remaining += results[i]->estimated_cpu_time_remaining(); + } + if (tot_cpu_time_remaining < global_prefs.work_buf_min_days * SECONDS_PER_DAY) return true; + + // if the CPU started this time period over loaded, let it crunch for a while to get out + // of the CPU overload status. + if (!work_fetch_no_new_work) { + set_cpu_scheduler_modes(); + } + bool ret = !work_fetch_no_new_work; + + return ret; +} + +// CPU idle check. +bool CLIENT_STATE::no_work_for_a_cpu() { + return (unsigned int)ncpus > results.size(); +} + +// sets a couple of variables showing the needed state for the CPU scheduler. +void CLIENT_STATE::set_cpu_scheduler_modes() { + std::map < double, RESULT * > results_by_deadline; + std::set < PROJECT * > projects_with_work; + // cheap sorting trick. This works on every implementation of std::map that I know of, and + // it will be hard to avoid this behavour since inserts, lookups and deletes are all guaranteed lg(N) + std::vector::iterator it_u; + for (it_u = results.begin() ; it_u != results.end(); ++it_u) { + if (RESULT_COMPUTE_ERROR > (*it_u)->state && !(*it_u)->project->non_cpu_intensive) { + results_by_deadline[(*it_u)->report_deadline] = *it_u; + projects_with_work.insert((*it_u)->project); + } + } + + bool should_not_fetch_work = false; + bool should_crunch_nearest = false; + double now; + double frac_booked; + std::vector booked_to; + now = dtime(); + frac_booked = 0; + for (int i = 0; i < ncpus; ++i) booked_to.push_back(now); + + std::map::iterator it; + double up_frac = avg_proc_rate(0); + for (it = results_by_deadline.begin(); it != results_by_deadline.end() && !should_not_fetch_work; ++it) { + RESULT *r = (*it).second; + if (RESULT_COMPUTE_ERROR > ((*it).second)->state) { + double lowest_book = booked_to[0]; + int lowest_booked_cpu = 0; + for(int i = 1; i < ncpus; ++i) + { + if (booked_to[i] < lowest_book) { + lowest_book = booked_to[i]; + lowest_booked_cpu = i; + } + } + booked_to[lowest_booked_cpu] += ((*it).second)->estimated_cpu_time_remaining(); + + // Are the deadlines too tight to meet reliably? + if (booked_to[lowest_booked_cpu] - now > (r->report_deadline - now) * MAX_CPU_LOAD_FACTOR * up_frac) { + + should_not_fetch_work = true; + should_crunch_nearest = true; + if (!cpu_crunch_nearest_first || !work_fetch_no_new_work) { + msg_printf(NULL, MSG_INFO, + "Work fetch policy, CPU Scheduler policy - Overbooked."); + } + } + // Is the deadline soon? + if (r->report_deadline - now < 60 * 60 * 24) { + should_crunch_nearest = true; + if (!cpu_crunch_nearest_first) { + msg_printf(NULL, MSG_INFO, + "CPU Scheduler policy - Deadline < 1 day."); + } + } + + // is there a deadline < twice the users connect period? If so, we should crunch nearest so + // that it can be returned the next connection if possible. + if (r->report_deadline - now < global_prefs.work_buf_min_days * SECONDS_PER_DAY * 2) { + should_crunch_nearest = true; + if (!cpu_crunch_nearest_first) { + msg_printf(NULL, MSG_INFO, + "CPU Scheduler policy - deadline < 2 * queue size."); + } + } + + // is it getting a little uncomfortable? + frac_booked += r->estimated_cpu_time_remaining() / (r->report_deadline - now); + } + } + + // Is it getting a little uncomfortable? + if (frac_booked > MAX_CPU_LOAD_FACTOR * up_frac * ncpus) { + should_not_fetch_work = true; + if (!work_fetch_no_new_work) { + msg_printf(NULL, MSG_INFO, + "Work fetch policy - uncomfortable."); + } + } + + // check for too many projects that have work + if (projects_with_work.size() >= (unsigned int)global_prefs.max_projects_on_client) { + should_not_fetch_work = true; + if (!work_fetch_no_new_work) { + msg_printf(NULL, MSG_INFO, + "Work fetch policy - max projects exceeded."); + } + } + + if (work_fetch_no_new_work && !should_not_fetch_work) { // display only when the policy changes to avoid once per second + msg_printf(NULL, MSG_INFO, + "Work fetch policy - work fetch now allowed."); + } + if (!work_fetch_no_new_work && should_not_fetch_work) { // display only when the policy changes to avoid once per second + msg_printf(NULL, MSG_INFO, + "Work fetch policy - no work fetch allowed."); + } + if (cpu_crunch_nearest_first && !should_crunch_nearest) { // display only when the policy changes to avoid once per second + msg_printf(NULL, MSG_INFO, + "CPU scheduler policy - crunch highest debt first (normal mode)."); + } + if (!cpu_crunch_nearest_first && should_crunch_nearest) { // display only when the policy changes to avoid once per second + msg_printf(NULL, MSG_INFO, + "CPU scheduler policy - crunch earliest deadline first (panic mode)."); + } + + work_fetch_no_new_work = should_not_fetch_work; + cpu_crunch_nearest_first = should_crunch_nearest; +} + const char *BOINC_RCSID_d35a4a7711 = "$Id$"; diff --git a/client/scheduler_op.C b/client/scheduler_op.C index 2263cffb93..8caf5cc468 100644 --- a/client/scheduler_op.C +++ b/client/scheduler_op.C @@ -80,14 +80,14 @@ bool SCHEDULER_OP::check_master_fetch_start() { // PRECONDITION: compute_work_requests() has been called // to fill in PROJECT::work_request // -int SCHEDULER_OP::init_get_work() { +int SCHEDULER_OP::init_get_work(bool master_file_only) { int retval; char err_msg[256]; double ns; must_get_work = true; project = gstate.next_project_need_work(0); - if (project) { + if (project && !master_file_only) { ns = project->work_request; msg_printf(project, MSG_INFO, "Requesting %.2f seconds of work", ns); retval = init_op_project(ns); @@ -614,14 +614,14 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) { project->write_statistics_file(); - return 0; + return 0; } else if (parse_str(buf, "", project->project_name, sizeof(project->project_name))) continue; - else if (parse_str(buf, "", project->user_name, sizeof(project->user_name))) continue; + else if (parse_str(buf, "", project->user_name, sizeof(project->user_name))) continue; else if (parse_double(buf, "", project->user_total_credit)) continue; else if (parse_double(buf, "", project->user_expavg_credit)) continue; else if (parse_double(buf, "", project->user_create_time)) continue; - else if (parse_str(buf, "", project->team_name, sizeof(project->team_name))) continue; + else if (parse_str(buf, "", project->team_name, sizeof(project->team_name))) continue; else if (parse_int(buf, "", hostid)) continue; else if (parse_double(buf, "", project->host_total_credit)) continue; else if (parse_double(buf, "", project->host_expavg_credit)) continue; diff --git a/client/scheduler_op.h b/client/scheduler_op.h index 8978edd5ad..328e16d6bd 100644 --- a/client/scheduler_op.h +++ b/client/scheduler_op.h @@ -73,7 +73,7 @@ struct SCHEDULER_OP { SCHEDULER_OP(HTTP_OP_SET*); bool poll(); - int init_get_work(); + int init_get_work(bool master_file_only); int init_return_results(PROJECT*); int init_op_project(double ns); int init_master_fetch(); diff --git a/lib/boinc_win.h b/lib/boinc_win.h index edbd3e3983..da083e014d 100644 --- a/lib/boinc_win.h +++ b/lib/boinc_win.h @@ -115,6 +115,8 @@ #include #include #include +#include +#include #endif diff --git a/lib/prefs.C b/lib/prefs.C index 0adb75b97f..0bf0411785 100644 --- a/lib/prefs.C +++ b/lib/prefs.C @@ -51,6 +51,7 @@ void GLOBAL_PREFS::defaults() { dont_verify_images = false; work_buf_min_days = 0.1; max_cpus = 1; + max_projects_on_client = 5; cpu_scheduling_period_minutes = 60; disk_interval = 60; disk_max_used_gb = 1; @@ -175,6 +176,9 @@ int GLOBAL_PREFS::parse(FILE* in, const char* host_venue, bool& found_venue) { } else if (parse_int(buf, "", max_cpus)) { if (max_cpus < 1) max_cpus = 1; continue; + } else if (parse_int(buf, "", max_projects_on_client)) { + if (max_projects_on_client < 1) max_projects_on_client = 1; + continue; } else if (parse_double(buf, "", disk_interval)) { if (disk_interval<0) disk_interval = 0; continue; @@ -235,6 +239,7 @@ int GLOBAL_PREFS::write(FILE* f) { "%s%s%s%s%s%s" " %f\n" " %d\n" + " %d\n" " %f\n" " %f\n" " %f\n" @@ -258,6 +263,7 @@ int GLOBAL_PREFS::write(FILE* f) { dont_verify_images?" \n":"", work_buf_min_days, max_cpus, + max_projects_on_client, cpu_scheduling_period_minutes, disk_interval, disk_max_used_gb, diff --git a/lib/prefs.h b/lib/prefs.h index 3fd12e9c33..db9eebf90b 100644 --- a/lib/prefs.h +++ b/lib/prefs.h @@ -46,6 +46,7 @@ struct GLOBAL_PREFS { bool dont_verify_images; double work_buf_min_days; int max_cpus; + int max_projects_on_client; double cpu_scheduling_period_minutes; double disk_interval; double disk_max_used_gb;