diff --git a/checkin_notes b/checkin_notes index 4da0619add..09068cebcc 100755 --- a/checkin_notes +++ b/checkin_notes @@ -6339,3 +6339,11 @@ Janus 8 May 2005 payment_success.php (new) payment_process.php (new) +David 8 May 2005 + - work fetch and CPU scheduling fixes (from John McLeod) + + client/ + client_state.C,h + client_types.C,h + cs_scheduler.C + scheduler_op.C,h diff --git a/client/client_state.C b/client/client_state.C index ff0d0c6a9d..65faab9654 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -1319,7 +1319,7 @@ double CLIENT_STATE::total_resource_share() { double x = 0; for (i=0; i<projects.size(); i++) { - x += projects[i]->resource_share; + if (!projects[i]->non_cpu_intensive ) x += projects[i]->resource_share; } return x; } diff --git a/client/client_state.h b/client/client_state.h index c2bfc78cd1..2d0ea698d4 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -47,8 +47,9 @@ #define USER_RUN_REQUEST_NEVER 3 #define WORK_FETCH_DONT_NEED 0 -#define WORK_FETCH_NEED 1 -#define WORK_FETCH_NEED_IMMEDIATELY 2 +#define WORK_FETCH_OK 1 +#define WORK_FETCH_NEED 2 +#define WORK_FETCH_NEED_IMMEDIATELY 3 enum SUSPEND_REASON { SUSPEND_REASON_BATTERIES = 1, @@ -273,7 +274,7 @@ private: public: double work_needed_secs(); PROJECT* next_project_master_pending(); - PROJECT* next_project_need_work(PROJECT*); + PROJECT* next_project_need_work(PROJECT* old, int urgency); int make_scheduler_request(PROJECT*, double); int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults); int compute_work_requests(); @@ -287,9 +288,9 @@ private: bool scheduler_rpc_poll(double); double ettprc(PROJECT*, int); double avg_proc_rate(PROJECT*); - bool should_get_work(); + bool should_get_work(); bool no_work_for_a_cpu(); - void set_cpu_scheduler_modes(); + void set_cpu_scheduler_modes(); // --------------- cs_statefile.C: public: diff --git a/client/client_types.C b/client/client_types.C index 0cefee14ec..e235c5d8e4 100644 --- a/client/client_types.C +++ b/client/client_types.C @@ -1129,6
+1129,8 @@ void RESULT::clear() { app = NULL; wup = NULL; project = NULL; + high_priority = false; + return_result_immediately = false; } // parse a <result> element from scheduling server. @@ -1148,6 +1150,13 @@ int RESULT::parse_server(MIOFILE& in) { validate_time(report_deadline); continue; } + if (match_tag(buf, "<high_priority/>")) { high_priority = true; return_result_immediately = true; } + if (match_tag(buf, "<return_result_immediately/>")) { return_result_immediately = true; } if (match_tag(buf, "<file_ref>")) { file_ref.parse(in); output_files.push_back(file_ref); @@ -1257,6 +1266,8 @@ int RESULT::write(MIOFILE& out, bool to_server) { if (ready_to_report) out.printf(" <ready_to_report/>\n"); if (suspended_via_gui) out.printf(" <suspended_via_gui/>\n"); if (aborted_via_gui) out.printf(" <aborted_via_gui/>\n"); + if (high_priority) out.printf(" <high_priority/>\n"); + if (return_result_immediately) out.printf(" <return_result_immediately/>\n"); out.printf( " %s\n" " %f\n", @@ -1296,6 +1307,8 @@ int RESULT::write_gui(MIOFILE& out) { if (ready_to_report) out.printf(" <ready_to_report/>\n"); if (suspended_via_gui) out.printf(" <suspended_via_gui/>\n"); if (aborted_via_gui) out.printf(" <aborted_via_gui/>\n"); + if (high_priority) out.printf(" <high_priority/>\n"); + if (return_result_immediately) out.printf(" <return_result_immediately/>\n"); ACTIVE_TASK* atp = gstate.active_tasks.lookup_result(this); if (atp) { atp->write(out); diff --git a/client/client_types.h b/client/client_types.h index 06d7561d8d..6f52e07424 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -231,7 +231,7 @@ public: double work_done_this_period; // how much CPU time has been devoted to this // project in the current period (secs) struct RESULT *next_runnable_result; // the next result to run for this project - + // the following used by work-fetch algorithm double long_term_debt; // how much CPU time we owe this project in the long term (secs) @@ -365,6 +365,13 @@ struct RESULT { bool already_selected; // used to keep cpu scheduler from scheduling a result twice // transient; used only within schedule_cpus() + bool high_priority; // this will override the normal CPU handler so that this WU will run to completion + //
before any other normal work is run. It will not override the panic mode scheduler. + // this can be set by the server supplying the WU. This was strongly requested by + // Folding@Home. + bool return_result_immediately; // this is what it says. It can be set by the project supplying the WU. + // It will be set to true by the parsing code if high_priority is set, even + // if it is not set by the calling server. void clear(); int parse_server(MIOFILE&); int parse_state(MIOFILE&); diff --git a/client/cs_scheduler.C b/client/cs_scheduler.C index d2542329df..2f4beabbdf 100644 --- a/client/cs_scheduler.C +++ b/client/cs_scheduler.C @@ -75,7 +75,7 @@ const int SECONDS_BEFORE_REPORTING_MIN_RPC_TIME_AGAIN = 60*60; #define MAX_CPU_LOAD_FACTOR 0.8 static int proj_min_results(PROJECT* p, int ncpus) { - return (int)(ceil(ncpus*p->resource_share/trs)); + return p->non_cpu_intensive ? 1 : (int)(ceil(ncpus*p->resource_share/trs)); } void PROJECT::set_min_rpc_time(double future_time) { if (future_time > min_rpc_time) { @@ -144,7 +144,7 @@ PROJECT* CLIENT_STATE::next_project_sched_rpc_pending() { // 3) has master_url_fetch_pending == false // 4) has dont_request_more_work == false // -PROJECT* CLIENT_STATE::next_project_need_work(PROJECT* old) { +PROJECT* CLIENT_STATE::next_project_need_work(PROJECT* old, int urgency) { PROJECT *p, *p_prospect = NULL; double work_on_prospect; double now = dtime(); @@ -163,7 +163,18 @@ PROJECT* CLIENT_STATE::next_project_need_work(PROJECT* old) { if (p->suspended_via_gui) continue; if (p->dont_request_more_work) continue; if (p->long_term_debt < 0 && !cpu_idle) continue; - if (p->non_cpu_intensive) continue; + if (p->work_request_urgency == WORK_FETCH_DONT_NEED) continue; + + // if we don't really need work, and we don't really need work from this project, pass. 
+ if (urgency <= WORK_FETCH_OK && p->work_request_urgency <= WORK_FETCH_OK) continue; + + // if there is a project for which a work request is OK and one that has a higher priority, take the one + // with the higher priority. + if (p_prospect && p->work_request_urgency == WORK_FETCH_OK && + p_prospect->work_request_urgency > p->work_request_urgency) { + + continue; + } double work_on_current = ettprc(p, 0); if (p_prospect @@ -172,17 +183,13 @@ PROJECT* CLIENT_STATE::next_project_need_work(PROJECT* old) { ) { continue; } - if (p->work_request_urgency == WORK_FETCH_DONT_NEED - && (!cpu_idle || p->non_cpu_intensive) - ) { - continue; - } - if (found_old && p->work_request > 0) { + if (p->work_request > 0) { p_prospect = p; work_on_prospect = work_on_current; } } + if (p_prospect && !(p_prospect->work_request > 0.0)) p_prospect->work_request = 1.0; return p_prospect; } @@ -405,7 +412,12 @@ double CLIENT_STATE::ettprc(PROJECT *p, int k) { --num_results_to_skip; continue; } - est += rp->estimated_cpu_time_remaining(); + if (rp->project->non_cpu_intensive) { + // if it is a non_cpu intensive project, it needs only one at a time. 
+ est = max(rp->estimated_cpu_time_remaining(), global_prefs.work_buf_min_days * SECONDS_PER_DAY); + } else { + est += rp->estimated_cpu_time_remaining(); + } } double apr = avg_proc_rate(p); return est/apr; @@ -418,10 +430,36 @@ double CLIENT_STATE::ettprc(PROJECT *p, int k) { // int CLIENT_STATE::compute_work_requests() { int urgency = WORK_FETCH_DONT_NEED; + int highest_project_urgency = WORK_FETCH_DONT_NEED; unsigned int i; double work_min_period = global_prefs.work_buf_min_days * SECONDS_PER_DAY; double now = dtime(); + double global_work_need = work_needed_secs(); + for (i = 0; i < projects.size(); ++i) { + projects[i]->work_request_urgency = WORK_FETCH_DONT_NEED; + projects[i]->work_request = 0; + } + + + if (!should_get_work()) { + return WORK_FETCH_DONT_NEED; + } else if (no_work_for_a_cpu()) { + urgency = WORK_FETCH_NEED_IMMEDIATELY; + } else if (global_work_need > 0) { + urgency = WORK_FETCH_NEED; + } else { + urgency = WORK_FETCH_OK; + } + + double max_fetch = work_min_period; + // it is possible to have a work fetch policy of no new work and also have + // a CPU idle or not enough to fill the cache. In this case, we get work, but in little tiny increments + // as we are already in trouble and we need to minimize the damage. 
+ if (this->work_fetch_no_new_work) { + max_fetch = 1.0; + } + trs = total_resource_share(); // for each project, compute @@ -435,10 +473,11 @@ int CLIENT_STATE::compute_work_requests() { p->work_request = 0; p->work_request_urgency = WORK_FETCH_DONT_NEED; + if (p->master_url_fetch_pending) continue; if (p->min_rpc_time >= now) continue; if (p->dont_request_more_work) continue; if (p->suspended_via_gui) continue; - if (p->long_term_debt < 0 && !no_work_for_a_cpu()) continue; + if ((p->long_term_debt < 0) && (urgency != WORK_FETCH_NEED_IMMEDIATELY)) continue; int min_results = proj_min_results(p, ncpus); double estimated_time_to_starvation = ettprc(p, min_results-1); @@ -450,7 +489,6 @@ #if DEBUG_SCHED msg_printf(p, MSG_INFO, "is starved"); #endif - urgency = WORK_FETCH_NEED_IMMEDIATELY; p->work_request_urgency = WORK_FETCH_NEED_IMMEDIATELY; } else { #if DEBUG_SCHED msg_printf(p, MSG_INFO, "time to starvation %f", estimated_time_to_starvation ); #endif - urgency = max(WORK_FETCH_NEED, urgency); - urgency = WORK_FETCH_NEED; + p->work_request_urgency = WORK_FETCH_NEED; } + } else if (WORK_FETCH_OK < urgency) { + p->work_request_urgency = WORK_FETCH_OK; + p->work_request = global_work_need; } + highest_project_urgency = max(highest_project_urgency, p->work_request_urgency); + // determine work requests for each project // NOTE: don't need to divide by active_frac etc.; // the scheduler does that (see sched/sched_send.C) @@ -477,12 +519,6 @@ #endif } - if (urgency == WORK_FETCH_DONT_NEED) { - for (i=0; i<projects.size(); i++) { - projects[i]->work_request = 0; - } - } - return urgency; } @@ -522,10 +558,10 @@ bool CLIENT_STATE::scheduler_rpc_poll(double now) { "Insufficient work; requesting more" ); } - scheduler_op->init_get_work(false); + scheduler_op->init_get_work(false, urgency); action = true; } else if ((p=next_project_master_pending())) { - scheduler_op->init_get_work(true); +
scheduler_op->init_get_work(true, urgency); action = true; } else if ((p=next_project_sched_rpc_pending())) { scheduler_op->init_return_results(p); @@ -1012,4 +1048,18 @@ void CLIENT_STATE::set_cpu_scheduler_modes() { cpu_earliest_deadline_first = use_earliest_deadline_first; } +double CLIENT_STATE::work_needed_secs() +{ + double total_work = 0; + for( unsigned int i = 0; i < results.size(); ++i) { + if (results[i]->project->non_cpu_intensive) continue; + total_work += results[i]->estimated_cpu_time_remaining(); + } + if (total_work > global_prefs.work_buf_min_days) { + return 0; + } else { + return global_prefs.work_buf_min_days - total_work; + } +} + const char *BOINC_RCSID_d35a4a7711 = "$Id$"; diff --git a/client/scheduler_op.C b/client/scheduler_op.C index f7e66c7c14..b6b4d8d0ad 100644 --- a/client/scheduler_op.C +++ b/client/scheduler_op.C @@ -80,13 +80,13 @@ bool SCHEDULER_OP::check_master_fetch_start() { // PRECONDITION: compute_work_requests() has been called // to fill in PROJECT::work_request // -int SCHEDULER_OP::init_get_work(bool master_file_only) { +int SCHEDULER_OP::init_get_work(bool master_file_only, int urgency) { int retval; char err_msg[256]; double ns; must_get_work = true; - project = gstate.next_project_need_work(0); + project = gstate.next_project_need_work(0, urgency); if (project && !master_file_only) { ns = project->work_request; msg_printf(project, MSG_INFO, "Requesting %.2f seconds of work", ns); @@ -209,6 +209,7 @@ void SCHEDULER_OP::backoff(PROJECT* p, const char *error_msg ) { p->nrpc_failures++; } set_min_rpc_time(p); + p->long_term_debt -= (p->min_rpc_time - dtime()) / gstate.global_prefs.max_projects_on_client; } // low-level routine to initiate an RPC @@ -448,10 +449,13 @@ bool SCHEDULER_OP::poll() { backoff(project, "No schedulers responded"); if (must_get_work) { int urgency = gstate.compute_work_requests(); - project = gstate.next_project_need_work(project); - if (project && urgency != WORK_FETCH_DONT_NEED) { - retval = 
init_op_project(project->work_request); - } else { + if (urgency != WORK_FETCH_DONT_NEED) { + project = gstate.next_project_need_work(project, urgency); + if (project) { + retval = init_op_project(project->work_request); + } else { + scheduler_op_done = true; + } scheduler_op_done = true; } } else { @@ -512,7 +516,7 @@ bool SCHEDULER_OP::poll() { if (must_get_work) { int urgency = gstate.compute_work_requests(); if (urgency != WORK_FETCH_DONT_NEED) { - project = gstate.next_project_need_work(project); + project = gstate.next_project_need_work(project, urgency); if (project) { retval = init_op_project(project->work_request); } else { diff --git a/client/scheduler_op.h b/client/scheduler_op.h index 328e16d6bd..18dea4f880 100644 --- a/client/scheduler_op.h +++ b/client/scheduler_op.h @@ -73,7 +73,7 @@ struct SCHEDULER_OP { SCHEDULER_OP(HTTP_OP_SET*); bool poll(); - int init_get_work(bool master_file_only); + int init_get_work(bool master_file_only, int urgency); int init_return_results(PROJECT*); int init_op_project(double ns); int init_master_fetch();