From a37403a67313529089b413b8a4f5082f9809c0b8 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 10 May 2007 21:50:52 +0000 Subject: [PATCH] - scheduler: add workload_sim config option. If set, the scheduler will use EDF simulation, together with the in-progress workload reported by the client, to avoid sending results that 1) will miss their deadline, or 2) will cause an in-progress result to miss its deadline, or 3) will make an in-progress result miss its deadline by more than is already predicted. If this option is not set, or if the client request doesn't include a workload description (i.e. the client is old) use the existing approach, which assumes there's no workload. NOTE: this is experimental. Production projects should not use it. - EDF sim: write debug stuff to stderr instead of stdout - Account manager: - if an account is detach_when_done, set dont_request_more_work - check dont_request_more_work even for first-time projects - update_uotd: generate a file for use by Google gadget - user_links(): use full URLs (so can use in Google gadget) client/ acct_mgr.C work_fetch.C html/ inc/ uotd.inc util.inc user/ uotd_gadget.php (new) sched/ Makefile.am edf_sim.C sched_config.C,h sched_resend.C sched_send.C,h server_types.C,h svn path=/trunk/boinc/; revision=12639 --- checkin_notes | 36 +++++++++++++++++++++++++++++++++ client/acct_mgr.C | 6 ++++++ client/work_fetch.C | 2 +- html/inc/uotd.inc | 13 +++++++----- html/inc/util.inc | 5 +++-- html/user/uotd_gadget.php | 23 +++++++++++++++++++++ sched/Makefile.am | 1 + sched/edf_sim.C | 35 +++++++++++++++++++++----------- sched/sched_config.C | 1 + sched/sched_config.h | 24 ++++++++++++++++------ sched/sched_resend.C | 18 ++++++++++------- sched/sched_send.C | 42 +++++++++++++++++++++++++++++---------- sched/sched_send.h | 1 + sched/server_types.C | 17 ++++++++++++++-- sched/server_types.h | 1 + 15 files changed, 180 insertions(+), 45 deletions(-) create mode 100644 html/user/uotd_gadget.php diff --git a/checkin_notes 
b/checkin_notes index 2217b047b1..60c7f41154 100755 --- a/checkin_notes +++ b/checkin_notes @@ -4739,3 +4739,39 @@ David 10 May 2007 win_build/ sim.vcproj +David 10 May 2007 + - scheduler: add workload_sim config option. + If set, the scheduler will use EDF simulation, + together with the in-progress workload reported by the client, + to avoid sending results that + 1) will miss their deadline, or + 2) will cause an in-progress result to miss its deadline, or + 3) will make an in-progress result miss its deadline + by more than is already predicted. + If this option is not set, or if the client request doesn't + include a workload description (i.e. the client is old) + use the existing approach, which assumes there's no workload. + NOTE: this is experimental. Production projects should not use it. + - EDF sim: write debug stuff to stderr instead of stdout + - Account manager: + - if an account is detach_when_done, set dont_request_more_work + - check dont_request_more_work even for first-time projects + - update_uotd: generate a file for use by Google gadget + - user_links(): use full URLs (so can use in Google gadget) + + client/ + acct_mgr.C + work_fetch.C + html/ + inc/ + uotd.inc + util.inc + user/ + uotd_gadget.php (new) + sched/ + Makefile.am + edf_sim.C + sched_config.C,h + sched_resend.C + sched_send.C,h + server_types.C,h diff --git a/client/acct_mgr.C b/client/acct_mgr.C index 5edcd775fc..7ec518b7bd 100644 --- a/client/acct_mgr.C +++ b/client/acct_mgr.C @@ -411,6 +411,9 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) { } if (acct.detach_when_done.present) { pp->detach_when_done = acct.detach_when_done.value; + if (pp->detach_when_done) { + pp->dont_request_more_work = true; + } } // initiate a scheduler RPC if requested by AMS @@ -448,6 +451,9 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) { gstate.add_project( acct.url.c_str(), acct.authenticator.c_str(), "", true ); + if (acct.dont_request_more_work.present) { + pp->dont_request_more_work = 
acct.dont_request_more_work.value; + } } } } diff --git a/client/work_fetch.C b/client/work_fetch.C index a9900b8fc6..683b95efde 100644 --- a/client/work_fetch.C +++ b/client/work_fetch.C @@ -147,7 +147,7 @@ PROJECT* CLIENT_STATE::next_project_sched_rpc_pending() { p->sched_rpc_pending = RPC_REASON_PROJECT_REQ; p->next_rpc_time = 0; } - //if (p->suspended_via_gui) continue; + // if (p->suspended_via_gui) continue; // do the RPC even if suspended. // This is critical for acct mgrs, to propagate new host CPIDs // diff --git a/html/inc/uotd.inc b/html/inc/uotd.inc index 858ae45d25..4dde56ca68 100644 --- a/html/inc/uotd.inc +++ b/html/inc/uotd.inc @@ -7,7 +7,8 @@ define('UOTD_THRESHOLD', 7); function uotd_thumbnail($profile, $user) { if ($profile->has_picture) { - return "id>"; + $ub = URL_BASE; + return "id>id."_sm.jpg>"; } else { return ""; } @@ -51,7 +52,7 @@ function select_uotd() { if ($assigned['mday'] == $now['mday']) { $user = lookup_user_id($current_uotd->userid); echo "Already have UOTD for today\n"; - generate_uotd_gadget($profile, $user); + generate_uotd_gadget($current_uotd, $user); exit(); } } @@ -94,8 +95,6 @@ function select_uotd() { $profile = mysql_fetch_object($result); $user = lookup_user_id($profile->userid); - generate_uotd_gadget($profile, $user); - $sql = "UPDATE profile SET uotd_time = ".time()." WHERE userid=$user->id"; mysql_query($sql); @@ -106,6 +105,8 @@ function select_uotd() { Your profile will be featured on the " . PROJECT . " website for the next 24 hours." ); echo "Chose user $user->id as UOTD\n"; + $profile->uotd_time = time(); + generate_uotd_gadget($profile, $user); } // This is the default policy for choosing the UOTD on any BOINC project. 
@@ -153,7 +154,9 @@ function generate_uotd_gadget($profile, $user) { $x = "\n"; $gadget = PROFILE_PATH."uotd_gadget.html"; if( $h = fopen($gadget, "w") ){ - if( (time()-$profile->uotd_time) <= 86400+3600 ) { // allow for slop + $age = time()-$profile->uotd_time; + echo "age: $age"; + if($age <= 86400+3600) { // allow for slop $x .= uotd_thumbnail($profile, $user); $x .= user_links($user); $x .= "  ". diff --git a/html/inc/util.inc b/html/inc/util.inc index 01440bff84..06854dde62 100644 --- a/html/inc/util.inc +++ b/html/inc/util.inc @@ -512,9 +512,10 @@ function sched_stopped() { } function user_links($user) { - $x = ''.$user->name.''; + $ub = URL_BASE; + $x = ''.$user->name.''; if ($user->has_profile) { - $x .= ' User profile image'; + $x .= ' User profile image'; } # Does this project accept donations? # If so, do you want to have a link next to user name as it appears on the web site? diff --git a/html/user/uotd_gadget.php b/html/user/uotd_gadget.php new file mode 100644 index 0000000000..56cf27184a --- /dev/null +++ b/html/user/uotd_gadget.php @@ -0,0 +1,23 @@ + + + + +"; + +?> diff --git a/sched/Makefile.am b/sched/Makefile.am index 97052d3b0c..cab8b31778 100644 --- a/sched/Makefile.am +++ b/sched/Makefile.am @@ -62,6 +62,7 @@ EXTRA_DIST = \ cgi_SOURCES = \ + edf_sim.C \ handle_request.C \ main.C \ sched_array.C \ diff --git a/sched/edf_sim.C b/sched/edf_sim.C index 9258a28b46..ec1557594a 100644 --- a/sched/edf_sim.C +++ b/sched/edf_sim.C @@ -25,16 +25,21 @@ using std::vector; //#define TEST -#ifdef TEST -#define INFO0 printf -#define INFO1 printf -#define INFO2 printf +#define DEBUG +#ifdef DEBUG +#define INFO0 +#define INFO1 +#define INFO2 #else #define INFO0 // #define INFO1 // #define INFO2 // #endif +// 0 shows initial workload and candidate decisions +// 1 shows function calls results of sim: make/miss deadline +// 2 shows every step of sim + bool lessthan_deadline(const IP_RESULT& p1, const IP_RESULT& p2) { if (p1.computation_deadline < 
p2.computation_deadline) return true; return false; @@ -48,7 +53,7 @@ void mark_edf_misses (int ncpus, vector& ip_results){ double booked_to[128]; int j; - INFO1("mark_edf_misses\n"); + INFO1 fprintf(stderr, "mark_edf_misses\n"); // keeps track of when each cpu is next free // @@ -77,18 +82,18 @@ void mark_edf_misses (int ncpus, vector& ip_results){ } booked_to[lowest_booked_cpu] += r.cpu_time_remaining; - INFO2(" running %s on cpu %d; finishes at %f\n", + INFO2 fprintf(stderr, " running %s on cpu %d; finishes at %f\n", r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu] ); if (booked_to[lowest_booked_cpu] > r.computation_deadline) { r.misses_deadline = true; r.estimated_completion_time = booked_to[lowest_booked_cpu]; - INFO1(" %s misses_deadline; est completion %f\n", + INFO1 fprintf(stderr, " %s misses_deadline; est completion %f\n", r.name, booked_to[lowest_booked_cpu] ); } else { r.misses_deadline = false; - INFO1(" %s makes deadline; est completion %f\n", + INFO1 fprintf(stderr, " %s makes deadline; est completion %f\n", r.name, booked_to[lowest_booked_cpu] ); // if result doesn't miss its deadline, @@ -106,9 +111,14 @@ void init_ip_results( vector& ip_results ){ unsigned int i; + + INFO0 fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus); for (i=0; i r.computation_deadline && !r.misses_deadline ) { - INFO1 (" %s now misses deadline: %f\n", + INFO0 fprintf(stderr, " cand. fails; %s now misses deadline: %f\n", r.name, booked_to[lowest_booked_cpu] ); return false; @@ -230,10 +240,11 @@ bool check_candidate ( if (r.misses_deadline && booked_to[lowest_booked_cpu] > r.estimated_completion_time ){ - INFO0 (" %s: late result to be returned even later\n", r.name); + INFO1 fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name); return false; } } + INFO1 fprintf(stderr, " cand. 
succeeds\n"); return true; } diff --git a/sched/sched_config.C b/sched/sched_config.C index 03c61f5e32..3a51e4074d 100644 --- a/sched/sched_config.C +++ b/sched/sched_config.C @@ -130,6 +130,7 @@ int SCHED_CONFIG::parse(FILE* f) { else if (xp.parse_int(tag, "grace_period_hours", grace_period_hours)) continue; else if (xp.parse_int(tag, "delete_delay_hours", delete_delay_hours)) continue; + else if (xp.parse_bool(tag, "workload_sim", workload_sim)) continue; // some tags that scheduler doesn't care about // diff --git a/sched/sched_config.h b/sched/sched_config.h index 088fa547d7..a63bbe439c 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -78,12 +78,24 @@ public: int sched_debug_level; int fuh_debug_level; int reliable_time; // age of workunit before requiring reliable - int reliable_min_avg_credit; // min average credit for a host to be declared reliable - int reliable_max_avg_turnaround; // max average turnaround for a host to be declared reliable - int reliable_priority_on_over; // additional results generated after at least one result is over will have their priority boosted by this amount - int reliable_priority_on_over_except_error; // additional results generated after at least one result is over (unless their is an error) will have their priority boosted by this amount - int reliable_on_priority; // results with a priority equal or greater than this value will be sent to reliable hosts - double reliable_reduced_delay_bound; // Reduce the delay bounds for reliable hosts by this percent + int reliable_min_avg_credit; + // min average credit for a host to be declared reliable + int reliable_max_avg_turnaround; + // max average turnaround for a host to be declared reliable + int reliable_priority_on_over; + // additional results generated after at least one result + // is over will have their priority boosted by this amount + int reliable_priority_on_over_except_error; + // additional results generated after at least one result is over + // 
(unless there is an error) will have their priority boosted + // by this amount + int reliable_on_priority; + // results with a priority equal or greater than this value + // will be sent to reliable hosts + double reliable_reduced_delay_bound; + // Reduce the delay bounds for reliable hosts by this percent + bool workload_sim; + // Do workload simulation in deciding whether to send a result int parse(FILE*); int parse_file(const char* dir="."); diff --git a/sched/sched_resend.C b/sched/sched_resend.C index 32ed9baf89..2267bd9d29 100644 --- a/sched/sched_resend.C +++ b/sched/sched_resend.C @@ -17,8 +17,14 @@ // or write to the Free Software Foundation, Inc., // 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// scheduler code related to sending work - +// scheduler code related to sending "lost" work +// (i.e. results we sent to the host, but which they're not reporting) +// +// TODO: +// - make sure result is still needed (no canonical result yet) +// - don't send if project has been reset since first send; +// this result may have been the cause of reset +// (need to pass last reset time from client) #include "config.h" #include "error_numbers.h" @@ -40,11 +46,9 @@ #define FCGI_ToFILE(x) (x) #endif -// returns zero if result still feasible. -// result may hve been given a new report time. -// Returns nonzero if result is no longer feasible -// (not enough time to compute it on host). -// In this case result is unchanged. +// Assign a new deadline for the result; +// if it's not likely to complete by this time, return nonzero. 
+// TODO: EXPLAIN THE FORMULA FOR NEW DEADLINE // static int possibly_give_result_new_deadline( DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REPLY& reply diff --git a/sched/sched_send.C b/sched/sched_send.C index a231f3936d..2c4343b280 100644 --- a/sched/sched_send.C +++ b/sched/sched_send.C @@ -410,9 +410,27 @@ int wu_is_infeasible( int reason = 0; check_app_filter(wu, request, reply, reason); + if (reason) return reason; check_memory(wu, request, reply, reason); + if (reason) return reason; check_disk(wu, request, reply, reason); - check_deadline(wu, request, reply, reason); + if (reason) return reason; + + if (config.workload_sim && request.have_other_results_list) { + double est_cpu = estimate_cpu_duration(wu, reply); + IP_RESULT candidate("", wu.delay_bound, est_cpu); + strcpy(candidate.name, wu.name); + if (check_candidate(candidate, reply.host.p_ncpus, request.ip_results)) { + // it passed the feasibility test, + // but don't add it to the workload yet; + // wait until we commit to sending it + } else { + reply.wreq.insufficient_speed = true; + reason |= INFEASIBLE_WORKLOAD; + } + } else { + check_deadline(wu, request, reply, reason); + } return reason; } @@ -799,6 +817,14 @@ int add_result_to_reply( request.estimated_delay += wu_seconds_filled/reply.host.p_ncpus; reply.wreq.nresults++; if (!resent_result) reply.host.nresults_today++; + + // add this result to workload for simulation + // + if (config.workload_sim && request.have_other_results_list) { + double est_cpu = estimate_cpu_duration(wu, reply); + IP_RESULT ipr ("", time(0)+wu.delay_bound, est_cpu); + request.ip_results.push_back(ipr); + } return 0; } @@ -837,15 +863,11 @@ int send_work( reply.wreq.seconds_to_fill = MIN_SECONDS_TO_SEND; } - // TODO: add code to send results that were sent earlier but not reported. 
- // Cautions (from John McLeod): - // - make sure the result is still needed - // - don't send if the project has been reset since first send, - // since result may have been cause of the reset - // (need to pass reset time?) - // - make sure can complete by deadline - // - don't send if project is suspended or "no more work" on client - // (need to pass these) + if (config.workload_sim && sreq.have_other_results_list) { + init_ip_results( + sreq.global_prefs.work_buf_min(), reply.host.p_ncpus, sreq.ip_results + ); + } if (config.locality_scheduling) { reply.wreq.infeasible_only = false; diff --git a/sched/sched_send.h b/sched/sched_send.h index 5f5601e1f0..ade4f987ca 100644 --- a/sched/sched_send.h +++ b/sched/sched_send.h @@ -48,6 +48,7 @@ extern bool app_core_compatible(WORK_REQ& wreq, APP_VERSION& av); #define INFEASIBLE_CPU 4 #define INFEASIBLE_WORK_BUF 8 #define INFEASIBLE_APP_SETTING 16 +#define INFEASIBLE_WORKLOAD 32 extern int wu_is_infeasible(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&); diff --git a/sched/server_types.C b/sched/server_types.C index 47b40b7815..8e38c971d9 100644 --- a/sched/server_types.C +++ b/sched/server_types.C @@ -92,10 +92,12 @@ int IP_RESULT::parse(FILE* f) { report_deadline = 0; cpu_time_remaining = 0; + strcpy(name, ""); while (fgets(buf, sizeof(buf), f)) { if (match_tag(buf, "")) { return 0; } + if (parse_str(buf, "", name, sizeof(name))) continue; if (parse_double(buf, "", report_deadline)) continue; if (parse_double(buf, "", cpu_time_remaining)) continue; } @@ -257,12 +259,18 @@ int SCHEDULER_REQUEST::parse(FILE* fin) { continue; } else if (match_tag(buf, "")) { have_ip_results_list = true; + int i = 0; + double now = time(0); while (fgets(buf, sizeof(buf), fin)) { if (match_tag(buf, "")) break; if (match_tag(buf, "")) { IP_RESULT ir; retval = ir.parse(fin); if (!retval) { + if (!strlen(ir.name)) { + sprintf(ir.name, "ip%d", i++); + } + ir.report_deadline -= now; ip_results.push_back(ir); } } @@ -812,6 +820,10 @@ int 
RESULT::parse_from_client(FILE* fin) { safe_strcat(stderr_out, buf); } continue; + } else if (match_tag(buf, "")) { + continue; + } else if (match_tag(buf, "")) { + continue; } else { log_messages.printf( SCHED_MSG_LOG::MSG_NORMAL, @@ -859,6 +871,8 @@ int HOST::parse(FILE* fin) { // fields reported by 5.5+ clients, not currently used // + else if (match_tag(buf, "")) continue; +#if 0 else if (match_tag(buf, "")) continue; else if (match_tag(buf, "")) continue; @@ -869,8 +883,7 @@ int HOST::parse(FILE* fin) { else if (match_tag(buf, "")) continue; else if (match_tag(buf, "")) continue; else if (match_tag(buf, "")) continue; - - +#endif else { log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL, "HOST::parse(): unrecognized: %s\n", buf diff --git a/sched/server_types.h b/sched/server_types.h index 33ab7739f3..92ab61ab02 100644 --- a/sched/server_types.h +++ b/sched/server_types.h @@ -108,6 +108,7 @@ struct GLOBAL_PREFS { void parse(const char* buf, const char* venue); void defaults(); + inline double work_buf_min() {return work_buf_min_days*86400;} }; struct GUI_URLS {