From df8cbdb29493a000ac7ee97c8da1aa4826330b51 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Fri, 30 Nov 2007 23:02:55 +0000 Subject: [PATCH] - scheduler: - if WU is infeasible, print message instead of number - remove useless messages - remove EDF simulations printfs - don't update nresults_on_host in resend_lost_work() (it's already done in add_result_to_reply()) svn path=/trunk/boinc/; revision=14336 --- checkin_notes | 15 ++++ sched/edf_sim.C | 54 ++++++++----- sched/handle_request.C | 21 +++-- sched/sched_array.C | 4 +- sched/sched_resend.C | 177 +++++++++++++++++++---------------------- sched/sched_send.C | 25 ++++-- sched/sched_send.h | 2 +- 7 files changed, 159 insertions(+), 139 deletions(-) diff --git a/checkin_notes b/checkin_notes index 21a950ec98..5229593fcb 100644 --- a/checkin_notes +++ b/checkin_notes @@ -11964,3 +11964,18 @@ Rom 30 Nov 2007 boinctray.vcproj win_build/installerv2/ BOINC.ism + +David 30 Nov 2007 + - scheduler: + - if WU is infeasible, print message instead of number + - remove useless messages + - remove EDF simulations printfs + - don't update nresults_on_host in resend_lost_work() + (it's already done in add_result_to_reply()) + + sched/ + edf_sim.C + handle_request.C + sched_array.C + sched_resend.C + sched_send.C,h diff --git a/sched/edf_sim.C b/sched/edf_sim.C index cf11a7e651..30730e6739 100644 --- a/sched/edf_sim.C +++ b/sched/edf_sim.C @@ -25,16 +25,6 @@ using std::vector; //#define TEST -//#define DEBUG -#ifdef DEBUG -#define INFO0 -#define INFO1 -#define INFO2 -#else -#define INFO0 // -#define INFO1 // -#define INFO2 // -#endif // 0 shows initial workload and candidate decisions // 1 shows function calls results of sim: make/miss deadline @@ -53,7 +43,9 @@ void mark_edf_misses (int ncpus, vector& ip_results){ double booked_to[128]; int j; - INFO1 fprintf(stderr, "mark_edf_misses\n"); +#ifdef DEBUG + fprintf(stderr, "mark_edf_misses\n"); +#endif // keeps track of when each cpu is next free // @@ -82,20 +74,26 @@ void mark_edf_misses (int ncpus, vector& ip_results){ } booked_to[lowest_booked_cpu] += r.cpu_time_remaining; - INFO2 fprintf(stderr, " running %s on cpu %d; finishes at %f\n", +#ifdef DEBUG + fprintf(stderr, " running %s on cpu %d; finishes at %f\n", r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu] ); +#endif if (booked_to[lowest_booked_cpu] > r.computation_deadline) { r.misses_deadline = true; r.estimated_completion_time = booked_to[lowest_booked_cpu]; - INFO1 fprintf(stderr, " %s misses_deadline; est completion %f\n", +#ifdef DEBUG + fprintf(stderr, " %s misses_deadline; est completion %f\n", r.name, booked_to[lowest_booked_cpu] ); +#endif } else { r.misses_deadline = false; - INFO1 fprintf(stderr, " %s makes deadline; est completion %f\n", +#ifdef DEBUG + fprintf(stderr, " %s makes deadline; est completion %f\n", r.name, booked_to[lowest_booked_cpu] ); +#endif // if result doesn't miss its deadline, // then the estimated_completion_time is of no use } @@ -112,13 +110,17 @@ void init_ip_results( ){ unsigned int i; - INFO0 fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus); +#ifdef DEBUG + fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus); +#endif for (i=0; i computation_deadline AND // result would not have missed deadline to begin with @@ -229,9 +235,11 @@ bool check_candidate ( if (booked_to[lowest_booked_cpu] > r.computation_deadline && !r.misses_deadline ) { - INFO0 fprintf(stderr, " cand. fails; %s now misses deadline: %f\n", +#ifdef DEBUG + fprintf(stderr, " cand. fails; %s now misses deadline: %f\n", r.name, booked_to[lowest_booked_cpu] ); +#endif return false; } // check a late result (i.e., one that would have missed its @@ -240,11 +248,15 @@ bool check_candidate ( if (r.misses_deadline && booked_to[lowest_booked_cpu] > r.estimated_completion_time ){ - INFO1 fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name); +#ifdef DEBUG + fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name); +#endif return false; } } - INFO1 fprintf(stderr, " cand. succeeds\n"); +#ifdef DEBUG + fprintf(stderr, " cand. succeeds\n"); +#endif return true; } diff --git a/sched/handle_request.C b/sched/handle_request.C index 8a13fa1640..bf6bffce99 100644 --- a/sched/handle_request.C +++ b/sched/handle_request.C @@ -530,7 +530,7 @@ static int update_host_record(HOST& initial_host, HOST& xhost, USER& user) { return 0; } -// Figure out which of the results the user currently has +// Figure out which of the results the host currently has // should be aborted outright, or aborted if not started yet // int send_result_abort( @@ -556,10 +556,6 @@ int send_result_abort( result_names.append("'"); result_names.append(orp.name); result_names.append("'"); - log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, - "Result is on [HOST#%d]: %s\n", - reply.host.id, orp.name.c_str() - ); } // query the db for the results and set the appropriate flag @@ -1426,14 +1422,15 @@ void process_request( handle_results(sreq, reply); reply.wreq.nresults_on_host = sreq.other_results.size(); - if (config.resend_lost_results && sreq.have_other_results_list) { - if (resend_lost_work(sreq, reply, platforms, ss)) { - ok_to_send_work = false; + if (sreq.have_other_results_list) { + if (config.resend_lost_results) { + if (resend_lost_work(sreq, reply, platforms, ss)) { + ok_to_send_work = false; + } + } + if (config.send_result_abort) { + send_result_abort(sreq, reply, ss); } - } - - if (config.send_result_abort && sreq.have_other_results_list) { - send_result_abort(sreq, reply, ss); } // if last RPC was within config.min_sendwork_interval, don't send work diff --git a/sched/sched_array.C b/sched/sched_array.C index 56667bb62e..6e96622d0d 100644 --- a/sched/sched_array.C +++ b/sched/sched_array.C @@ -116,8 +116,8 @@ void scan_work_array( retval = wu_is_infeasible(wu, sreq, reply, *app); if (retval) { log_messages.printf( - SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %d\n", - reply.host.id, wu.id, wu.name, retval + SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %s\n", + reply.host.id, wu.id, wu.name, infeasible_string(retval) ); continue; } diff --git a/sched/sched_resend.C b/sched/sched_resend.C index 1c77f54072..431af0a2f1 100644 --- a/sched/sched_resend.C +++ b/sched/sched_resend.C @@ -54,19 +54,19 @@ static int possibly_give_result_new_deadline( DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REPLY& reply ) { const double resend_frac = 0.5; // range [0, 1) - int result_sent_time = time(0); - int result_report_deadline = result_sent_time + (int)(resend_frac*(result.report_deadline - result.sent_time)); + int now = time(0); + int result_report_deadline = now + (int)(resend_frac*(result.report_deadline - result.sent_time)); if (result_report_deadline < result.report_deadline) { result_report_deadline = result.report_deadline; } - if (result_report_deadline > result_sent_time + wu.delay_bound) { - result_report_deadline = result_sent_time + wu.delay_bound; + if (result_report_deadline > now + wu.delay_bound) { + result_report_deadline = now + wu.delay_bound; } // If infeasible, return without modifying result // - if (estimate_cpu_duration(wu, reply) > result_report_deadline-result_sent_time) { + if (estimate_cpu_duration(wu, reply) > result_report_deadline-now) { log_messages.printf( SCHED_MSG_LOG::MSG_DEBUG, "[RESULT#%d] [HOST#%d] not resending lost result: can't complete in time\n", @@ -83,11 +83,17 @@ static int possibly_give_result_new_deadline( result.id, reply.host.id, result_report_deadline==result.report_deadline?"NO update to":"Updated" ); - result.sent_time = result_sent_time; + result.sent_time = now; result.report_deadline = result_report_deadline; return 0; } +// resend any jobs that: +// 1) we already sent to this host; +// 2) are still in progress (i.e. haven't timed out) and +// 3) aren't present on the host +// Return true if there were any such jobs +// bool resend_lost_work( SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM_LIST& platforms, SCHED_SHMEM& ss @@ -96,116 +102,99 @@ bool resend_lost_work( std::vectorresults; unsigned int i; char buf[256]; + char warning_msg[256]; bool did_any = false; - int num_to_resend=0; + int num_eligible_to_resend=0; int num_resent=0; - int num_on_host=0; APP* app; APP_VERSION* avp; int retval; - // print list of results on host - // - for (i=0; i