From be4cd9bb79c0ddaa27459eb01ce821a31da809c4 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Mon, 27 Oct 2008 21:23:07 +0000 Subject: [PATCH] - scheduler: notify user if we're not sending work because we don't have any (matchmaker only). - back end programs: for programs that do enumerations, check for error returns and exit (otherwise we'll get stuck forever if DB fails) NOTE: In the course of researching this I came across a bug in the transitioner: if there's a WU with more than 1000 results, the enumeration will always return ERR_DB_NOT_FOUND, and the transitioner won't ever do anything again. Fixing this is a little tricky, so I'm not going to do it right now. svn path=/trunk/boinc/; revision=16324 --- checkin_notes | 25 ++++++++++++++++++++++++- sched/file_deleter.cpp | 15 +++++++++++---- sched/request_file_list.cpp | 12 +++++++++++- sched/sched_send.cpp | 26 ++++++++++++++++++++++---- sched/server_types.h | 1 + sched/transitioner.cpp | 17 ++++++++++++++--- sched/trickle_handler.cpp | 13 +++++++++++-- sched/update_stats.cpp | 34 +++++++++++++++++++++++++++++++--- 8 files changed, 125 insertions(+), 18 deletions(-) diff --git a/checkin_notes b/checkin_notes index b3dc60f091..296a8a6adc 100644 --- a/checkin_notes +++ b/checkin_notes @@ -8719,4 +8719,27 @@ David 27 Oct 2008 time_stats.cpp work_fetch.cpp clientgui/ - BOINCBaseView.cpp \ No newline at end of file + BOINCBaseView.cpp + +David 27 Oct 2008 + - scheduler: notify user if we're not sending work + because we don't have any (matchmaker only). + - back end programs: for programs that do enumerations, + check for error returns and exit + (otherwise we'll get stuck forever if DB fails) + + NOTE: In the course of researching this I came across a bug + in the transitioner: if there's a WU with more than 1000 results, + the enumeration will always return ERR_DB_NOT_FOUND, + and the transitioner won't ever do anything again. + Fixing this is a little tricky, so I'm not going to do it right now. + + sched/ + file_deleter.cpp + request_file_list.cpp + sched_send.cpp + server_types.h + transitioner.cpp + trickle_handler.cpp + update_stats.cpp + diff --git a/sched/file_deleter.cpp b/sched/file_deleter.cpp index 6812bc7654..508130daae 100644 --- a/sched/file_deleter.cpp +++ b/sched/file_deleter.cpp @@ -275,9 +275,7 @@ bool do_pass(bool retry_error) { retval = wu.enumerate(buf); if (retval) { if (retval != ERR_DB_NOT_FOUND) { - log_messages.printf(MSG_DEBUG, - "DB connection lost, exiting\n" - ); + log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n"); exit(0); } break; @@ -306,7 +304,16 @@ bool do_pass(bool retry_error) { clause, RESULTS_PER_ENUM ); - while (!result.enumerate(buf)) { + while (1) { + retval = result.enumerate(buf); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n"); + exit(0); + } + break; + } + did_something = true; retval = 0; if (!preserve_result_files) { diff --git a/sched/request_file_list.cpp b/sched/request_file_list.cpp index e1a8ffd90b..5f75b57c19 100644 --- a/sched/request_file_list.cpp +++ b/sched/request_file_list.cpp @@ -32,6 +32,7 @@ #include "boinc_db.h" #include "str_util.h" +#include "error_numbers.h" #include "sched_config.h" #include "sched_util.h" @@ -55,7 +56,16 @@ int request_file_list(int host_id) { int request_files_from_all() { DB_HOST host; - while(!host.enumerate()) { + + while(1) { + int retval = host.enumerate(); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + fprintf(stderr, "lost DB connection\n"); + exit(1); + } + break; + } request_file_list(host.get_id()); } return 0; diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 9049beef83..152f0f3702 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -985,7 +985,7 @@ int add_result_to_reply( } } retval = result.mark_as_sent(old_server_state); - if (retval==ERR_DB_NOT_FOUND) { + if (retval == ERR_DB_NOT_FOUND) { log_messages.printf(MSG_CRITICAL, "[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n", result.id, reply.host.id @@ -1163,9 +1163,11 @@ static void explain_to_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { if (!reply.wreq.host_info.preferred_apps[i].work_available) { APP* app = ssp->lookup_app(reply.wreq.host_info.preferred_apps[i].appid); // don't write message if the app is deprecated - if ( app != NULL ) { + if (app != NULL) { char explanation[256]; - sprintf(explanation,"No work is available for %s",find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid)); + sprintf(explanation, "No work is available for %s", + find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid) + ); USER_MESSAGE um(explanation, "high"); reply.insert_message(um); } @@ -1234,6 +1236,13 @@ static void explain_to_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { ); reply.insert_message(um); } + if (reply.wreq.no_jobs_available) { + USER_MESSAGE um( + "(Project has no jobs available)", + "high" + ); + reply.insert_message(um); + } if (reply.wreq.daily_result_quota_exceeded) { struct tm *rpc_time_tm; int delay_time; @@ -1778,7 +1787,7 @@ void JOB_SET::send(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { } void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { - int i, slots_locked=0; + int i, slots_locked=0, slots_nonempty=0; JOB_SET jobs (sreq, reply); int min_slots = config.mm_min_slots; if (!min_slots) min_slots = ssp->max_wu_results/2; @@ -1799,8 +1808,10 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { case WR_STATE_EMPTY: continue; case WR_STATE_PRESENT: + slots_nonempty++; break; default: + slots_nonempty++; if (wu_result.state == g_pid) break; slots_locked++; continue; @@ -1839,6 +1850,13 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { if (jobs.request_satisfied() && slots_scanned>=min_slots) break; } + if (!slots_nonempty) { + log_messages.printf(MSG_CRITICAL, + "Job cache is empty - check feeder\n" + ); + reply.wreq.no_jobs_available = true; + } + // TODO: trim jobs from tail of list until we pass the EDF check // jobs.send(sreq, reply); diff --git a/sched/server_types.h b/sched/server_types.h index 9ed4cf8122..f00fb4f2fc 100644 --- a/sched/server_types.h +++ b/sched/server_types.h @@ -162,6 +162,7 @@ struct WORK_REQ { bool daily_result_quota_exceeded; int daily_result_quota; // for this machine: number of cpus * daily_quota/cpu bool cache_size_exceeded; + bool no_jobs_available; // project has no work right now int nresults_on_host; // How many results from this project are in progress on the host. // Initially this is the number of "other_results" diff --git a/sched/transitioner.cpp b/sched/transitioner.cpp index 754af527bf..98a86e2c31 100644 --- a/sched/transitioner.cpp +++ b/sched/transitioner.cpp @@ -40,6 +40,7 @@ using namespace std; #include "util.h" #include "backend_lib.h" #include "common_defs.h" +#include "error_numbers.h" #include "sched_config.h" #include "sched_util.h" @@ -619,11 +620,21 @@ bool do_pass() { // loop over entries that are due to be checked // - while (!transitioner.enumerate((int)time(0), SELECT_LIMIT, mod_n, mod_i, items)) { + while (1) { + retval = transitioner.enumerate( + (int)time(0), SELECT_LIMIT, mod_n, mod_i, items + ); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + log_messages.printf(MSG_CRITICAL, + "WU enum error%d; exiting\n", retval + ); + exit(1); + } + break; + } did_something = true; - TRANSITIONER_ITEM& wu_item = items[0]; - retval = handle_wu(transitioner, items); if (retval) { log_messages.printf(MSG_CRITICAL, diff --git a/sched/trickle_handler.cpp b/sched/trickle_handler.cpp index f26bac46d6..47312a04d0 100644 --- a/sched/trickle_handler.cpp +++ b/sched/trickle_handler.cpp @@ -28,13 +28,14 @@ // // return nonzero on error -using namespace std; #include "config.h" #include #include "boinc_db.h" #include "util.h" +#include "error_numbers.h" #include "str_util.h" + #include "sched_config.h" #include "sched_util.h" #include "sched_msgs.h" @@ -82,7 +83,15 @@ bool do_trickle_scan() { int retval; sprintf(buf, "where variety='%s' and handled=0", variety); - while (!mfh.enumerate(buf)) { + while (1) { + retval = mfh.enumerate(buf); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + fprintf(stderr, "lost DB conn\n"); + exit(1); + } + break; + } retval = handle_trickle(mfh); if (!retval) { mfh.handled = true; diff --git a/sched/update_stats.cpp b/sched/update_stats.cpp index 7953c64c6a..c5907a1b30 100644 --- a/sched/update_stats.cpp +++ b/sched/update_stats.cpp @@ -35,6 +35,7 @@ #include "boinc_db.h" #include "util.h" #include "str_util.h" +#include "error_numbers.h" #include "sched_config.h" #include "sched_util.h" @@ -53,7 +54,16 @@ int update_users() { int retval; char buf[256]; - while (!user.enumerate("where expavg_credit>0.1")) { + while (1) { + retval = user.enumerate("where expavg_credit>0.1"); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + log_messages.printf(MSG_CRITICAL, "lost DB conn\n"); + exit(1); + } + break; + } + if (user.expavg_time > update_time_cutoff) continue; update_average(0, 0, CREDIT_HALF_LIFE, user.expavg_credit, user.expavg_time); sprintf( buf, "expavg_credit=%f, expavg_time=%f", @@ -74,7 +84,16 @@ int update_hosts() { int retval; char buf[256]; - while (!host.enumerate("where expavg_credit>0.1")) { + while (1) { + retval = host.enumerate("where expavg_credit>0.1"); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + log_messages.printf(MSG_CRITICAL, "lost DB conn\n"); + exit(1); + } + break; + } + if (host.expavg_time > update_time_cutoff) continue; update_average(0, 0, CREDIT_HALF_LIFE, host.expavg_credit, host.expavg_time); sprintf( @@ -123,7 +142,16 @@ int update_teams() { int retval; char buf[256]; - while (!team.enumerate("where expavg_credit>0.1")) { + while (1) { + retval = team.enumerate("where expavg_credit>0.1"); + if (retval) { + if (retval != ERR_DB_NOT_FOUND) { + log_messages.printf(MSG_CRITICAL, "lost DB conn\n"); + exit(1); + } + break; + } + retval = get_team_totals(team); if (retval) { log_messages.printf(MSG_CRITICAL,