diff --git a/checkin_notes b/checkin_notes index c8aa297197..15ba82a505 100755 --- a/checkin_notes +++ b/checkin_notes @@ -7695,3 +7695,23 @@ David 11 Nov 2003 sched_shmem.C tools/ backend_lib.C + +David 11 Nov 2003 + - modify the scheduler so that if a client requests work, + and there are unsent results, but none can be sent, + then the client gets an explanatory message: either + - no app version available + - insufficient disk + - insufficient memory + - host is too slow + This involved creating a WORK_REQ structure that gets passed + around, to avoid huge numbers of arguments + - modify the scheduler so that if either + 1) the shared-mem segment is absent or + 2) the DB can't be opened + then it writes a "project temporarily down for maintenance" + message instead of just exiting + + sched/ + handle_request.C + main.C diff --git a/sched/handle_request.C b/sched/handle_request.C index 88f212434e..ef79ed048a 100644 --- a/sched/handle_request.C +++ b/sched/handle_request.C @@ -47,6 +47,21 @@ const int MAX_WUS_TO_SEND = 10; const double COBBLESTONE_FACTOR = 300.0; +struct WORK_REQ { + bool infeasible_only; + double seconds_to_fill; + double disk_available; + int nresults; + + // the following flags are set whenever a result is infeasible; + // used to construct explanatory message to user + // + bool insufficient_disk; + bool insufficient_mem; + bool insufficient_speed; + bool no_app_version; +}; + // compute the max disk usage we can request of the host // double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) { @@ -58,7 +73,7 @@ double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) { // fill in default values for missing prefs // - if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.01; // 10 MB + if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.1; // 100 MB if (prefs.disk_max_used_pct == 0) prefs.disk_max_used_pct = 10; // min_free_gb can be zero @@ -124,19 +139,13 @@ inline double estimate_wallclock_duration(WORKUNIT& wu, 
HOST& host) { // return true if the WU can be executed on the host // -bool wu_is_feasible(WORKUNIT& wu, HOST& host) { - if(host.d_free && wu.rsc_disk_bound > host.d_free) { - log_messages.printf( - SchedMessages::DEBUG, "[WU#%d %s] needs %f disk; [HOST#%d] has %f\n", - wu.id, wu.name, wu.rsc_disk_bound, host.id, host.d_free - ); - return false; - } +bool wu_is_feasible(WORKUNIT& wu, HOST& host, WORK_REQ& wreq) { if (host.m_nbytes && wu.rsc_memory_bound > host.m_nbytes) { log_messages.printf( SchedMessages::DEBUG, "[WU#%d %s] needs %f mem; [HOST#%d] has %f\n", wu.id, wu.name, wu.rsc_memory_bound, host.id, host.m_nbytes ); + wreq.insufficient_mem = true; return false; } @@ -148,6 +157,7 @@ bool wu_is_feasible(WORKUNIT& wu, HOST& host) { SchedMessages::DEBUG, "[WU#%d %s] needs requires %d seconds on [HOST#%d]; delay_bound is %d\n", wu.id, wu.name, (int)wu_wallclock_time, host.id, wu.delay_bound ); + wreq.insufficient_speed = true; return false; } @@ -204,7 +214,8 @@ int insert_wu_tags(WORKUNIT& wu, APP& app) { // Add the app and app_version to the reply also. 
// int add_wu_to_reply( - WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss + WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss, + WORK_REQ& wreq ) { APP* app; APP_VERSION* avp; @@ -221,10 +232,11 @@ int add_wu_to_reply( avp = ss.lookup_app_version(app->id, platform.id, app->min_version); if (!avp) { log_messages.printf( - SchedMessages::CRITICAL, - "Can't find app version: APP#%d PLATFORM#%d min_version %d\n", + SchedMessages::DEBUG, + "no app version available: APP#%d PLATFORM#%d min_version %d\n", app->id, platform.id, app->min_version ); + wreq.no_app_version = true; return ERR_NULL; } @@ -640,10 +652,9 @@ static bool already_in_reply(WU_RESULT& wu_result, SCHEDULER_REPLY& reply) { // previously infeasible for some host // static void scan_work_array( - bool infeasible_only, double& seconds_to_fill, double& disk_available, - int& nresults, + WORK_REQ& wreq, SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM& platform, - SCHED_SHMEM& ss, bool& insufficient_disk + SCHED_SHMEM& ss ) { int i, retval, n; WORKUNIT wu; @@ -651,12 +662,12 @@ static void scan_work_array( double wu_seconds_filled; char buf[256]; - if (disk_available < 0) insufficient_disk = true; + if (wreq.disk_available < 0) wreq.insufficient_disk = true; for (i=0; i disk_available) { - insufficient_disk = true; + if (wu_result.workunit.rsc_disk_bound > wreq.disk_available) { + wreq.insufficient_disk = true; wu_result.infeasible_count++; continue; } - if (infeasible_only && wu_result.infeasible_count==0) { + if (wreq.infeasible_only && wu_result.infeasible_count==0) { continue; } @@ -713,7 +724,7 @@ static void scan_work_array( // don't send if host can't handle it // wu = wu_result.workunit; - if (!wu_is_feasible(wu, reply.host)) { + if (!wu_is_feasible(wu, reply.host, wreq)) { log_messages.printf( SchedMessages::DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n", reply.host.id, wu.id, wu.name @@ -726,7 +737,7 @@ static void scan_work_array( // for 
the client's platform. // Treat the same as the WU being infeasible // - retval = add_wu_to_reply(wu, reply, platform, ss); + retval = add_wu_to_reply(wu, reply, platform, ss, wreq); if (retval) { wu_result.infeasible_count++; continue; @@ -749,7 +760,7 @@ static void scan_work_array( // ****** HERE WE'VE COMMITTED TO SENDING THIS RESULT TO HOST ****** // - disk_available -= wu.rsc_disk_bound; + wreq.disk_available -= wu.rsc_disk_bound; // update the result in DB // @@ -793,10 +804,10 @@ static void scan_work_array( } reply.insert_result(result); - seconds_to_fill -= wu_seconds_filled; + wreq.seconds_to_fill -= wu_seconds_filled; - nresults++; - if (nresults == MAX_WUS_TO_SEND) break; + wreq.nresults++; + if (wreq.nresults == MAX_WUS_TO_SEND) break; } } @@ -805,49 +816,66 @@ int send_work( SCHED_SHMEM& ss ) { int nresults = 0; - double seconds_to_fill; - double disk_available; - bool insufficient_disk = false; + WORK_REQ wreq; - disk_available = max_allowable_disk(reply.user, sreq); + wreq.disk_available = max_allowable_disk(reply.user, sreq); + wreq.insufficient_disk = false; + wreq.insufficient_mem = false; + wreq.insufficient_speed = false; + wreq.no_app_version = false; + // scan_work_array() increments wreq.nresults, so it must start at zero + wreq.nresults = 0; log_messages.printf( SchedMessages::NORMAL, "[HOST#%d] got request for %d seconds of work; available disk %f GB\n", - reply.host.id, sreq.work_req_seconds, disk_available/1e9 + reply.host.id, sreq.work_req_seconds, wreq.disk_available/1e9 ); if (sreq.work_req_seconds <= 0) return 0; - seconds_to_fill = sreq.work_req_seconds; - if (seconds_to_fill > MAX_SECONDS_TO_SEND) { - seconds_to_fill = MAX_SECONDS_TO_SEND; + wreq.seconds_to_fill = sreq.work_req_seconds; + if (wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) { + wreq.seconds_to_fill = MAX_SECONDS_TO_SEND; } - if (seconds_to_fill < MIN_SECONDS_TO_SEND) { - seconds_to_fill = MIN_SECONDS_TO_SEND; + if (wreq.seconds_to_fill < MIN_SECONDS_TO_SEND) { + wreq.seconds_to_fill = MIN_SECONDS_TO_SEND; } // give priority to results that were infeasible for 
some other host // - scan_work_array( - true, seconds_to_fill, disk_available, - nresults, sreq, reply, platform, ss, insufficient_disk - ); - scan_work_array( - false, seconds_to_fill, disk_available, - nresults, sreq, reply, platform, ss, insufficient_disk - ); + wreq.infeasible_only = true; + scan_work_array(wreq, sreq, reply, platform, ss); + + wreq.infeasible_only = false; + scan_work_array(wreq, sreq, reply, platform, ss); log_messages.printf( SchedMessages::NORMAL, "[HOST#%d] Sent %d results\n", - reply.host.id, nresults + reply.host.id, wreq.nresults ); - if (nresults == 0) { + if (wreq.nresults == 0) { strcpy(reply.message, "No work available"); - if (insufficient_disk) { + // append one parenthesized reason for each infeasibility flag raised during the scans + if (wreq.no_app_version) { strcat(reply.message, - " (you may need to increase disk limits in global prefs)" + " (there was work for other platforms)" + ); + } + if (wreq.insufficient_disk) { + strcat(reply.message, + " (there was work but you don't have enough disk space allocated)" + ); + } + if (wreq.insufficient_mem) { + strcat(reply.message, + " (there was work but your computer doesn't have enough memory)" + ); + } + if (wreq.insufficient_speed) { + strcat(reply.message, + " (there was work but your computer would not finish it before it is due)" ); } strcpy(reply.message_priority, "low"); diff --git a/sched/main.C b/sched/main.C index 2a1d043f28..31b93e4788 100644 --- a/sched/main.C +++ b/sched/main.C @@ -57,11 +57,12 @@ int main() { FILE* fin, *fout; int i, retval, pid; char req_path[256], reply_path[256], path[256]; - SCHED_SHMEM* ssp; + SCHED_SHMEM* ssp=0; void* p; unsigned int counter=0; char* code_sign_key; bool found; + bool project_stopped = false; if (!freopen(STDERR_FILENAME, "a", stderr)) { fprintf(stderr, "Can't redirect stderr\n"); @@ -79,45 +80,52 @@ int main() { sprintf(path, "%s/code_sign_public", config.key_dir); retval = read_file_malloc(path, code_sign_key); if (retval) { - log_messages.printf(SchedMessages::CRITICAL, "Can't read code sign key file (%s)\n", path); + 
log_messages.printf(SchedMessages::CRITICAL, + "Can't read code sign key file (%s)\n", path + ); exit(1); } retval = attach_shmem(config.shmem_key, &p); if (retval) { - log_messages.printf(SchedMessages::CRITICAL, "Can't attach shmem (feeder not running?)\n"); - exit(1); - } - ssp = (SCHED_SHMEM*)p; - retval = ssp->verify(); - if (retval) { - log_messages.printf(SchedMessages::CRITICAL, "shmem has wrong struct sizes - recompile\n"); - exit(1); - } + log_messages.printf(SchedMessages::CRITICAL, + "Can't attach shmem (feeder not running?)\n" + ); + project_stopped = true; + } else { + ssp = (SCHED_SHMEM*)p; + retval = ssp->verify(); + if (retval) { + log_messages.printf(SchedMessages::CRITICAL, + "shmem has wrong struct sizes - recompile\n" + ); + exit(1); + } - for (i=0; i<10; i++) { - if (ssp->ready) break; - log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n"); - sleep(1); - } - if (!ssp->ready) { - log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n"); - exit(1); + for (i=0; i<10; i++) { + if (ssp->ready) break; + log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n"); + sleep(1); + } + if (!ssp->ready) { + log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n"); + exit(1); + } } retval = boinc_db.open(config.db_name, config.db_passwd); if (retval) { log_messages.printf(SchedMessages::CRITICAL, "can't open database\n"); - exit(1); - } - - found = false; - while (!gproject.enumerate("")) { - found = true; - } - if (!found) { - log_messages.printf(SchedMessages::CRITICAL, "can't find project\n"); - exit(1); + project_stopped = true; + } else { + found = false; + while (!gproject.enumerate("")) { + found = true; + } + if (!found) { + log_messages.printf(SchedMessages::CRITICAL, "can't find project\n"); + exit(1); + } } pid = getpid(); @@ -126,6 +134,14 @@ int main() { counter++; #endif printf("Content-type: text/plain\n\n"); + if (project_stopped) { + printf( + "\n" + " Project 
is temporarily shut down for maintenance\n" + "\n" + ); + goto done; + } if (use_files) { // the code below is convoluted because, // instead of going from stdin to stdout directly, @@ -167,6 +183,7 @@ int main() { } else { handle_request(stdin, stdout, *ssp, code_sign_key); } +done: #ifdef _USING_FCGI_ } #endif