From 58f5b862884f13e453f1eb6cfe2a9c8b1741d516 Mon Sep 17 00:00:00 2001 From: Bruce Allen Date: Sat, 19 Mar 2005 18:24:24 +0000 Subject: [PATCH] - Overdue checkins on a bunch of minor scheduler code. Some that is E@H specific is now included (but protected by #ifdef EINSTEIN_AT_HOME) to make it simpler for me to maintain consistency with BOINC cvs. - Added project-specific unacceptable_os() function for rejecting hosts. - Transitioner and scheduler now initialize host.max_results_day correctly in database under all circumstances. - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and properly redirected. This was broken. David, please see comment near one of the probable_user_browser=true in handle_request.C. I think something is wrong here (or I am missing the point!). - More info about requests is logged. - If the scheduler hangs (incoming request incomplete) it will normally be killed by Apache after a timeout. But this happens silently. So I now install a signal handler and catch this SIGTERM. In this case an error message is logged and all open files are flushed before exit(1)ing. - If IO is passed through files, check that request length and content length agree and log a message if they do NOT. - active_frac not correctly reported by 4.19 and earlier core clients. Adjust for this in estimating wallclock execution times. - Added a small block into validator code to attach a debugger. 
svn path=/trunk/boinc/; revision=5688 --- checkin_notes | 31 ++++++++++++++ sched/handle_request.C | 97 ++++++++++++++++++++++++++++++++++++------ sched/main.C | 70 +++++++++++++++++++++++++++++- sched/sched_send.C | 34 ++++++++++----- sched/transitioner.C | 3 ++ 5 files changed, 210 insertions(+), 25 deletions(-) diff --git a/checkin_notes b/checkin_notes index 0df76bc193..29619c92a9 100755 --- a/checkin_notes +++ b/checkin_notes @@ -26028,3 +26028,34 @@ David 19 Mar 2005 license_1.0.txt (removed) win_build/installerv2/redist/0409/ eula.rtf + +Bruce 19 Mar 2005 + - Overdue checkins on a bunch of minor scheduler code. Some that is + E@H specific is now included (but protected by + #ifdef EINSTEIN_AT_HOME + to make it simpler for me to maintain consistency with BOINC cvs. + - Added project-specific unacceptable_os() function for rejecting hosts. + - Transitioner and scheduler now initalize host.max_results_day correctly + in database under all circumstances. + - Browser requests are now correctly identifed (REQUEST_METHO=="GET") and + properly redirected. This was broken. David, please see comment near + one of the probable_user_browser=true in handle_request.C. I think + something is wrong here (or I am missing the point!). + - More info about requests is logged + - If the scheduler hangs (incoming request incomplete) it will normally be + killed by Apache after a timeout. But this happens silently. So I now + install a signal handler and catch this SIGTERM. In this case an + error message is logged and all open files are flushed before exit(1)ing. + - If IO is passed through files, check that request length and content length + agree and log a message if they do NOT. + - active_frac not correctly reported by 4.19 and earlier core clients. + Adjust for this in estimating wallclock execution times. + - Added a small block into validator code to attach a debugger. 
+ + sched/ + handle_request.C + sched_send.C + main.C + transitioner.C + validator.C + diff --git a/sched/handle_request.C b/sched/handle_request.C index 585709f125..f96bae55be 100644 --- a/sched/handle_request.C +++ b/sched/handle_request.C @@ -99,6 +99,11 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { goto lookup_user_and_make_new_host; } reply.host = host; + log_messages.printf( + SCHED_MSG_LOG::DEBUG, + "Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n", + sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno + ); strlcpy( user.authenticator, sreq.authenticator, @@ -513,7 +518,7 @@ int handle_results( srip->exit_status = rp->exit_status; srip->app_version_num = rp->app_version_num; srip->claimed_credit = rp->cpu_time * reply.host.credit_per_cpu_sec; -#if 1 +#ifdef EINSTEIN_AT_HOME log_messages.printf(SCHED_MSG_LOG::DEBUG, "cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.credit_per_cpu_sec, srip->claimed_credit ); @@ -715,6 +720,41 @@ void warn_user_if_core_client_upgrade_scheduled( return; } +#ifdef EINSTEIN_AT_HOME +bool unacceptable_os( + SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply +) { + log_messages.printf( + SCHED_MSG_LOG::NORMAL, + "OS version %s %s\n", + sreq.host.os_name, sreq.host.os_version + ); + + if (!strcmp(sreq.host.os_name, "Darwin") && + (!strncmp(sreq.host.os_version, "5.", 2) || + !strncmp(sreq.host.os_version, "6.", 2) + ) + ) { + log_messages.printf( + SCHED_MSG_LOG::NORMAL, + "Unacceptable OS %s %s\n", + sreq.host.os_name, sreq.host.os_version + ); + USER_MESSAGE um("Project only supports MacOS Darwin versions 7.X and above", + "low"); + reply.insert_message(um); + reply.set_delay(3600*24); + return true; + } + return false; +} +#else +bool unacceptable_os( + SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply +) { + return false; +} +#endif // EINSTEIN_AT_HOME bool wrong_core_client_version( SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply @@ -725,9 +765,7 @@ bool 
wrong_core_client_version( // TODO: check for user-agent not empty and not BOINC wrong_version = true; sprintf(msg, - "To participate in this project, " - "you must use major version %d of the BOINC core client. " - "Your core client is major version %d.", + "Need major version %d of the BOINC core client. You have %d.", BOINC_MAJOR_VERSION, sreq.core_client_major_version ); @@ -743,9 +781,7 @@ bool wrong_core_client_version( if (sreq.core_client_minor_version < minor) { wrong_version = true; sprintf(msg, - "To participate in this project, " - "you must use version %d.%02d or higher of the BOINC core client. " - "Your core client is version %d.%02d.", + "Need version %d.%02d or higher of the BOINC core client. You have %d.%02d.", major, minor, sreq.core_client_major_version, sreq.core_client_minor_version ); @@ -760,6 +796,7 @@ bool wrong_core_client_version( if (wrong_version) { USER_MESSAGE um(msg, "low"); reply.insert_message(um); + // IS THE FOLLOWING LINE CORRECT? I DON'T UNDERSTAND IT. 
Bruce reply.probable_user_browser = true; reply.set_delay(3600*24); return true; @@ -852,7 +889,7 @@ void process_request( // if different major version of BOINC, just send a message // - if (wrong_core_client_version(sreq, reply)) { + if (wrong_core_client_version(sreq, reply) || unacceptable_os(sreq, reply)) { ok_to_send_work = false; // if no results, return without accessing DB @@ -863,7 +900,7 @@ void process_request( } else { warn_user_if_core_client_upgrade_scheduled(sreq, reply); } - + if (config.locality_scheduling) { have_no_work = false; } else { @@ -927,6 +964,13 @@ void process_request( rpc_time_tm = localtime((const time_t*)&reply.host.rpc_time); current_rpc_dayofyear = rpc_time_tm->tm_yday; + if (config.daily_result_quota) { + if (reply.host.max_results_day <= 0 || reply.host.max_results_day > config.daily_result_quota) { + reply.host.max_results_day = config.daily_result_quota; + log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Initializing max_results_day to %d\n", reply.host.id, config.daily_result_quota); + } + } + if (last_rpc_dayofyear != current_rpc_dayofyear) { log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Resetting nresults_today\n", reply.host.id); reply.host.nresults_today = 0; @@ -1139,19 +1183,38 @@ void handle_request( ); process_request(sreq, sreply, ss, code_sign_key); } else { + // BOINC scheduler requests use method POST. So method GET + // means that someone is trying a browser. 
+ char *rm=getenv("REQUEST_METHOD"); + if (rm && !strcmp(rm, "GET")) sreply.probable_user_browser=true; + log_messages.printf( SCHED_MSG_LOG::NORMAL, - "Incomplete request received from IP %s, auth %s, platform %s, version %d.%02d\n", + "Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%02d\n", + sreply.probable_user_browser?"(probably a browser) ":"", get_remote_addr(), sreq.authenticator, sreq.platform_name, sreq.core_client_major_version, sreq.core_client_minor_version ); + USER_MESSAGE um("Incomplete request received.", "low"); sreply.insert_message(um); sreply.nucleus_only = true; } + +#ifdef EINSTEIN_AT_HOME + // for testing + if (sreply.user.id==3) { + USER_MESSAGE um("THIS IS A SHORT MESSAGE. \n AND ANOTHER", "high"); + // USER_MESSAGE um("THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE. \n" + // "THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "low"); + sreply.insert_message(um); + // USER_MESSAGE um2("THIS IS A VERY LONG TEST MESSAGE2. THIS IS A VERY LONG TEST MESSAGE. \n" + // "THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "high"); + // sreply.insert_message(um2); + } +#endif // if we got no work, and we have no file space, delete some files - // if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) { // try to delete a file to make more space. // Also give some hints to the user about what's going wrong @@ -1160,7 +1223,17 @@ void handle_request( delete_file_from_host(sreq, sreply); } -#if 1 + // write all messages to log file + for (unsigned int i=0; i +#include +#include #include #include #include @@ -95,6 +97,48 @@ int open_database() { return 0; } +void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger); + +// If the scheduler 'hangs', which it can do if a request is not fully processed +// or some other process arises, then Apache will send a SIGTERM to the cgi. 
+// This signal handler ensures that rather than dying silently, the cgi process +// will leave behind some record in the log file. +// +void sigterm_handler(int signo) { + log_messages.printf(SCHED_MSG_LOG::CRITICAL, + "BOINC scheduler (pid=%d) caught signal %d. Exit(1)ing\n", + (int)getpid(), signo + ); + fflush(NULL); + exit(1); + return; +} + +void log_request_info(int& length) { + char *cl=getenv("CONTENT_LENGTH"); + char *ri=getenv("REMOTE_ADDR"); + char *rm=getenv("REQUEST_METHOD"); + char *ct=getenv("CONTENT_TYPE"); + char *ha=getenv("HTTP_ACCEPT"); + char *hu=getenv("HTTP_USER_AGENT"); + + log_messages.printf(SCHED_MSG_LOG::DEBUG, + "REQUEST_METHOD=%s " + "CONTENT_TYPE=%s " + "HTTP_ACCEPT=%s " + "HTTP_USER_AGENT=%s\n", + rm?rm:"" , ct?ct:"", ha?ha:"", hu?hu:"" + ); + + if (!cl) { + log_messages.printf(SCHED_MSG_LOG::CRITICAL, "CONTENT_LENGTH environment variable not set\n"); + } + else { + length=atoi(cl); + log_messages.printf(SCHED_MSG_LOG::DEBUG, "CONTENT_LENGTH=%d from %s\n", length, ri?ri:"[Unknown]"); + } +} + int main() { FILE* fin, *fout; int i, retval; @@ -104,6 +148,12 @@ int main() { unsigned int counter=0; char* code_sign_key; bool project_stopped = false; + int length=-1; + + // install a signal handler that catches SIGTERMS sent by Apache if the cgi + // times out. 
+ // + signal(SIGTERM, sigterm_handler); #ifndef _USING_FCGI_ char *stderr_buffer, buf[256]; @@ -235,8 +285,11 @@ int main() { send_message("Project is temporarily shut down for maintenance", 3600); goto done; } + log_request_info(length); fprintf(stdout,"Content-type: text/plain\n\n"); + if (use_files) { + struct stat statbuf; // the code below is convoluted because, // instead of going from stdin to stdout directly, // we go via a pair of disk files @@ -252,6 +305,14 @@ int main() { } copy_stream(stdin, fout); fclose(fout); + stat(req_path, &statbuf); + if (length>=0 && (statbuf.st_size != length)) { + log_messages.printf(SCHED_MSG_LOG::CRITICAL, + "Request length %d != CONTENT_LENGTH %d\n", + (int)statbuf.st_size, length + ); + } + fin = fopen(req_path, "r"); if (!fin) { log_messages.printf(SCHED_MSG_LOG::CRITICAL, "can't read request file\n"); @@ -273,8 +334,13 @@ int main() { } copy_stream(fin, stdout); fclose(fin); - //unlink(req_path); - //unlink(reply_path); +#ifdef EINSTEIN_AT_HOME + if (getenv("CONTENT_LENGTH")) unlink(req_path); + if (getenv("CONTENT_LENGTH")) unlink(reply_path); +#else + // unlink(req_path); + // unlink(reply_path); +#endif } else { handle_request(stdin, stdout, *ssp, code_sign_key); } diff --git a/sched/sched_send.C b/sched/sched_send.C index 866c502dd6..b76276d579 100644 --- a/sched/sched_send.C +++ b/sched/sched_send.C @@ -178,14 +178,19 @@ static double estimate_wallclock_duration( WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply ) { double running_frac; - running_frac = reply.host.active_frac * reply.host.on_frac; + if (reply.wreq.core_client_version<=419) { + running_frac = reply.host.on_frac; + } + else { + running_frac = reply.host.active_frac * reply.host.on_frac; + } if (running_frac < HOST_ACTIVE_FRAC_MIN) { running_frac = HOST_ACTIVE_FRAC_MIN; } if (running_frac > 1) running_frac = 1; double ecd = estimate_cpu_duration(wu, reply); double ewd = ecd/(running_frac*request.resource_share_fraction); -#if 0 +#ifdef 
EINSTEIN_AT_HOME log_messages.printf( SCHED_MSG_LOG::DEBUG, "est cpu dur %f; running_frac %f; rsf %f; est %f\n", ecd, running_frac, request.resource_share_fraction, ewd @@ -533,15 +538,13 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) { return false; } if (wreq.nresults >= config.max_wus_to_send) return false; - if (config.daily_result_quota) { - if (host.max_results_day == 0) { - host.max_results_day = config.daily_result_quota; - } + if (config.daily_result_quota) { // scale daily quota by #CPUs, up to a limit of 4 // int ncpus = host.p_ncpus; if (ncpus > 4) ncpus = 4; + if (ncpus < 1) ncpus = 1; wreq.daily_result_quota = ncpus*host.max_results_day; if (host.nresults_today >= wreq.daily_result_quota) { wreq.daily_result_quota_exceeded = true; @@ -903,11 +906,20 @@ int send_work( } if (reply.wreq.insufficient_speed) { char helpful[512]; - sprintf(helpful, - "(won't finish in time) " - "Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that", - 100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction - ); + if (reply.wreq.core_client_version>419) { + sprintf(helpful, + "(won't finish in time) " + "Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that", + 100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction + ); + } + else { + sprintf(helpful, + "(won't finish in time) " + "Computer available %.1f%% of time, this project gets %.1f%% of that", + 100.0*reply.host.on_frac, 100.0*sreq.resource_share_fraction + ); + } USER_MESSAGE um(helpful, "high"); reply.insert_message(um); } diff --git a/sched/transitioner.C b/sched/transitioner.C index a5e546231e..ea98c02e46 100644 --- a/sched/transitioner.C +++ b/sched/transitioner.C @@ -70,6 +70,9 @@ int penalize_host(int hostid, double delay_bound) { int retval = host.lookup_id(hostid); if (retval) return retval; compute_avg_turnaround(host, delay_bound); + if (host.max_results_day <= 0 
|| host.max_results_day > config.daily_result_quota) { + host.max_results_day = config.daily_result_quota; + } host.max_results_day -= 1; if (host.max_results_day < 1) { host.max_results_day = 1;