diff --git a/checkin_notes b/checkin_notes
index 5099f79247..97c9797a9f 100644
--- a/checkin_notes
+++ b/checkin_notes
@@ -3538,3 +3538,18 @@ Rom May 2 2008 (HEAD)
     configure.ac
     version.h
 
+David May 2 2008
+    - scheduler: clean up message log.
+        Merge redundant messages.
+        Condition various messages on config flags.
+    - client (Unix): fix CUDA detection if LD_LIBRARY_PATH is ""
+
+    lib/
+        coproc.C
+    sched/
+        handle_request.C
+        main.C
+        sched_config.C,h
+        sched_result.C
+        sched_send.C
+        server_types.C,h
diff --git a/lib/coproc.C b/lib/coproc.C
index 2860aa1ef9..1bea1b2533 100644
--- a/lib/coproc.C
+++ b/lib/coproc.C
@@ -132,7 +132,7 @@ void COPROC_CUDA::get(COPROCS& coprocs) {
     // Leave it there, shouldn't hurt.
     //
     char* p = getenv("LD_LIBRARY_PATH");
-    if (p) {
+    if (p && strlen(p)) {
         if (!strstr(p, "/usr/local/cuda/lib")) {
             char libpath[8192];
             sprintf(libpath, "%s:/usr/local/cuda/lib", p);
diff --git a/sched/handle_request.C b/sched/handle_request.C
index e8e1a17a08..b4f0f0801b 100644
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@@ -258,10 +258,6 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
     }
     reply.host = host;
 
-    log_messages.printf(MSG_DEBUG,
-        "Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
-        sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
-    );
 
     // look up user based on the ID in host record,
     // and see if the authenticator matches (regular or weak)
@@ -1133,6 +1129,23 @@ void handle_msgs_to_host(SCHEDULER_REPLY& reply) {
     }
 }
 
+static void log_request(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
+    log_messages.printf(MSG_NORMAL,
+        "Request: [USER#%d] [HOST#%d] [IP %s] client %d.%d.%d, work req %d sec\n",
+        reply.user.id, reply.host.id, get_remote_addr(),
+        sreq.core_client_major_version, sreq.core_client_minor_version,
+        sreq.core_client_release,
+        (int)sreq.work_req_seconds
+    );
+    if (config.debug_request_details) {
+        log_messages.printf(MSG_DEBUG,
+            "Request details: auth %s, RPC seqno %d, platform %s\n",
+            sreq.authenticator, sreq.rpc_seqno, sreq.platform.name
+        );
+    }
+    log_messages.set_indent_level(2);
+}
+
 void process_request(
     SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, char* code_sign_key
 ) {
@@ -1215,12 +1228,7 @@ void process_request(
     initial_host = reply.host;
     reply.host.rpc_seqno = sreq.rpc_seqno;
 
-    log_messages.printf(MSG_NORMAL,
-        "Processing request from [USER#%d] [HOST#%d] [IP %s] [RPC#%d] core client version %d.%d.%d\n",
-        reply.user.id, reply.host.id, get_remote_addr(), sreq.rpc_seqno,
-        sreq.core_client_major_version, sreq.core_client_minor_version,
-        sreq.core_client_release
-    );
+    log_request(sreq, reply);
 
     // is host blacklisted?
     //
@@ -1361,6 +1369,33 @@ leave:
     }
 }
 
+static void log_incomplete_request(SCHEDULER_REQUEST& sreq) {
+    // BOINC scheduler requests use method POST.
+    // So method GET means that someone is probably using a browser.
+    //
+    char *rm=getenv("REQUEST_METHOD");
+    bool used_get = false;
+    if (rm && !strcmp(rm, "GET")) {
+        used_get = true;
+    }
+    log_messages.printf(MSG_NORMAL,
+        "Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%d.%d\n",
+        used_get?"(used GET method - probably a browser) ":"",
+        get_remote_addr(), sreq.authenticator, sreq.platform.name,
+        sreq.core_client_major_version, sreq.core_client_minor_version,
+        sreq.core_client_release
+    );
+}
+
+static void log_user_messages(SCHEDULER_REPLY& sreply) {
+    for (unsigned int i=0; iid, srip->name, srip->server_state,
-            srip->outcome, srip->client_state, srip->validate_state,
-            srip->file_delete_state
-        );
+        if (config.debug_handle_results) {
+            log_messages.printf(MSG_DEBUG,
+                "[HOST#%d] [RESULT#%d %s] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
+                reply.host.id, srip->id, srip->name, srip->server_state,
+                srip->outcome, srip->client_state, srip->validate_state,
+                srip->file_delete_state
+            );
+        }
 
         // Do various sanity checks.
         // If one of them fails, set srip->id = 0,
@@ -166,8 +169,9 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
 
         if (srip->received_time) {
             log_messages.printf(MSG_CRITICAL,
-                "[HOST#%d] [RESULT#%d %s] got result twice\n",
-                reply.host.id, srip->id, srip->name
+                "[HOST#%d] [RESULT#%d %s] already got result, at %s \n",
+                reply.host.id, srip->id, srip->name,
+                time_to_string(srip->received_time)
             );
             srip->id = 0;
             reply.result_acks.push_back(std::string(rp->name));
@@ -246,9 +250,11 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
         } else {
            srip->claimed_credit = srip->cpu_time * reply.host.claimed_credit_per_cpu_sec;
         }
-        log_messages.printf(MSG_DEBUG,
-            "cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.claimed_credit_per_cpu_sec, srip->claimed_credit
-        );
+        if (config.debug_handle_results) {
+            log_messages.printf(MSG_DEBUG,
+                "cpu time %f credit/sec %f, claimed credit %f\n", srip->cpu_time, reply.host.claimed_credit_per_cpu_sec, srip->claimed_credit
+            );
+        }
 
         srip->server_state = RESULT_SERVER_STATE_OVER;
         strlcpy(srip->stderr_out, rp->stderr_out, sizeof(srip->stderr_out));
@@ -262,16 +268,20 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
         if ((srip->client_state == RESULT_FILES_UPLOADED)
             && (srip->exit_status == 0)) {
             srip->outcome = RESULT_OUTCOME_SUCCESS;
-            log_messages.printf(MSG_DEBUG,
-                "[RESULT#%d %s]: setting outcome SUCCESS\n",
-                srip->id, srip->name
-            );
+            if (config.debug_handle_results) {
+                log_messages.printf(MSG_DEBUG,
+                    "[RESULT#%d %s]: setting outcome SUCCESS\n",
+                    srip->id, srip->name
+                );
+            }
             reply.got_good_result();
         } else {
-            log_messages.printf(MSG_DEBUG,
-                "[RESULT#%d %s]: client_state %d exit_status %d; setting outcome ERROR\n",
-                srip->id, srip->name, srip->client_state, srip->exit_status
-            );
+            if (config.debug_handle_results) {
+                log_messages.printf(MSG_DEBUG,
+                    "[RESULT#%d %s]: client_state %d exit_status %d; setting outcome ERROR\n",
+                    srip->id, srip->name, srip->client_state, srip->exit_status
+                );
+            }
             srip->outcome = RESULT_OUTCOME_CLIENT_ERROR;
             srip->validate_state = VALIDATE_STATE_INVALID;
             reply.got_bad_result();
diff --git a/sched/sched_send.C b/sched/sched_send.C
index 69c69e105b..14ed31df14 100644
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@@ -330,18 +330,8 @@ double estimate_cpu_duration(WORKUNIT& wu, SCHEDULER_REPLY& reply) {
 static double estimate_wallclock_duration(
     WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
 ) {
-    double running_frac;
-    if (reply.wreq.core_client_version<=419) {
-        running_frac = reply.host.on_frac;
-    } else {
-        running_frac = reply.host.active_frac * reply.host.on_frac;
-    }
-    if (running_frac < HOST_ACTIVE_FRAC_MIN) {
-        running_frac = HOST_ACTIVE_FRAC_MIN;
-    }
-    if (running_frac > 1) running_frac = 1;
     double ecd = estimate_cpu_duration(wu, reply);
-    double ewd = ecd/running_frac;
+    double ewd = ecd/reply.wreq.running_frac;
     if (reply.host.duration_correction_factor) {
         ewd *= reply.host.duration_correction_factor;
     }
@@ -350,8 +340,8 @@ static double estimate_wallclock_duration(
     }
     if (config.debug_send) {
         log_messages.printf(MSG_DEBUG,
-            "est cpu dur %f; running_frac %f; est %f\n",
-            ecd, running_frac, ewd
+            "est cpu dur %f; est wall dur %f\n",
+            ecd, ewd
         );
     }
     return ewd;
 }
@@ -401,13 +391,6 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
     double expavg_time = reply.host.expavg_time;
     double avg_turnaround = reply.host.avg_turnaround;
     update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
-    if (config.debug_send) {
-        log_messages.printf(MSG_DEBUG,
-            "[HOST#%d] Checking if reliable (OS = %s) error_rate = %.3f avg_turnaround = %.0f hrs\n",
-            reply.host.id, reply.host.os_name, reply.host.error_rate,
-            reply.host.avg_turnaround/3600
-        );
-    }
 
     // Platforms other then Windows, Linux and Intel Macs need a
     // larger set of computers to be marked reliable
@@ -428,13 +411,15 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
         && (config.daily_result_quota == 0 || reply.host.max_results_day >= config.daily_result_quota)
     ) {
         reply.wreq.host_info.reliable = true;
-        if (config.debug_send) {
-            log_messages.printf(MSG_DEBUG,
-                "[HOST#%d] is reliable (OS = %s) error_rate = %.3f avg_turnaround(hours) = %.0f \n",
-                reply.host.id, reply.host.os_name, reply.host.error_rate,
-                reply.host.avg_turnaround/3600
-            );
-        }
+    }
+    if (config.debug_send) {
+        log_messages.printf(MSG_DEBUG,
+            "[HOST#%d] is%s reliable (OS = %s) error_rate = %.3f avg_turn_hrs = %.0f \n",
+            reply.host.id,
+            reply.wreq.host_info.reliable?"":" not",
+            reply.host.os_name, reply.host.error_rate,
+            reply.host.avg_turnaround/3600
+        );
     }
     return 0;
 }
@@ -563,9 +548,9 @@ static inline int check_deadline(
     if (diff > 0) {
         if (config.debug_send) {
             log_messages.printf(MSG_DEBUG,
-                "[WU#%d %s] needs %d seconds on [HOST#%d]; delay_bound is %d (estimated_delay is %f)\n",
-                wu.id, wu.name, (int)ewd, reply.host.id, wu.delay_bound,
-                request.estimated_delay
+                "[WU#%d %s] est report delay %d on [HOST#%d]; delay_bound is %d\n",
+                wu.id, wu.name, (int)est_report_delay,
+                reply.host.id, wu.delay_bound
             );
         }
         reply.wreq.speed.set_insufficient(diff);
@@ -1201,7 +1186,21 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
     }
 }
 
+static void get_running_frac(SCHEDULER_REPLY& reply) {
+    if (reply.wreq.core_client_version<=419) {
+        reply.wreq.running_frac = reply.host.on_frac;
+    } else {
+        reply.wreq.running_frac = reply.host.active_frac * reply.host.on_frac;
+    }
+    if (reply.wreq.running_frac < HOST_ACTIVE_FRAC_MIN) {
+        reply.wreq.running_frac = HOST_ACTIVE_FRAC_MIN;
+    }
+    if (reply.wreq.running_frac > 1) reply.wreq.running_frac = 1;
+}
+
 void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
+    if (sreq.work_req_seconds <= 0) return;
+
     reply.wreq.core_client_version = sreq.core_client_major_version*100
         + sreq.core_client_minor_version;
     reply.wreq.disk_available = max_allowable_disk(sreq, reply);
@@ -1214,18 +1213,33 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
     }
 
     get_host_info(reply);    // parse project prefs for app details
-    reply.wreq.beta_only = false;
-    reply.wreq.user_apps_only = true;
+
+    get_running_frac(reply);
 
     if (config.debug_send) {
         log_messages.printf(MSG_DEBUG,
-            "[HOST#%d] got request for %f seconds of work; available disk %f GB\n",
-            reply.host.id, sreq.work_req_seconds, reply.wreq.disk_available/1e9
+            "%s matchmaking scheduling; %s EDF sim\n",
+#ifdef MATCHMAKER
+            "Using",
+#else
+            "Not using",
+#endif
+            config.workload_sim?"Using":"Not using"
+        );
+        log_messages.printf(MSG_DEBUG,
+            "available disk %f GB, work_buf_min %d\n",
+            reply.wreq.disk_available/1e9,
+            (int)sreq.global_prefs.work_buf_min()
+        );
+        log_messages.printf(MSG_DEBUG,
+            "running frac %f DCF %f CPU effic %f est delay %d\n",
+            reply.wreq.running_frac,
+            reply.host.duration_correction_factor,
+            reply.host.cpu_efficiency,
+            (int)sreq.estimated_delay
         );
     }
 
-    if (sreq.work_req_seconds <= 0) return;
-
     reply.wreq.seconds_to_fill = sreq.work_req_seconds;
     if (reply.wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
         reply.wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
     }
@@ -1258,6 +1272,9 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
 #ifdef MATCHMAKER
     send_work_matchmaker(sreq, reply);
 #else
+    reply.wreq.beta_only = false;
+    reply.wreq.user_apps_only = true;
+
     // give top priority to results that require a 'reliable host'
     //
     if (reply.wreq.host_info.reliable) {
@@ -1308,11 +1325,6 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
 #endif
     }
 
-    log_messages.printf(MSG_NORMAL,
-        "[HOST#%d] Sent %d results [scheduler ran %f seconds]\n",
-        reply.host.id, reply.wreq.nresults, elapsed_wallclock_time()
-    );
-
     explain_to_user(reply);
 }
 
@@ -1635,9 +1647,11 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
         JOB job;
         job.index = i;
         job.get_score(sreq, reply);
-        log_messages.printf(MSG_NORMAL,
-            "score for %s: %f\n", wu_result.workunit.name, job.score
-        );
+        if (config.debug_send) {
+            log_messages.printf(MSG_DEBUG,
+                "score for %s: %f\n", wu_result.workunit.name, job.score
+            );
+        }
         if (job.score > jobs.lowest_score()) {
             ssp->wu_results[i].state = pid;
             unlock_sema();
diff --git a/sched/server_types.C b/sched/server_types.C
index d40a47784e..ffa6176240 100644
--- a/sched/server_types.C
+++ b/sched/server_types.C
@@ -496,7 +496,6 @@ SCHEDULER_REPLY::SCHEDULER_REPLY() {
     memset(&team, 0, sizeof(team));
     nucleus_only = false;
     project_is_down = false;
-    probable_user_browser = false;
     send_msg_ack = false;
     strcpy(email_hash, "");
 }
@@ -543,10 +542,12 @@ int SCHEDULER_REPLY::write(FILE* fout) {
             request_delay=min_delay_needed;
         }
         fprintf(fout, "%f\n", request_delay);
-        log_messages.printf(MSG_NORMAL,
-            "sending delay request %f\n", request_delay
-        );
     }
+    log_messages.printf(MSG_NORMAL,
+        "Sending reply to [HOST#%d]: %d results, delay req %f [scheduler ran %f seconds]\n",
+        host.id, wreq.nresults, request_delay, elapsed_wallclock_time()
+    );
+
     if (wreq.core_client_version <= 419) {
         std::string msg;
         std::string pri = "low";
@@ -940,6 +941,7 @@ int RESULT::parse_from_client(FILE* fin) {
         }
         if (match_tag(buf, "")) continue;
         if (match_tag(buf, "")) continue;
+        if (match_tag(buf, "")) continue;
         if (match_tag(buf, "")) continue;
         if (match_tag(buf, "")) continue;
         if (match_tag(buf, "")) continue;
diff --git a/sched/server_types.h b/sched/server_types.h
index 3dedd1bc22..b9da154b24 100644
--- a/sched/server_types.h
+++ b/sched/server_types.h
@@ -110,6 +110,7 @@ struct WORK_REQ {
     double disk_available;
     int nresults;
     int core_client_version;
+    double running_frac;
     RESOURCE disk;
     RESOURCE mem;
@@ -301,7 +302,6 @@ struct SCHEDULER_REPLY {
     int lockfile_fd;     // file descriptor of lockfile, or -1 if no lock.
     bool send_global_prefs;
     bool nucleus_only;   // send only message
-    bool probable_user_browser;
     USER user;
     char email_hash[MD5_LEN];
     HOST host;           // after validation, contains full host rec
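
Illustrative sketch (not part of the patch above): the lib/coproc.C hunk changes the guard from "if (p)" to "if (p && strlen(p))", so an LD_LIBRARY_PATH that is set but empty takes the same branch as an unset one instead of being prepended, which with the old guard produced a value starting with an empty path component (":/usr/local/cuda/lib"). The helper name build_cuda_libpath() below is hypothetical and exists only for this example; the rest of COPROC_CUDA::get() is not shown in the hunk.

/* sketch.c - standalone demonstration of the guard, no BOINC headers needed */
#include <stdio.h>
#include <string.h>

#define CUDA_LIB_DIR "/usr/local/cuda/lib"

/* Build the LD_LIBRARY_PATH value to use for CUDA detection.
 * p is the current value of the variable, or NULL if it is unset. */
static void build_cuda_libpath(const char* p, char* out, size_t outlen) {
    if (p && strlen(p)) {
        /* non-empty value: keep it and append the CUDA lib dir */
        snprintf(out, outlen, "%s:%s", p, CUDA_LIB_DIR);
    } else {
        /* unset or empty: use the CUDA lib dir alone */
        snprintf(out, outlen, "%s", CUDA_LIB_DIR);
    }
}

int main(void) {
    char buf[8192];

    build_cuda_libpath(NULL, buf, sizeof(buf));
    printf("unset:  %s\n", buf);   /* /usr/local/cuda/lib */

    build_cuda_libpath("", buf, sizeof(buf));
    printf("empty:  %s\n", buf);   /* /usr/local/cuda/lib; the old guard would
                                      have produced :/usr/local/cuda/lib */

    build_cuda_libpath("/opt/lib", buf, sizeof(buf));
    printf("normal: %s\n", buf);   /* /opt/lib:/usr/local/cuda/lib */

    return 0;
}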