- scheduler: clean up message log.

Merge redundant messages.
    Condition various messages on config flags.
- client (Unix) fix to CUDA detection if LD_LIBRARY_PATH is ""

svn path=/trunk/boinc/; revision=15122
This commit is contained in:
David Anderson 2008-05-02 17:48:29 +00:00
parent 2411314ddc
commit 6e6fab3e7c
10 changed files with 170 additions and 112 deletions

View File

@ -3538,3 +3538,18 @@ Rom May 2 2008 (HEAD)
configure.ac
version.h
David May 2 2008
- scheduler: clean up message log.
Merge redundant messages.
Condition various messages on config flags.
- client (Unix) fix to CUDA detection if LD_LIBRARY_PATH is ""
lib/
coproc.C
sched/
handle_request.C
main.C
sched_config.C,h
sched_result.C
sched_send.C
server_types.C,h

View File

@ -132,7 +132,7 @@ void COPROC_CUDA::get(COPROCS& coprocs) {
// Leave it there, shouldn't hurt.
//
char* p = getenv("LD_LIBRARY_PATH");
if (p) {
if (p && strlen(p)) {
if (!strstr(p, "/usr/local/cuda/lib")) {
char libpath[8192];
sprintf(libpath, "%s:/usr/local/cuda/lib", p);

View File

@ -258,10 +258,6 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
}
reply.host = host;
log_messages.printf(MSG_DEBUG,
"Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
);
// look up user based on the ID in host record,
// and see if the authenticator matches (regular or weak)
@ -1133,6 +1129,23 @@ void handle_msgs_to_host(SCHEDULER_REPLY& reply) {
}
}
static void log_request(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
log_messages.printf(MSG_NORMAL,
"Request: [USER#%d] [HOST#%d] [IP %s] client %d.%d.%d, work req %d sec\n",
reply.user.id, reply.host.id, get_remote_addr(),
sreq.core_client_major_version, sreq.core_client_minor_version,
sreq.core_client_release,
(int)sreq.work_req_seconds
);
if (config.debug_request_details) {
log_messages.printf(MSG_DEBUG,
"Request details: auth %s, RPC seqno %d, platform %s\n",
sreq.authenticator, sreq.rpc_seqno, sreq.platform.name
);
}
log_messages.set_indent_level(2);
}
void process_request(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, char* code_sign_key
) {
@ -1215,12 +1228,7 @@ void process_request(
initial_host = reply.host;
reply.host.rpc_seqno = sreq.rpc_seqno;
log_messages.printf(MSG_NORMAL,
"Processing request from [USER#%d] [HOST#%d] [IP %s] [RPC#%d] core client version %d.%d.%d\n",
reply.user.id, reply.host.id, get_remote_addr(), sreq.rpc_seqno,
sreq.core_client_major_version, sreq.core_client_minor_version,
sreq.core_client_release
);
log_request(sreq, reply);
// is host blacklisted?
//
@ -1361,6 +1369,33 @@ leave:
}
}
static void log_incomplete_request(SCHEDULER_REQUEST& sreq) {
// BOINC scheduler requests use method POST.
// So method GET means that someone is trying a browser.
//
char *rm=getenv("REQUEST_METHOD");
bool used_get = false;
if (rm && !strcmp(rm, "GET")) {
used_get = true;
}
log_messages.printf(MSG_NORMAL,
"Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%d.%d\n",
used_get?"(used GET method - probably a browser) ":"",
get_remote_addr(), sreq.authenticator, sreq.platform.name,
sreq.core_client_major_version, sreq.core_client_minor_version,
sreq.core_client_release
);
}
static void log_user_messages(SCHEDULER_REPLY& sreply) {
for (unsigned int i=0; i<sreply.messages.size(); i++) {
USER_MESSAGE um = sreply.messages[i];
log_messages.printf(MSG_DEBUG,
"[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
);
}
}
void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
SCHEDULER_REQUEST sreq;
SCHEDULER_REPLY sreply;
@ -1370,32 +1405,9 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
log_messages.set_indent_level(1);
if (sreq.parse(fin) == 0){
log_messages.printf(MSG_NORMAL,
"Handling request: IP %s, auth %s, host %d, platform %s, version %d.%d.%d, work req %d sec\n",
get_remote_addr(), sreq.authenticator, sreq.hostid, sreq.platform.name,
sreq.core_client_major_version, sreq.core_client_minor_version,
sreq.core_client_release,
(int)sreq.work_req_seconds
);
process_request(sreq, sreply, code_sign_key);
log_messages.set_indent_level(2);
} else {
// BOINC scheduler requests use method POST.
// So method GET means that someone is trying a browser.
//
char *rm=getenv("REQUEST_METHOD");
if (rm && !strcmp(rm, "GET")) {
sreply.probable_user_browser=true;
}
log_messages.set_indent_level(2);
log_messages.printf(MSG_NORMAL,
"Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%d.%d\n",
sreply.probable_user_browser?"(probably a browser) ":"",
get_remote_addr(), sreq.authenticator, sreq.platform.name,
sreq.core_client_major_version, sreq.core_client_minor_version,
sreq.core_client_release
);
log_incomplete_request(sreq);
USER_MESSAGE um("Incomplete request received.", "low");
sreply.insert_message(um);
sreply.nucleus_only = true;
@ -1405,13 +1417,8 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
send_file_deletes(sreq, sreply);
}
// write all messages to log file
//
for (unsigned int i=0; i<sreply.messages.size(); i++) {
USER_MESSAGE um = sreply.messages[i];
log_messages.printf(MSG_DEBUG,
"[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
);
if (config.debug_user_messages) {
log_user_messages(sreply);
}
sreply.write(fout);

View File

@ -186,7 +186,7 @@ void sigterm_handler(int signo) {
return;
}
void log_request_info(int& length) {
static void log_request_headers(int& length) {
char *cl=getenv("CONTENT_LENGTH");
char *ri=getenv("REMOTE_ADDR");
char *rm=getenv("REQUEST_METHOD");
@ -399,7 +399,9 @@ int main(int argc, char** argv) {
counter++;
log_messages.set_indent_level(0);
#endif
log_request_info(length);
if (config.debug_request_headers) {
log_request_headers(length);
}
if (check_stop_sched()) {
send_message("Project is temporarily shut down for maintenance", 3600);

View File

@ -187,6 +187,10 @@ int SCHED_CONFIG::parse(FILE* f) {
if (xp.parse_bool(tag, "debug_assignment", debug_assignment)) continue;
if (xp.parse_bool(tag, "debug_prefs", debug_prefs)) continue;
if (xp.parse_bool(tag, "debug_send", debug_send)) continue;
if (xp.parse_bool(tag, "debug_request_headers", debug_request_headers)) continue;
if (xp.parse_bool(tag, "debug_user_messages", debug_user_messages)) continue;
if (xp.parse_bool(tag, "debug_request_details", debug_request_details)) continue;
if (xp.parse_bool(tag, "debug_handle_results", debug_handle_results)) continue;
// don't complain about unparsed XML;
// there are lots of tags the scheduler doesn't know about

View File

@ -126,6 +126,10 @@ public:
bool debug_assignment;
bool debug_prefs;
bool debug_send;
bool debug_request_headers;
bool debug_user_messages;
bool debug_request_details;
bool debug_handle_results;
int parse(FILE*);
int parse_file(const char* dir=".");

View File

@ -24,6 +24,7 @@
#include "server_types.h"
#include "sched_msgs.h"
#include "sched_util.h"
#include "sched_config.h"
#include "sched_result.h"
@ -86,12 +87,14 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
continue;
}
log_messages.printf(MSG_NORMAL,
"[HOST#%d] [RESULT#%d %s] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
reply.host.id, srip->id, srip->name, srip->server_state,
srip->outcome, srip->client_state, srip->validate_state,
srip->file_delete_state
);
if (config.debug_handle_results) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] [RESULT#%d %s] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
reply.host.id, srip->id, srip->name, srip->server_state,
srip->outcome, srip->client_state, srip->validate_state,
srip->file_delete_state
);
}
// Do various sanity checks.
// If one of them fails, set srip->id = 0,
@ -166,8 +169,9 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if (srip->received_time) {
log_messages.printf(MSG_CRITICAL,
"[HOST#%d] [RESULT#%d %s] got result twice\n",
reply.host.id, srip->id, srip->name
"[HOST#%d] [RESULT#%d %s] already got result, at %s \n",
reply.host.id, srip->id, srip->name,
time_to_string(srip->received_time)
);
srip->id = 0;
reply.result_acks.push_back(std::string(rp->name));
@ -246,9 +250,11 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
} else {
srip->claimed_credit = srip->cpu_time * reply.host.claimed_credit_per_cpu_sec;
}
log_messages.printf(MSG_DEBUG,
"cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.claimed_credit_per_cpu_sec, srip->claimed_credit
);
if (config.debug_handle_results) {
log_messages.printf(MSG_DEBUG,
"cpu time %f credit/sec %f, claimed credit %f\n", srip->cpu_time, reply.host.claimed_credit_per_cpu_sec, srip->claimed_credit
);
}
srip->server_state = RESULT_SERVER_STATE_OVER;
strlcpy(srip->stderr_out, rp->stderr_out, sizeof(srip->stderr_out));
@ -262,16 +268,20 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if ((srip->client_state == RESULT_FILES_UPLOADED) && (srip->exit_status == 0)) {
srip->outcome = RESULT_OUTCOME_SUCCESS;
log_messages.printf(MSG_DEBUG,
"[RESULT#%d %s]: setting outcome SUCCESS\n",
srip->id, srip->name
);
if (config.debug_handle_results) {
log_messages.printf(MSG_DEBUG,
"[RESULT#%d %s]: setting outcome SUCCESS\n",
srip->id, srip->name
);
}
reply.got_good_result();
} else {
log_messages.printf(MSG_DEBUG,
"[RESULT#%d %s]: client_state %d exit_status %d; setting outcome ERROR\n",
srip->id, srip->name, srip->client_state, srip->exit_status
);
if (config.debug_handle_results) {
log_messages.printf(MSG_DEBUG,
"[RESULT#%d %s]: client_state %d exit_status %d; setting outcome ERROR\n",
srip->id, srip->name, srip->client_state, srip->exit_status
);
}
srip->outcome = RESULT_OUTCOME_CLIENT_ERROR;
srip->validate_state = VALIDATE_STATE_INVALID;
reply.got_bad_result();

View File

@ -330,18 +330,8 @@ double estimate_cpu_duration(WORKUNIT& wu, SCHEDULER_REPLY& reply) {
static double estimate_wallclock_duration(
WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
) {
double running_frac;
if (reply.wreq.core_client_version<=419) {
running_frac = reply.host.on_frac;
} else {
running_frac = reply.host.active_frac * reply.host.on_frac;
}
if (running_frac < HOST_ACTIVE_FRAC_MIN) {
running_frac = HOST_ACTIVE_FRAC_MIN;
}
if (running_frac > 1) running_frac = 1;
double ecd = estimate_cpu_duration(wu, reply);
double ewd = ecd/running_frac;
double ewd = ecd/reply.wreq.running_frac;
if (reply.host.duration_correction_factor) {
ewd *= reply.host.duration_correction_factor;
}
@ -350,8 +340,8 @@ static double estimate_wallclock_duration(
}
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"est cpu dur %f; running_frac %f; est %f\n",
ecd, running_frac, ewd
"est cpu dur %f; est wall dur %f\n",
ecd, reply.wreq.running_frac, ewd
);
}
return ewd;
@ -401,13 +391,6 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
double expavg_time = reply.host.expavg_time;
double avg_turnaround = reply.host.avg_turnaround;
update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] Checking if reliable (OS = %s) error_rate = %.3f avg_turnaround = %.0f hrs\n",
reply.host.id, reply.host.os_name, reply.host.error_rate,
reply.host.avg_turnaround/3600
);
}
// Platforms other then Windows, Linux and Intel Macs need a
// larger set of computers to be marked reliable
@ -428,13 +411,15 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
&& (config.daily_result_quota == 0 || reply.host.max_results_day >= config.daily_result_quota)
) {
reply.wreq.host_info.reliable = true;
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] is reliable (OS = %s) error_rate = %.3f avg_turnaround(hours) = %.0f \n",
reply.host.id, reply.host.os_name, reply.host.error_rate,
reply.host.avg_turnaround/3600
);
}
}
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] is%s reliable (OS = %s) error_rate = %.3f avg_turn_hrs = %.0f \n",
reply.host.id,
reply.wreq.host_info.reliable?"":" not",
reply.host.os_name, reply.host.error_rate,
reply.host.avg_turnaround/3600
);
}
return 0;
}
@ -563,9 +548,9 @@ static inline int check_deadline(
if (diff > 0) {
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[WU#%d %s] needs %d seconds on [HOST#%d]; delay_bound is %d (estimated_delay is %f)\n",
wu.id, wu.name, (int)ewd, reply.host.id, wu.delay_bound,
request.estimated_delay
"[WU#%d %s] est report delay %d on [HOST#%d]; delay_bound is %d\n",
wu.id, wu.name, (int)est_report_delay,
reply.host.id, wu.delay_bound
);
}
reply.wreq.speed.set_insufficient(diff);
@ -1201,7 +1186,21 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
}
}
static void get_running_frac(SCHEDULER_REPLY& reply) {
if (reply.wreq.core_client_version<=419) {
reply.wreq.running_frac = reply.host.on_frac;
} else {
reply.wreq.running_frac = reply.host.active_frac * reply.host.on_frac;
}
if (reply.wreq.running_frac < HOST_ACTIVE_FRAC_MIN) {
reply.wreq.running_frac = HOST_ACTIVE_FRAC_MIN;
}
if (reply.wreq.running_frac > 1) reply.wreq.running_frac = 1;
}
void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if (sreq.work_req_seconds <= 0) return;
reply.wreq.core_client_version =
sreq.core_client_major_version*100 + sreq.core_client_minor_version;
reply.wreq.disk_available = max_allowable_disk(sreq, reply);
@ -1214,18 +1213,33 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
}
get_host_info(reply); // parse project prefs for app details
reply.wreq.beta_only = false;
reply.wreq.user_apps_only = true;
get_running_frac(reply);
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] got request for %f seconds of work; available disk %f GB\n",
reply.host.id, sreq.work_req_seconds, reply.wreq.disk_available/1e9
"%s matchmaking scheduling; %s EDF sim\n",
#ifdef MATCHMAKER
"Using",
#else
"Not using",
#endif
config.workload_sim?"Using":"Not using"
);
log_messages.printf(MSG_DEBUG,
"available disk %f GB, work_buf_min %d\n",
reply.wreq.disk_available/1e9,
(int)sreq.global_prefs.work_buf_min()
);
log_messages.printf(MSG_DEBUG,
"running frac %f DCF %f CPU effic %f est delay %d\n",
reply.wreq.running_frac,
reply.host.duration_correction_factor,
reply.host.cpu_efficiency,
(int)sreq.estimated_delay
);
}
if (sreq.work_req_seconds <= 0) return;
reply.wreq.seconds_to_fill = sreq.work_req_seconds;
if (reply.wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
reply.wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
@ -1258,6 +1272,9 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
#ifdef MATCHMAKER
send_work_matchmaker(sreq, reply);
#else
reply.wreq.beta_only = false;
reply.wreq.user_apps_only = true;
// give top priority to results that require a 'reliable host'
//
if (reply.wreq.host_info.reliable) {
@ -1308,11 +1325,6 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
#endif
}
log_messages.printf(MSG_NORMAL,
"[HOST#%d] Sent %d results [scheduler ran %f seconds]\n",
reply.host.id, reply.wreq.nresults, elapsed_wallclock_time()
);
explain_to_user(reply);
}
@ -1635,9 +1647,11 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
JOB job;
job.index = i;
job.get_score(sreq, reply);
log_messages.printf(MSG_NORMAL,
"score for %s: %f\n", wu_result.workunit.name, job.score
);
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"score for %s: %f\n", wu_result.workunit.name, job.score
);
}
if (job.score > jobs.lowest_score()) {
ssp->wu_results[i].state = pid;
unlock_sema();

View File

@ -496,7 +496,6 @@ SCHEDULER_REPLY::SCHEDULER_REPLY() {
memset(&team, 0, sizeof(team));
nucleus_only = false;
project_is_down = false;
probable_user_browser = false;
send_msg_ack = false;
strcpy(email_hash, "");
}
@ -543,10 +542,12 @@ int SCHEDULER_REPLY::write(FILE* fout) {
request_delay=min_delay_needed;
}
fprintf(fout, "<request_delay>%f</request_delay>\n", request_delay);
log_messages.printf(MSG_NORMAL,
"sending delay request %f\n", request_delay
);
}
log_messages.printf(MSG_NORMAL,
"Sending reply to [HOST#%d]: %d results, delay req %f [scheduler ran %f seconds]\n",
host.id, wreq.nresults, request_delay, elapsed_wallclock_time()
);
if (wreq.core_client_version <= 419) {
std::string msg;
std::string pri = "low";
@ -940,6 +941,7 @@ int RESULT::parse_from_client(FILE* fin) {
}
if (match_tag(buf, "<platform>")) continue;
if (match_tag(buf, "<version_num>")) continue;
if (match_tag(buf, "<plan_class>")) continue;
if (match_tag(buf, "<completed_time>")) continue;
if (match_tag(buf, "<file_name>")) continue;
if (match_tag(buf, "<file_ref>")) continue;

View File

@ -110,6 +110,7 @@ struct WORK_REQ {
double disk_available;
int nresults;
int core_client_version;
double running_frac;
RESOURCE disk;
RESOURCE mem;
@ -301,7 +302,6 @@ struct SCHEDULER_REPLY {
int lockfile_fd; // file descriptor of lockfile, or -1 if no lock.
bool send_global_prefs;
bool nucleus_only; // send only message
bool probable_user_browser;
USER user;
char email_hash[MD5_LEN];
HOST host; // after validation, contains full host rec