- Overdue checkins on a bunch of minor scheduler code. Some code that is

E@H-specific is now included (but protected by
      #ifdef EINSTEIN_AT_HOME)
      to make it simpler for me to maintain consistency with BOINC cvs.
    - Added project-specific unacceptable_os() function for rejecting hosts.
    - Transitioner and scheduler now initialize host.max_results_day correctly
      in database under all circumstances.
    - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and
      properly redirected.  This was broken. David, please see comment near
      one of the probable_user_browser=true in handle_request.C.  I think
      something is wrong here (or I am missing the point!).
    - More info about requests is logged
    - If the scheduler hangs (incoming request incomplete) it will normally be
      killed by Apache after a timeout.  But this happens silently.  So I now
      install a signal handler and catch this SIGTERM.  In this case an
      error message is logged and all open files are flushed before exit(1)ing.
    - If IO is passed through files, check that request length and content length
      agree and log a message if they do NOT.
    - active_frac not correctly reported by 4.19 and earlier core clients.
      Adjust for this in estimating wallclock execution times.
    - Added a small block into validator code to attach a debugger.

svn path=/trunk/boinc/; revision=5688
This commit is contained in:
Bruce Allen 2005-03-19 18:24:24 +00:00
parent 94d0e9b63f
commit 58f5b86288
5 changed files with 210 additions and 25 deletions

View File

@ -26028,3 +26028,34 @@ David 19 Mar 2005
license_1.0.txt (removed)
win_build/installerv2/redist/0409/
eula.rtf
Bruce 19 Mar 2005
- Overdue checkins on a bunch of minor scheduler code. Some code that is
E@H-specific is now included (but protected by
#ifdef EINSTEIN_AT_HOME)
to make it simpler for me to maintain consistency with BOINC cvs.
- Added project-specific unacceptable_os() function for rejecting hosts.
- Transitioner and scheduler now initialize host.max_results_day correctly
in database under all circumstances.
- Browser requests are now correctly identified (REQUEST_METHOD=="GET") and
properly redirected. This was broken. David, please see comment near
one of the probable_user_browser=true in handle_request.C. I think
something is wrong here (or I am missing the point!).
- More info about requests is logged
- If the scheduler hangs (incoming request incomplete) it will normally be
killed by Apache after a timeout. But this happens silently. So I now
install a signal handler and catch this SIGTERM. In this case an
error message is logged and all open files are flushed before exit(1)ing.
- If IO is passed through files, check that request length and content length
agree and log a message if they do NOT.
- active_frac not correctly reported by 4.19 and earlier core clients.
Adjust for this in estimating wallclock execution times.
- Added a small block into validator code to attach a debugger.
sched/
handle_request.C
sched_send.C
main.C
transitioner.C
validator.C

View File

@ -99,6 +99,11 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
goto lookup_user_and_make_new_host;
}
reply.host = host;
log_messages.printf(
SCHED_MSG_LOG::DEBUG,
"Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
);
strlcpy(
user.authenticator, sreq.authenticator,
@ -513,7 +518,7 @@ int handle_results(
srip->exit_status = rp->exit_status;
srip->app_version_num = rp->app_version_num;
srip->claimed_credit = rp->cpu_time * reply.host.credit_per_cpu_sec;
#if 1
#ifdef EINSTEIN_AT_HOME
log_messages.printf(SCHED_MSG_LOG::DEBUG,
"cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.credit_per_cpu_sec, srip->claimed_credit
);
@ -715,6 +720,41 @@ void warn_user_if_core_client_upgrade_scheduled(
return;
}
// Project-specific host filter: reports whether the requesting host runs an
// operating system that this project refuses to serve.
//
// When EINSTEIN_AT_HOME is not defined, every host is accepted and nothing
// is logged.
//
// When EINSTEIN_AT_HOME is defined, the OS name and version reported in the
// request are logged, and a host is rejected if its os_name is exactly
// "Darwin" and its os_version begins with "5." or "6.".  On rejection a
// "low" USER_MESSAGE is inserted into the reply and reply.set_delay(3600*24)
// is called.
//
// Returns true if the host was rejected, false otherwise.
//
bool unacceptable_os(
    SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
) {
#ifdef EINSTEIN_AT_HOME
    const char* name = sreq.host.os_name;
    const char* version = sreq.host.os_version;

    log_messages.printf(
        SCHED_MSG_LOG::NORMAL,
        "OS version %s %s\n",
        name, version
    );

    // anything that is not Darwin is always accepted
    if (strcmp(name, "Darwin") != 0) {
        return false;
    }

    // of the Darwin releases, only the 5.x and 6.x series are turned away
    const bool is_5x = (strncmp(version, "5.", 2) == 0);
    const bool is_6x = (strncmp(version, "6.", 2) == 0);
    if (!is_5x && !is_6x) {
        return false;
    }

    log_messages.printf(
        SCHED_MSG_LOG::NORMAL,
        "Unacceptable OS %s %s\n",
        name, version
    );
    USER_MESSAGE um(
        "Project only supports MacOS Darwin versions 7.X and above",
        "low"
    );
    reply.insert_message(um);
    reply.set_delay(3600*24);
    return true;
#else
    return false;
#endif // EINSTEIN_AT_HOME
}
bool wrong_core_client_version(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
@ -725,9 +765,7 @@ bool wrong_core_client_version(
// TODO: check for user-agent not empty and not BOINC
wrong_version = true;
sprintf(msg,
"To participate in this project, "
"you must use major version %d of the BOINC core client. "
"Your core client is major version %d.",
"Need major version %d of the BOINC core client. You have %d.",
BOINC_MAJOR_VERSION,
sreq.core_client_major_version
);
@ -743,9 +781,7 @@ bool wrong_core_client_version(
if (sreq.core_client_minor_version < minor) {
wrong_version = true;
sprintf(msg,
"To participate in this project, "
"you must use version %d.%02d or higher of the BOINC core client. "
"Your core client is version %d.%02d.",
"Need version %d.%02d or higher of the BOINC core client. You have %d.%02d.",
major, minor,
sreq.core_client_major_version, sreq.core_client_minor_version
);
@ -760,6 +796,7 @@ bool wrong_core_client_version(
if (wrong_version) {
USER_MESSAGE um(msg, "low");
reply.insert_message(um);
// IS THE FOLLOWING LINE CORRECT? I DON'T UNDERSTAND IT. Bruce
reply.probable_user_browser = true;
reply.set_delay(3600*24);
return true;
@ -852,7 +889,7 @@ void process_request(
// if different major version of BOINC, just send a message
//
if (wrong_core_client_version(sreq, reply)) {
if (wrong_core_client_version(sreq, reply) || unacceptable_os(sreq, reply)) {
ok_to_send_work = false;
// if no results, return without accessing DB
@ -863,7 +900,7 @@ void process_request(
} else {
warn_user_if_core_client_upgrade_scheduled(sreq, reply);
}
if (config.locality_scheduling) {
have_no_work = false;
} else {
@ -927,6 +964,13 @@ void process_request(
rpc_time_tm = localtime((const time_t*)&reply.host.rpc_time);
current_rpc_dayofyear = rpc_time_tm->tm_yday;
if (config.daily_result_quota) {
if (reply.host.max_results_day <= 0 || reply.host.max_results_day > config.daily_result_quota) {
reply.host.max_results_day = config.daily_result_quota;
log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Initializing max_results_day to %d\n", reply.host.id, config.daily_result_quota);
}
}
if (last_rpc_dayofyear != current_rpc_dayofyear) {
log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Resetting nresults_today\n", reply.host.id);
reply.host.nresults_today = 0;
@ -1139,19 +1183,38 @@ void handle_request(
);
process_request(sreq, sreply, ss, code_sign_key);
} else {
// BOINC scheduler requests use method POST. So method GET
// means that someone is trying a browser.
char *rm=getenv("REQUEST_METHOD");
if (rm && !strcmp(rm, "GET")) sreply.probable_user_browser=true;
log_messages.printf(
SCHED_MSG_LOG::NORMAL,
"Incomplete request received from IP %s, auth %s, platform %s, version %d.%02d\n",
"Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%02d\n",
sreply.probable_user_browser?"(probably a browser) ":"",
get_remote_addr(), sreq.authenticator, sreq.platform_name,
sreq.core_client_major_version, sreq.core_client_minor_version
);
USER_MESSAGE um("Incomplete request received.", "low");
sreply.insert_message(um);
sreply.nucleus_only = true;
}
#ifdef EINSTEIN_AT_HOME
// for testing
if (sreply.user.id==3) {
USER_MESSAGE um("THIS IS A SHORT MESSAGE. \n AND ANOTHER", "high");
// USER_MESSAGE um("THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE. \n"
// "THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "low");
sreply.insert_message(um);
// USER_MESSAGE um2("THIS IS A VERY LONG TEST MESSAGE2. THIS IS A VERY LONG TEST MESSAGE. \n"
// "THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "high");
// sreply.insert_message(um2);
}
#endif
// if we got no work, and we have no file space, delete some files
//
if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
// try to delete a file to make more space.
// Also give some hints to the user about what's going wrong
@ -1160,7 +1223,17 @@ void handle_request(
delete_file_from_host(sreq, sreply);
}
#if 1
// write all messages to log file
for (unsigned int i=0; i<sreply.messages.size(); i++) {
USER_MESSAGE um = sreply.messages[i];
log_messages.printf(SCHED_MSG_LOG::DEBUG,
"[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
);
}
debug_sched(sreq, sreply, "../debug_sched");
#ifdef EINSTEIN_AT_HOME
// You can call debug_sched() for whatever situation is of
// interest to you. It won't do anything unless you create
// (touch) the file 'debug_sched' in the project root directory.

View File

@ -28,6 +28,8 @@
using namespace std;
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
@ -95,6 +97,48 @@ int open_database() {
return 0;
}
void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger);
// SIGTERM handler for the scheduler CGI.
//
// If the scheduler 'hangs' (for example because a request is never fully
// received), Apache will eventually send the CGI a SIGTERM.  Rather than
// dying silently, this handler writes a CRITICAL entry containing the
// process id and signal number to the log, flushes all open output streams,
// and terminates the process with exit status 1.  It never returns.
//
// NOTE(review): the logging call, fflush() and exit() are not on the POSIX
// list of async-signal-safe functions; confirm that this best-effort
// behaviour is acceptable for a handler.
//
void sigterm_handler(int signo) {
    const int pid = (int)getpid();

    log_messages.printf(
        SCHED_MSG_LOG::CRITICAL,
        "BOINC scheduler (pid=%d) caught signal %d. Exit(1)ing\n",
        pid, signo
    );

    // a NULL argument makes fflush() act on every open output stream
    fflush(NULL);
    exit(1);
}
// Log details of the incoming HTTP request, as found in the CGI environment,
// and hand the declared request body size back to the caller.
//
// REQUEST_METHOD, CONTENT_TYPE, HTTP_ACCEPT and HTTP_USER_AGENT are written
// to the log at DEBUG level; variables that are not set are shown as empty
// strings.
//
// length: output.  Set to the value of CONTENT_LENGTH when that variable
//     holds a non-negative decimal integer that fits in an int.  If the
//     variable is missing or malformed, a CRITICAL message is logged and
//     length is left exactly as the caller passed it in.
//
void log_request_info(int& length) {
    const char* cl = getenv("CONTENT_LENGTH");
    const char* ri = getenv("REMOTE_ADDR");
    const char* rm = getenv("REQUEST_METHOD");
    const char* ct = getenv("CONTENT_TYPE");
    const char* ha = getenv("HTTP_ACCEPT");
    const char* hu = getenv("HTTP_USER_AGENT");

    log_messages.printf(SCHED_MSG_LOG::DEBUG,
        "REQUEST_METHOD=%s "
        "CONTENT_TYPE=%s "
        "HTTP_ACCEPT=%s "
        "HTTP_USER_AGENT=%s\n",
        rm?rm:"" , ct?ct:"", ha?ha:"", hu?hu:""
    );

    if (!cl) {
        log_messages.printf(SCHED_MSG_LOG::CRITICAL, "CONTENT_LENGTH environment variable not set\n");
        return;
    }

    // Parse with strtol() rather than atoi(): atoi() gives no way to tell
    // "0" from non-numeric text and has undefined behaviour when the value
    // does not fit, so a bad header could masquerade as a valid length.
    char* end = NULL;
    errno = 0;
    const long parsed = strtol(cl, &end, 10);
    const int as_int = (int)parsed;
    const bool malformed =
        end == cl               // no digits at all
        || *end != '\0'         // trailing junk after the number
        || errno == ERANGE      // does not fit in a long
        || parsed < 0           // a body cannot have negative size
        || (long)as_int != parsed;  // does not fit in an int
    if (malformed) {
        log_messages.printf(SCHED_MSG_LOG::CRITICAL,
            "CONTENT_LENGTH=\"%s\" is not a valid length\n", cl
        );
        return;
    }

    length = as_int;
    log_messages.printf(SCHED_MSG_LOG::DEBUG, "CONTENT_LENGTH=%d from %s\n", length, ri?ri:"[Unknown]");
}
int main() {
FILE* fin, *fout;
int i, retval;
@ -104,6 +148,12 @@ int main() {
unsigned int counter=0;
char* code_sign_key;
bool project_stopped = false;
int length=-1;
// install a signal handler that catches SIGTERMS sent by Apache if the cgi
// times out.
//
signal(SIGTERM, sigterm_handler);
#ifndef _USING_FCGI_
char *stderr_buffer, buf[256];
@ -235,8 +285,11 @@ int main() {
send_message("Project is temporarily shut down for maintenance", 3600);
goto done;
}
log_request_info(length);
fprintf(stdout,"Content-type: text/plain\n\n");
if (use_files) {
struct stat statbuf;
// the code below is convoluted because,
// instead of going from stdin to stdout directly,
// we go via a pair of disk files
@ -252,6 +305,14 @@ int main() {
}
copy_stream(stdin, fout);
fclose(fout);
stat(req_path, &statbuf);
if (length>=0 && (statbuf.st_size != length)) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,
"Request length %d != CONTENT_LENGTH %d\n",
(int)statbuf.st_size, length
);
}
fin = fopen(req_path, "r");
if (!fin) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL, "can't read request file\n");
@ -273,8 +334,13 @@ int main() {
}
copy_stream(fin, stdout);
fclose(fin);
//unlink(req_path);
//unlink(reply_path);
#ifdef EINSTEIN_AT_HOME
if (getenv("CONTENT_LENGTH")) unlink(req_path);
if (getenv("CONTENT_LENGTH")) unlink(reply_path);
#else
// unlink(req_path);
// unlink(reply_path);
#endif
} else {
handle_request(stdin, stdout, *ssp, code_sign_key);
}

View File

@ -178,14 +178,19 @@ static double estimate_wallclock_duration(
WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
) {
double running_frac;
running_frac = reply.host.active_frac * reply.host.on_frac;
if (reply.wreq.core_client_version<=419) {
running_frac = reply.host.on_frac;
}
else {
running_frac = reply.host.active_frac * reply.host.on_frac;
}
if (running_frac < HOST_ACTIVE_FRAC_MIN) {
running_frac = HOST_ACTIVE_FRAC_MIN;
}
if (running_frac > 1) running_frac = 1;
double ecd = estimate_cpu_duration(wu, reply);
double ewd = ecd/(running_frac*request.resource_share_fraction);
#if 0
#ifdef EINSTEIN_AT_HOME
log_messages.printf(
SCHED_MSG_LOG::DEBUG, "est cpu dur %f; running_frac %f; rsf %f; est %f\n",
ecd, running_frac, request.resource_share_fraction, ewd
@ -533,15 +538,13 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
return false;
}
if (wreq.nresults >= config.max_wus_to_send) return false;
if (config.daily_result_quota) {
if (host.max_results_day == 0) {
host.max_results_day = config.daily_result_quota;
}
if (config.daily_result_quota) {
// scale daily quota by #CPUs, up to a limit of 4
//
int ncpus = host.p_ncpus;
if (ncpus > 4) ncpus = 4;
if (ncpus < 1) ncpus = 1;
wreq.daily_result_quota = ncpus*host.max_results_day;
if (host.nresults_today >= wreq.daily_result_quota) {
wreq.daily_result_quota_exceeded = true;
@ -903,11 +906,20 @@ int send_work(
}
if (reply.wreq.insufficient_speed) {
char helpful[512];
sprintf(helpful,
"(won't finish in time) "
"Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
);
if (reply.wreq.core_client_version>419) {
sprintf(helpful,
"(won't finish in time) "
"Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
);
}
else {
sprintf(helpful,
"(won't finish in time) "
"Computer available %.1f%% of time, this project gets %.1f%% of that",
100.0*reply.host.on_frac, 100.0*sreq.resource_share_fraction
);
}
USER_MESSAGE um(helpful, "high");
reply.insert_message(um);
}

View File

@ -70,6 +70,9 @@ int penalize_host(int hostid, double delay_bound) {
int retval = host.lookup_id(hostid);
if (retval) return retval;
compute_avg_turnaround(host, delay_bound);
if (host.max_results_day <= 0 || host.max_results_day > config.daily_result_quota) {
host.max_results_day = config.daily_result_quota;
}
host.max_results_day -= 1;
if (host.max_results_day < 1) {
host.max_results_day = 1;