mirror of https://github.com/BOINC/boinc.git
- Overdue checkins on a bunch of minor scheduler code. Some that is
E@H specific is now included (but protected by #ifdef EINSTEIN_AT_HOME) to make it simpler for me to maintain consistency with BOINC cvs. - Added project-specific unacceptable_os() function for rejecting hosts. - Transitioner and scheduler now initialize host.max_results_day correctly in database under all circumstances. - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and properly redirected. This was broken. David, please see comment near one of the probable_user_browser=true in handle_request.C. I think something is wrong here (or I am missing the point!). - More info about requests is logged. - If the scheduler hangs (incoming request incomplete) it will normally be killed by Apache after a timeout. But this happens silently. So I now install a signal handler and catch this SIGTERM. In this case an error message is logged and all open files are flushed before exit(1)ing. - If IO is passed through files, check that request length and content length agree and log a message if they do NOT. - active_frac not correctly reported by 4.19 and earlier core clients. Adjust for this in estimating wallclock execution times. - Added a small block into validator code to attach a debugger. svn path=/trunk/boinc/; revision=5688
This commit is contained in:
parent
94d0e9b63f
commit
58f5b86288
|
@ -26028,3 +26028,34 @@ David 19 Mar 2005
|
|||
license_1.0.txt (removed)
|
||||
win_build/installerv2/redist/0409/
|
||||
eula.rtf
|
||||
|
||||
Bruce 19 Mar 2005
|
||||
- Overdue checkins on a bunch of minor scheduler code. Some that is
|
||||
E@H specific is now included (but protected by
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
to make it simpler for me to maintain consistency with BOINC cvs.
|
||||
- Added project-specific unacceptable_os() function for rejecting hosts.
|
||||
- Transitioner and scheduler now initialize host.max_results_day correctly
|
||||
in database under all circumstances.
|
||||
- Browser requests are now correctly identified (REQUEST_METHOD=="GET") and
|
||||
properly redirected. This was broken. David, please see comment near
|
||||
one of the probable_user_browser=true in handle_request.C. I think
|
||||
something is wrong here (or I am missing the point!).
|
||||
- More info about requests is logged
|
||||
- If the scheduler hangs (incoming request incomplete) it will normally be
|
||||
killed by Apache after a timeout. But this happens silently. So I now
|
||||
install a signal handler and catch this SIGTERM. In this case an
|
||||
error message is logged and all open files are flushed before exit(1)ing.
|
||||
- If IO is passed through files, check that request length and content length
|
||||
agree and log a message if they do NOT.
|
||||
- active_frac not correctly reported by 4.19 and earlier core clients.
|
||||
Adjust for this in estimating wallclock execution times.
|
||||
- Added a small block into validator code to attach a debugger.
|
||||
|
||||
sched/
|
||||
handle_request.C
|
||||
sched_send.C
|
||||
main.C
|
||||
transitioner.C
|
||||
validator.C
|
||||
|
||||
|
|
|
@ -99,6 +99,11 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
goto lookup_user_and_make_new_host;
|
||||
}
|
||||
reply.host = host;
|
||||
log_messages.printf(
|
||||
SCHED_MSG_LOG::DEBUG,
|
||||
"Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
|
||||
sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
|
||||
);
|
||||
|
||||
strlcpy(
|
||||
user.authenticator, sreq.authenticator,
|
||||
|
@ -513,7 +518,7 @@ int handle_results(
|
|||
srip->exit_status = rp->exit_status;
|
||||
srip->app_version_num = rp->app_version_num;
|
||||
srip->claimed_credit = rp->cpu_time * reply.host.credit_per_cpu_sec;
|
||||
#if 1
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
log_messages.printf(SCHED_MSG_LOG::DEBUG,
|
||||
"cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.credit_per_cpu_sec, srip->claimed_credit
|
||||
);
|
||||
|
@ -715,6 +720,41 @@ void warn_user_if_core_client_upgrade_scheduled(
|
|||
return;
|
||||
}
|
||||
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
bool unacceptable_os(
|
||||
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
|
||||
) {
|
||||
log_messages.printf(
|
||||
SCHED_MSG_LOG::NORMAL,
|
||||
"OS version %s %s\n",
|
||||
sreq.host.os_name, sreq.host.os_version
|
||||
);
|
||||
|
||||
if (!strcmp(sreq.host.os_name, "Darwin") &&
|
||||
(!strncmp(sreq.host.os_version, "5.", 2) ||
|
||||
!strncmp(sreq.host.os_version, "6.", 2)
|
||||
)
|
||||
) {
|
||||
log_messages.printf(
|
||||
SCHED_MSG_LOG::NORMAL,
|
||||
"Unacceptable OS %s %s\n",
|
||||
sreq.host.os_name, sreq.host.os_version
|
||||
);
|
||||
USER_MESSAGE um("Project only supports MacOS Darwin versions 7.X and above",
|
||||
"low");
|
||||
reply.insert_message(um);
|
||||
reply.set_delay(3600*24);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
bool unacceptable_os(
|
||||
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
#endif // EINSTEIN_AT_HOME
|
||||
|
||||
bool wrong_core_client_version(
|
||||
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
|
||||
|
@ -725,9 +765,7 @@ bool wrong_core_client_version(
|
|||
// TODO: check for user-agent not empty and not BOINC
|
||||
wrong_version = true;
|
||||
sprintf(msg,
|
||||
"To participate in this project, "
|
||||
"you must use major version %d of the BOINC core client. "
|
||||
"Your core client is major version %d.",
|
||||
"Need major version %d of the BOINC core client. You have %d.",
|
||||
BOINC_MAJOR_VERSION,
|
||||
sreq.core_client_major_version
|
||||
);
|
||||
|
@ -743,9 +781,7 @@ bool wrong_core_client_version(
|
|||
if (sreq.core_client_minor_version < minor) {
|
||||
wrong_version = true;
|
||||
sprintf(msg,
|
||||
"To participate in this project, "
|
||||
"you must use version %d.%02d or higher of the BOINC core client. "
|
||||
"Your core client is version %d.%02d.",
|
||||
"Need version %d.%02d or higher of the BOINC core client. You have %d.%02d.",
|
||||
major, minor,
|
||||
sreq.core_client_major_version, sreq.core_client_minor_version
|
||||
);
|
||||
|
@ -760,6 +796,7 @@ bool wrong_core_client_version(
|
|||
if (wrong_version) {
|
||||
USER_MESSAGE um(msg, "low");
|
||||
reply.insert_message(um);
|
||||
// IS THE FOLLOWING LINE CORRECT? I DON'T UNDERSTAND IT. Bruce
|
||||
reply.probable_user_browser = true;
|
||||
reply.set_delay(3600*24);
|
||||
return true;
|
||||
|
@ -852,7 +889,7 @@ void process_request(
|
|||
|
||||
// if different major version of BOINC, just send a message
|
||||
//
|
||||
if (wrong_core_client_version(sreq, reply)) {
|
||||
if (wrong_core_client_version(sreq, reply) || unacceptable_os(sreq, reply)) {
|
||||
ok_to_send_work = false;
|
||||
|
||||
// if no results, return without accessing DB
|
||||
|
@ -863,7 +900,7 @@ void process_request(
|
|||
} else {
|
||||
warn_user_if_core_client_upgrade_scheduled(sreq, reply);
|
||||
}
|
||||
|
||||
|
||||
if (config.locality_scheduling) {
|
||||
have_no_work = false;
|
||||
} else {
|
||||
|
@ -927,6 +964,13 @@ void process_request(
|
|||
rpc_time_tm = localtime((const time_t*)&reply.host.rpc_time);
|
||||
current_rpc_dayofyear = rpc_time_tm->tm_yday;
|
||||
|
||||
if (config.daily_result_quota) {
|
||||
if (reply.host.max_results_day <= 0 || reply.host.max_results_day > config.daily_result_quota) {
|
||||
reply.host.max_results_day = config.daily_result_quota;
|
||||
log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Initializing max_results_day to %d\n", reply.host.id, config.daily_result_quota);
|
||||
}
|
||||
}
|
||||
|
||||
if (last_rpc_dayofyear != current_rpc_dayofyear) {
|
||||
log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Resetting nresults_today\n", reply.host.id);
|
||||
reply.host.nresults_today = 0;
|
||||
|
@ -1139,19 +1183,38 @@ void handle_request(
|
|||
);
|
||||
process_request(sreq, sreply, ss, code_sign_key);
|
||||
} else {
|
||||
// BOINC scheduler requests use method POST. So method GET
|
||||
// means that someone is trying a browser.
|
||||
char *rm=getenv("REQUEST_METHOD");
|
||||
if (rm && !strcmp(rm, "GET")) sreply.probable_user_browser=true;
|
||||
|
||||
log_messages.printf(
|
||||
SCHED_MSG_LOG::NORMAL,
|
||||
"Incomplete request received from IP %s, auth %s, platform %s, version %d.%02d\n",
|
||||
"Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%02d\n",
|
||||
sreply.probable_user_browser?"(probably a browser) ":"",
|
||||
get_remote_addr(), sreq.authenticator, sreq.platform_name,
|
||||
sreq.core_client_major_version, sreq.core_client_minor_version
|
||||
);
|
||||
|
||||
USER_MESSAGE um("Incomplete request received.", "low");
|
||||
sreply.insert_message(um);
|
||||
sreply.nucleus_only = true;
|
||||
}
|
||||
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
// for testing
|
||||
if (sreply.user.id==3) {
|
||||
USER_MESSAGE um("THIS IS A SHORT MESSAGE. \n AND ANOTHER", "high");
|
||||
// USER_MESSAGE um("THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE. \n"
|
||||
// "THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "low");
|
||||
sreply.insert_message(um);
|
||||
// USER_MESSAGE um2("THIS IS A VERY LONG TEST MESSAGE2. THIS IS A VERY LONG TEST MESSAGE. \n"
|
||||
// "THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "high");
|
||||
// sreply.insert_message(um2);
|
||||
}
|
||||
#endif
|
||||
|
||||
// if we got no work, and we have no file space, delete some files
|
||||
//
|
||||
if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
|
||||
// try to delete a file to make more space.
|
||||
// Also give some hints to the user about what's going wrong
|
||||
|
@ -1160,7 +1223,17 @@ void handle_request(
|
|||
delete_file_from_host(sreq, sreply);
|
||||
}
|
||||
|
||||
#if 1
|
||||
// write all messages to log file
|
||||
for (unsigned int i=0; i<sreply.messages.size(); i++) {
|
||||
USER_MESSAGE um = sreply.messages[i];
|
||||
log_messages.printf(SCHED_MSG_LOG::DEBUG,
|
||||
"[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
|
||||
);
|
||||
}
|
||||
|
||||
debug_sched(sreq, sreply, "../debug_sched");
|
||||
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
// You can call debug_sched() for whatever situation is of
|
||||
// interest to you. It won't do anything unless you create
|
||||
// (touch) the file 'debug_sched' in the project root directory.
|
||||
|
|
70
sched/main.C
70
sched/main.C
|
@ -28,6 +28,8 @@
|
|||
using namespace std;
|
||||
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
|
@ -95,6 +97,48 @@ int open_database() {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger);
|
||||
|
||||
// If the scheduler 'hangs', which it can do if a request is not fully processed
|
||||
// or some other process arises, then Apache will send a SIGTERM to the cgi.
|
||||
// This signal handler ensures that rather than dying silently, the cgi process
|
||||
// will leave behind some record in the log file.
|
||||
//
|
||||
void sigterm_handler(int signo) {
|
||||
log_messages.printf(SCHED_MSG_LOG::CRITICAL,
|
||||
"BOINC scheduler (pid=%d) caught signal %d. Exit(1)ing\n",
|
||||
(int)getpid(), signo
|
||||
);
|
||||
fflush(NULL);
|
||||
exit(1);
|
||||
return;
|
||||
}
|
||||
|
||||
// Log the CGI environment describing the incoming request.
//
// Always writes one DEBUG line with REQUEST_METHOD, CONTENT_TYPE,
// HTTP_ACCEPT and HTTP_USER_AGENT (unset variables are printed as empty
// strings).
//
// If CONTENT_LENGTH is set, it is parsed with atoi() into 'length' and
// logged at DEBUG level together with REMOTE_ADDR ("[Unknown]" if unset).
// If CONTENT_LENGTH is not set, a CRITICAL message is logged and 'length'
// is left untouched.
//
void log_request_info(int& length) {
    const char* content_length = getenv("CONTENT_LENGTH");
    const char* remote_addr = getenv("REMOTE_ADDR");
    const char* request_method = getenv("REQUEST_METHOD");
    const char* content_type = getenv("CONTENT_TYPE");
    const char* http_accept = getenv("HTTP_ACCEPT");
    const char* user_agent = getenv("HTTP_USER_AGENT");

    // substitute printable defaults for variables the server did not set
    //
    if (!request_method) request_method = "";
    if (!content_type) content_type = "";
    if (!http_accept) http_accept = "";
    if (!user_agent) user_agent = "";
    if (!remote_addr) remote_addr = "[Unknown]";

    log_messages.printf(SCHED_MSG_LOG::DEBUG,
        "REQUEST_METHOD=%s "
        "CONTENT_TYPE=%s "
        "HTTP_ACCEPT=%s "
        "HTTP_USER_AGENT=%s\n",
        request_method, content_type, http_accept, user_agent
    );

    if (!content_length) {
        log_messages.printf(SCHED_MSG_LOG::CRITICAL,
            "CONTENT_LENGTH environment variable not set\n"
        );
        return;
    }

    length = atoi(content_length);
    log_messages.printf(SCHED_MSG_LOG::DEBUG,
        "CONTENT_LENGTH=%d from %s\n", length, remote_addr
    );
}
|
||||
|
||||
int main() {
|
||||
FILE* fin, *fout;
|
||||
int i, retval;
|
||||
|
@ -104,6 +148,12 @@ int main() {
|
|||
unsigned int counter=0;
|
||||
char* code_sign_key;
|
||||
bool project_stopped = false;
|
||||
int length=-1;
|
||||
|
||||
// install a signal handler that catches SIGTERMS sent by Apache if the cgi
|
||||
// times out.
|
||||
//
|
||||
signal(SIGTERM, sigterm_handler);
|
||||
|
||||
#ifndef _USING_FCGI_
|
||||
char *stderr_buffer, buf[256];
|
||||
|
@ -235,8 +285,11 @@ int main() {
|
|||
send_message("Project is temporarily shut down for maintenance", 3600);
|
||||
goto done;
|
||||
}
|
||||
log_request_info(length);
|
||||
fprintf(stdout,"Content-type: text/plain\n\n");
|
||||
|
||||
if (use_files) {
|
||||
struct stat statbuf;
|
||||
// the code below is convoluted because,
|
||||
// instead of going from stdin to stdout directly,
|
||||
// we go via a pair of disk files
|
||||
|
@ -252,6 +305,14 @@ int main() {
|
|||
}
|
||||
copy_stream(stdin, fout);
|
||||
fclose(fout);
|
||||
stat(req_path, &statbuf);
|
||||
if (length>=0 && (statbuf.st_size != length)) {
|
||||
log_messages.printf(SCHED_MSG_LOG::CRITICAL,
|
||||
"Request length %d != CONTENT_LENGTH %d\n",
|
||||
(int)statbuf.st_size, length
|
||||
);
|
||||
}
|
||||
|
||||
fin = fopen(req_path, "r");
|
||||
if (!fin) {
|
||||
log_messages.printf(SCHED_MSG_LOG::CRITICAL, "can't read request file\n");
|
||||
|
@ -273,8 +334,13 @@ int main() {
|
|||
}
|
||||
copy_stream(fin, stdout);
|
||||
fclose(fin);
|
||||
//unlink(req_path);
|
||||
//unlink(reply_path);
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
if (getenv("CONTENT_LENGTH")) unlink(req_path);
|
||||
if (getenv("CONTENT_LENGTH")) unlink(reply_path);
|
||||
#else
|
||||
// unlink(req_path);
|
||||
// unlink(reply_path);
|
||||
#endif
|
||||
} else {
|
||||
handle_request(stdin, stdout, *ssp, code_sign_key);
|
||||
}
|
||||
|
|
|
@ -178,14 +178,19 @@ static double estimate_wallclock_duration(
|
|||
WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
|
||||
) {
|
||||
double running_frac;
|
||||
running_frac = reply.host.active_frac * reply.host.on_frac;
|
||||
if (reply.wreq.core_client_version<=419) {
|
||||
running_frac = reply.host.on_frac;
|
||||
}
|
||||
else {
|
||||
running_frac = reply.host.active_frac * reply.host.on_frac;
|
||||
}
|
||||
if (running_frac < HOST_ACTIVE_FRAC_MIN) {
|
||||
running_frac = HOST_ACTIVE_FRAC_MIN;
|
||||
}
|
||||
if (running_frac > 1) running_frac = 1;
|
||||
double ecd = estimate_cpu_duration(wu, reply);
|
||||
double ewd = ecd/(running_frac*request.resource_share_fraction);
|
||||
#if 0
|
||||
#ifdef EINSTEIN_AT_HOME
|
||||
log_messages.printf(
|
||||
SCHED_MSG_LOG::DEBUG, "est cpu dur %f; running_frac %f; rsf %f; est %f\n",
|
||||
ecd, running_frac, request.resource_share_fraction, ewd
|
||||
|
@ -533,15 +538,13 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
|
|||
return false;
|
||||
}
|
||||
if (wreq.nresults >= config.max_wus_to_send) return false;
|
||||
if (config.daily_result_quota) {
|
||||
if (host.max_results_day == 0) {
|
||||
host.max_results_day = config.daily_result_quota;
|
||||
}
|
||||
|
||||
if (config.daily_result_quota) {
|
||||
// scale daily quota by #CPUs, up to a limit of 4
|
||||
//
|
||||
int ncpus = host.p_ncpus;
|
||||
if (ncpus > 4) ncpus = 4;
|
||||
if (ncpus < 1) ncpus = 1;
|
||||
wreq.daily_result_quota = ncpus*host.max_results_day;
|
||||
if (host.nresults_today >= wreq.daily_result_quota) {
|
||||
wreq.daily_result_quota_exceeded = true;
|
||||
|
@ -903,11 +906,20 @@ int send_work(
|
|||
}
|
||||
if (reply.wreq.insufficient_speed) {
|
||||
char helpful[512];
|
||||
sprintf(helpful,
|
||||
"(won't finish in time) "
|
||||
"Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
|
||||
100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
|
||||
);
|
||||
if (reply.wreq.core_client_version>419) {
|
||||
sprintf(helpful,
|
||||
"(won't finish in time) "
|
||||
"Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
|
||||
100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
|
||||
);
|
||||
}
|
||||
else {
|
||||
sprintf(helpful,
|
||||
"(won't finish in time) "
|
||||
"Computer available %.1f%% of time, this project gets %.1f%% of that",
|
||||
100.0*reply.host.on_frac, 100.0*sreq.resource_share_fraction
|
||||
);
|
||||
}
|
||||
USER_MESSAGE um(helpful, "high");
|
||||
reply.insert_message(um);
|
||||
}
|
||||
|
|
|
@ -70,6 +70,9 @@ int penalize_host(int hostid, double delay_bound) {
|
|||
int retval = host.lookup_id(hostid);
|
||||
if (retval) return retval;
|
||||
compute_avg_turnaround(host, delay_bound);
|
||||
if (host.max_results_day <= 0 || host.max_results_day > config.daily_result_quota) {
|
||||
host.max_results_day = config.daily_result_quota;
|
||||
}
|
||||
host.max_results_day -= 1;
|
||||
if (host.max_results_day < 1) {
|
||||
host.max_results_day = 1;
|
||||
|
|
Loading…
Reference in New Issue