*** empty log message ***

svn path=/trunk/boinc/; revision=2660
This commit is contained in:
David Anderson 2003-11-11 23:14:49 +00:00
parent 032644aaaf
commit 133a8d8db7
3 changed files with 139 additions and 77 deletions

View File

@ -7695,3 +7695,23 @@ David 11 Nov 2003
sched_shmem.C sched_shmem.C
tools/ tools/
backend_lib.C backend_lib.C
David 11 Nov 2003
- modify the scheduler so that if a client requests work,
and there are unsent results, but none can be sent,
then the client gets an explanatory message: either
- no app version available
- insufficient disk
- insufficient memory
- host is too slow
This involved creating a WORK_REQ structure that gets passed
around, to avoid huge numbers of arguments
- modify the scheduler so that if either
1) the shared-mem segment is absent or
2) the DB can't be opened
then it writes a "project temporarily down for maintenance"
message instead of just exiting
sched/
handle_request.C
main.C

View File

@ -47,6 +47,21 @@ const int MAX_WUS_TO_SEND = 10;
const double COBBLESTONE_FACTOR = 300.0; const double COBBLESTONE_FACTOR = 300.0;
// State for one client work request.
// A single instance is passed by reference to scan_work_array(),
// wu_is_feasible() and add_wu_to_reply(), instead of passing
// each of these values as a separate argument.
//
struct WORK_REQ {
    bool infeasible_only;
        // if true, scan_work_array() skips results whose
        // infeasible_count is zero
    double seconds_to_fill;
        // seconds of work still to be sent;
        // reduced each time a result is added to the reply
    double disk_available;
        // disk space still usable for this request;
        // reduced by rsc_disk_bound of each workunit sent
    int nresults;
        // number of results added to the reply so far

    // the following flags are set whenever a result is infeasible;
    // used to construct explanatory message to user
    //
    bool insufficient_disk;
    bool insufficient_mem;
    bool insufficient_speed;
    bool no_app_version;

    // Put every field in a defined state.
    // Callers declare a WORK_REQ without an initializer and then assign
    // only some of the fields; in particular nresults is incremented and
    // compared without ever being assigned, so it must start at zero.
    //
    WORK_REQ():
        infeasible_only(false),
        seconds_to_fill(0),
        disk_available(0),
        nresults(0),
        insufficient_disk(false),
        insufficient_mem(false),
        insufficient_speed(false),
        no_app_version(false)
    {}
};
// compute the max disk usage we can request of the host // compute the max disk usage we can request of the host
// //
double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) { double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) {
@ -58,7 +73,7 @@ double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) {
// fill in default values for missing prefs // fill in default values for missing prefs
// //
if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.01; // 10 MB if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.1; // 100 MB
if (prefs.disk_max_used_pct == 0) prefs.disk_max_used_pct = 10; if (prefs.disk_max_used_pct == 0) prefs.disk_max_used_pct = 10;
// min_free_gb can be zero // min_free_gb can be zero
@ -124,19 +139,13 @@ inline double estimate_wallclock_duration(WORKUNIT& wu, HOST& host) {
// return true if the WU can be executed on the host // return true if the WU can be executed on the host
// //
bool wu_is_feasible(WORKUNIT& wu, HOST& host) { bool wu_is_feasible(WORKUNIT& wu, HOST& host, WORK_REQ& wreq) {
if(host.d_free && wu.rsc_disk_bound > host.d_free) {
log_messages.printf(
SchedMessages::DEBUG, "[WU#%d %s] needs %f disk; [HOST#%d] has %f\n",
wu.id, wu.name, wu.rsc_disk_bound, host.id, host.d_free
);
return false;
}
if (host.m_nbytes && wu.rsc_memory_bound > host.m_nbytes) { if (host.m_nbytes && wu.rsc_memory_bound > host.m_nbytes) {
log_messages.printf( log_messages.printf(
SchedMessages::DEBUG, "[WU#%d %s] needs %f mem; [HOST#%d] has %f\n", SchedMessages::DEBUG, "[WU#%d %s] needs %f mem; [HOST#%d] has %f\n",
wu.id, wu.name, wu.rsc_memory_bound, host.id, host.m_nbytes wu.id, wu.name, wu.rsc_memory_bound, host.id, host.m_nbytes
); );
wreq.insufficient_mem = true;
return false; return false;
} }
@ -148,6 +157,7 @@ bool wu_is_feasible(WORKUNIT& wu, HOST& host) {
SchedMessages::DEBUG, "[WU#%d %s] needs requires %d seconds on [HOST#%d]; delay_bound is %d\n", SchedMessages::DEBUG, "[WU#%d %s] needs requires %d seconds on [HOST#%d]; delay_bound is %d\n",
wu.id, wu.name, (int)wu_wallclock_time, host.id, wu.delay_bound wu.id, wu.name, (int)wu_wallclock_time, host.id, wu.delay_bound
); );
wreq.insufficient_speed = true;
return false; return false;
} }
@ -204,7 +214,8 @@ int insert_wu_tags(WORKUNIT& wu, APP& app) {
// Add the app and app_version to the reply also. // Add the app and app_version to the reply also.
// //
int add_wu_to_reply( int add_wu_to_reply(
WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss,
WORK_REQ& wreq
) { ) {
APP* app; APP* app;
APP_VERSION* avp; APP_VERSION* avp;
@ -221,10 +232,11 @@ int add_wu_to_reply(
avp = ss.lookup_app_version(app->id, platform.id, app->min_version); avp = ss.lookup_app_version(app->id, platform.id, app->min_version);
if (!avp) { if (!avp) {
log_messages.printf( log_messages.printf(
SchedMessages::CRITICAL, SchedMessages::DEBUG,
"Can't find app version: APP#%d PLATFORM#%d min_version %d\n", "no app version available: APP#%d PLATFORM#%d min_version %d\n",
app->id, platform.id, app->min_version app->id, platform.id, app->min_version
); );
wreq.no_app_version = true;
return ERR_NULL; return ERR_NULL;
} }
@ -640,10 +652,9 @@ static bool already_in_reply(WU_RESULT& wu_result, SCHEDULER_REPLY& reply) {
// previously infeasible for some host // previously infeasible for some host
// //
static void scan_work_array( static void scan_work_array(
bool infeasible_only, double& seconds_to_fill, double& disk_available, WORK_REQ& wreq,
int& nresults,
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM& platform,
SCHED_SHMEM& ss, bool& insufficient_disk SCHED_SHMEM& ss
) { ) {
int i, retval, n; int i, retval, n;
WORKUNIT wu; WORKUNIT wu;
@ -651,12 +662,12 @@ static void scan_work_array(
double wu_seconds_filled; double wu_seconds_filled;
char buf[256]; char buf[256];
if (disk_available < 0) insufficient_disk = true; if (wreq.disk_available < 0) wreq.insufficient_disk = true;
for (i=0; i<ss.nwu_results; i++) { for (i=0; i<ss.nwu_results; i++) {
if (seconds_to_fill <= 0) break; if (wreq.seconds_to_fill <= 0) break;
if (disk_available <= 0) break; if (wreq.disk_available <= 0) break;
WU_RESULT& wu_result = ss.wu_results[i]; WU_RESULT& wu_result = ss.wu_results[i];
@ -666,13 +677,13 @@ static void scan_work_array(
continue; continue;
} }
if (wu_result.workunit.rsc_disk_bound > disk_available) { if (wu_result.workunit.rsc_disk_bound > wreq.disk_available) {
insufficient_disk = true; wreq.insufficient_disk = true;
wu_result.infeasible_count++; wu_result.infeasible_count++;
continue; continue;
} }
if (infeasible_only && wu_result.infeasible_count==0) { if (wreq.infeasible_only && wu_result.infeasible_count==0) {
continue; continue;
} }
@ -713,7 +724,7 @@ static void scan_work_array(
// don't send if host can't handle it // don't send if host can't handle it
// //
wu = wu_result.workunit; wu = wu_result.workunit;
if (!wu_is_feasible(wu, reply.host)) { if (!wu_is_feasible(wu, reply.host, wreq)) {
log_messages.printf( log_messages.printf(
SchedMessages::DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n", SchedMessages::DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n",
reply.host.id, wu.id, wu.name reply.host.id, wu.id, wu.name
@ -726,7 +737,7 @@ static void scan_work_array(
// for the client's platform. // for the client's platform.
// Treat the same as the WU being infeasible // Treat the same as the WU being infeasible
// //
retval = add_wu_to_reply(wu, reply, platform, ss); retval = add_wu_to_reply(wu, reply, platform, ss, wreq);
if (retval) { if (retval) {
wu_result.infeasible_count++; wu_result.infeasible_count++;
continue; continue;
@ -749,7 +760,7 @@ static void scan_work_array(
// ****** HERE WE'VE COMMITTED TO SENDING THIS RESULT TO HOST ****** // ****** HERE WE'VE COMMITTED TO SENDING THIS RESULT TO HOST ******
// //
disk_available -= wu.rsc_disk_bound; wreq.disk_available -= wu.rsc_disk_bound;
// update the result in DB // update the result in DB
// //
@ -793,10 +804,10 @@ static void scan_work_array(
} }
reply.insert_result(result); reply.insert_result(result);
seconds_to_fill -= wu_seconds_filled; wreq.seconds_to_fill -= wu_seconds_filled;
nresults++; wreq.nresults++;
if (nresults == MAX_WUS_TO_SEND) break; if (wreq.nresults == MAX_WUS_TO_SEND) break;
} }
} }
@ -805,49 +816,63 @@ int send_work(
SCHED_SHMEM& ss SCHED_SHMEM& ss
) { ) {
int nresults = 0; int nresults = 0;
double seconds_to_fill; WORK_REQ wreq;
double disk_available;
bool insufficient_disk = false;
disk_available = max_allowable_disk(reply.user, sreq); wreq.disk_available = max_allowable_disk(reply.user, sreq);
wreq.insufficient_disk = false;
wreq.insufficient_mem = false;
wreq.insufficient_speed = false;
wreq.no_app_version = false;
log_messages.printf( log_messages.printf(
SchedMessages::NORMAL, SchedMessages::NORMAL,
"[HOST#%d] got request for %d seconds of work; available disk %f GB\n", "[HOST#%d] got request for %d seconds of work; available disk %f GB\n",
reply.host.id, sreq.work_req_seconds, disk_available/1e9 reply.host.id, sreq.work_req_seconds, wreq.disk_available/1e9
); );
if (sreq.work_req_seconds <= 0) return 0; if (sreq.work_req_seconds <= 0) return 0;
seconds_to_fill = sreq.work_req_seconds; wreq.seconds_to_fill = sreq.work_req_seconds;
if (seconds_to_fill > MAX_SECONDS_TO_SEND) { if (wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
seconds_to_fill = MAX_SECONDS_TO_SEND; wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
} }
if (seconds_to_fill < MIN_SECONDS_TO_SEND) { if (wreq.seconds_to_fill < MIN_SECONDS_TO_SEND) {
seconds_to_fill = MIN_SECONDS_TO_SEND; wreq.seconds_to_fill = MIN_SECONDS_TO_SEND;
} }
// give priority to results that were infeasible for some other host // give priority to results that were infeasible for some other host
// //
scan_work_array( wreq.infeasible_only = true;
true, seconds_to_fill, disk_available, scan_work_array(wreq, sreq, reply, platform, ss);
nresults, sreq, reply, platform, ss, insufficient_disk
); wreq.infeasible_only = false;
scan_work_array( scan_work_array(wreq, sreq, reply, platform, ss);
false, seconds_to_fill, disk_available,
nresults, sreq, reply, platform, ss, insufficient_disk
);
log_messages.printf( log_messages.printf(
SchedMessages::NORMAL, "[HOST#%d] Sent %d results\n", SchedMessages::NORMAL, "[HOST#%d] Sent %d results\n",
reply.host.id, nresults reply.host.id, nresults
); );
if (nresults == 0) { if (wreq.nresults == 0) {
strcpy(reply.message, "No work available"); strcpy(reply.message, "No work available");
if (insufficient_disk) { if (wreq.no_app_version) {
strcat(reply.message, strcat(reply.message,
" (you may need to increase disk limits in global prefs)" " (there was work for other platforms)"
);
}
if (wreq.insufficient_disk) {
strcat(reply.message,
" (there was work but you don't have enough disk space allocated)"
);
}
if (wreq.insufficient_mem) {
strcat(reply.message,
" (there was work but your computer doesn't have enough memory)"
);
}
if (wreq.insufficient_mem) {
strcat(reply.message,
" (there was work but your computer would not finish it before it is due"
); );
} }
strcpy(reply.message_priority, "low"); strcpy(reply.message_priority, "low");

View File

@ -57,11 +57,12 @@ int main() {
FILE* fin, *fout; FILE* fin, *fout;
int i, retval, pid; int i, retval, pid;
char req_path[256], reply_path[256], path[256]; char req_path[256], reply_path[256], path[256];
SCHED_SHMEM* ssp; SCHED_SHMEM* ssp=0;
void* p; void* p;
unsigned int counter=0; unsigned int counter=0;
char* code_sign_key; char* code_sign_key;
bool found; bool found;
bool project_stopped = false;
if (!freopen(STDERR_FILENAME, "a", stderr)) { if (!freopen(STDERR_FILENAME, "a", stderr)) {
fprintf(stderr, "Can't redirect stderr\n"); fprintf(stderr, "Can't redirect stderr\n");
@ -79,45 +80,52 @@ int main() {
sprintf(path, "%s/code_sign_public", config.key_dir); sprintf(path, "%s/code_sign_public", config.key_dir);
retval = read_file_malloc(path, code_sign_key); retval = read_file_malloc(path, code_sign_key);
if (retval) { if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "Can't read code sign key file (%s)\n", path); log_messages.printf(SchedMessages::CRITICAL,
"Can't read code sign key file (%s)\n", path
);
exit(1); exit(1);
} }
retval = attach_shmem(config.shmem_key, &p); retval = attach_shmem(config.shmem_key, &p);
if (retval) { if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "Can't attach shmem (feeder not running?)\n"); log_messages.printf(SchedMessages::CRITICAL,
exit(1); "Can't attach shmem (feeder not running?)\n"
} );
ssp = (SCHED_SHMEM*)p; project_stopped = true;
retval = ssp->verify(); } else {
if (retval) { ssp = (SCHED_SHMEM*)p;
log_messages.printf(SchedMessages::CRITICAL, "shmem has wrong struct sizes - recompile\n"); retval = ssp->verify();
exit(1); if (retval) {
} log_messages.printf(SchedMessages::CRITICAL,
"shmem has wrong struct sizes - recompile\n"
);
exit(1);
}
for (i=0; i<10; i++) { for (i=0; i<10; i++) {
if (ssp->ready) break; if (ssp->ready) break;
log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n"); log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n");
sleep(1); sleep(1);
} }
if (!ssp->ready) { if (!ssp->ready) {
log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n"); log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n");
exit(1); exit(1);
}
} }
retval = boinc_db.open(config.db_name, config.db_passwd); retval = boinc_db.open(config.db_name, config.db_passwd);
if (retval) { if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "can't open database\n"); log_messages.printf(SchedMessages::CRITICAL, "can't open database\n");
exit(1); project_stopped = true;
} } else {
found = false;
found = false; while (!gproject.enumerate("")) {
while (!gproject.enumerate("")) { found = true;
found = true; }
} if (!found) {
if (!found) { log_messages.printf(SchedMessages::CRITICAL, "can't find project\n");
log_messages.printf(SchedMessages::CRITICAL, "can't find project\n"); exit(1);
exit(1); }
} }
pid = getpid(); pid = getpid();
@ -126,6 +134,14 @@ int main() {
counter++; counter++;
#endif #endif
printf("Content-type: text/plain\n\n"); printf("Content-type: text/plain\n\n");
if (project_stopped) {
printf(
"<scheduler_reply>\n"
" <message priority=\"low\">Project is temporarily shut down for maintenance</message>\n"
"</scheduler_reply>\n"
);
goto done;
}
if (use_files) { if (use_files) {
// the code below is convoluted because, // the code below is convoluted because,
// instead of going from stdin to stdout directly, // instead of going from stdin to stdout directly,
@ -167,6 +183,7 @@ int main() {
} else { } else {
handle_request(stdin, stdout, *ssp, code_sign_key); handle_request(stdin, stdout, *ssp, code_sign_key);
} }
done:
#ifdef _USING_FCGI_ #ifdef _USING_FCGI_
} }
#endif #endif