*** empty log message ***

svn path=/trunk/boinc/; revision=2660
This commit is contained in:
David Anderson 2003-11-11 23:14:49 +00:00
parent 032644aaaf
commit 133a8d8db7
3 changed files with 139 additions and 77 deletions

View File

@ -7695,3 +7695,23 @@ David 11 Nov 2003
sched_shmem.C
tools/
backend_lib.C
David 11 Nov 2003
- modify the scheduler so that if a client requests work,
and there are unsent results, but none can be sent,
then the client gets an explanatory message: either
- no app version available
- insufficient disk
- insufficient memory
- host is too slow
This involved creating a WORK_REQ structure that gets passed
around, to avoid huge numbers of arguments
- modify the scheduler so that if either
1) the shared-mem segment is absent or
2) the DB can't be opened
then it writes a "project temporarily down for maintenance"
message instead of just exiting
sched/
handle_request.C
main.C

View File

@ -47,6 +47,21 @@ const int MAX_WUS_TO_SEND = 10;
const double COBBLESTONE_FACTOR = 300.0;
// State for one client work request, passed by reference between the
// work-dispatch functions so they don't need long argument lists.
//
struct WORK_REQ {
    bool infeasible_only;
        // if true, the work-array scan skips results whose
        // infeasible_count is zero
    double seconds_to_fill;
        // remaining seconds of work to send; decremented as results are added
    double disk_available;
        // remaining disk the request may use; decremented by each
        // sent workunit's rsc_disk_bound
    int nresults;
        // number of results added to the reply so far

    // the following flags are set whenever a result is infeasible;
    // used to construct explanatory message to user
    //
    bool insufficient_disk;
    bool insufficient_mem;
    bool insufficient_speed;
    bool no_app_version;

    // Start every request from a known state.
    // Without this, a locally declared WORK_REQ has indeterminate members,
    // and nresults in particular is incremented and compared
    // without ever being assigned.
    //
    WORK_REQ():
        infeasible_only(false),
        seconds_to_fill(0),
        disk_available(0),
        nresults(0),
        insufficient_disk(false),
        insufficient_mem(false),
        insufficient_speed(false),
        no_app_version(false)
    {}
};
// compute the max disk usage we can request of the host
//
double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) {
@ -58,7 +73,7 @@ double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) {
// fill in default values for missing prefs
//
if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.01; // 10 MB
if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.1; // 100 MB
if (prefs.disk_max_used_pct == 0) prefs.disk_max_used_pct = 10;
// min_free_gb can be zero
@ -124,19 +139,13 @@ inline double estimate_wallclock_duration(WORKUNIT& wu, HOST& host) {
// return true if the WU can be executed on the host
//
bool wu_is_feasible(WORKUNIT& wu, HOST& host) {
if(host.d_free && wu.rsc_disk_bound > host.d_free) {
log_messages.printf(
SchedMessages::DEBUG, "[WU#%d %s] needs %f disk; [HOST#%d] has %f\n",
wu.id, wu.name, wu.rsc_disk_bound, host.id, host.d_free
);
return false;
}
bool wu_is_feasible(WORKUNIT& wu, HOST& host, WORK_REQ& wreq) {
if (host.m_nbytes && wu.rsc_memory_bound > host.m_nbytes) {
log_messages.printf(
SchedMessages::DEBUG, "[WU#%d %s] needs %f mem; [HOST#%d] has %f\n",
wu.id, wu.name, wu.rsc_memory_bound, host.id, host.m_nbytes
);
wreq.insufficient_mem = true;
return false;
}
@ -148,6 +157,7 @@ bool wu_is_feasible(WORKUNIT& wu, HOST& host) {
SchedMessages::DEBUG, "[WU#%d %s] needs requires %d seconds on [HOST#%d]; delay_bound is %d\n",
wu.id, wu.name, (int)wu_wallclock_time, host.id, wu.delay_bound
);
wreq.insufficient_speed = true;
return false;
}
@ -204,7 +214,8 @@ int insert_wu_tags(WORKUNIT& wu, APP& app) {
// Add the app and app_version to the reply also.
//
int add_wu_to_reply(
WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss
WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss,
WORK_REQ& wreq
) {
APP* app;
APP_VERSION* avp;
@ -221,10 +232,11 @@ int add_wu_to_reply(
avp = ss.lookup_app_version(app->id, platform.id, app->min_version);
if (!avp) {
log_messages.printf(
SchedMessages::CRITICAL,
"Can't find app version: APP#%d PLATFORM#%d min_version %d\n",
SchedMessages::DEBUG,
"no app version available: APP#%d PLATFORM#%d min_version %d\n",
app->id, platform.id, app->min_version
);
wreq.no_app_version = true;
return ERR_NULL;
}
@ -640,10 +652,9 @@ static bool already_in_reply(WU_RESULT& wu_result, SCHEDULER_REPLY& reply) {
// previously infeasible for some host
//
static void scan_work_array(
bool infeasible_only, double& seconds_to_fill, double& disk_available,
int& nresults,
WORK_REQ& wreq,
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM& platform,
SCHED_SHMEM& ss, bool& insufficient_disk
SCHED_SHMEM& ss
) {
int i, retval, n;
WORKUNIT wu;
@ -651,12 +662,12 @@ static void scan_work_array(
double wu_seconds_filled;
char buf[256];
if (disk_available < 0) insufficient_disk = true;
if (wreq.disk_available < 0) wreq.insufficient_disk = true;
for (i=0; i<ss.nwu_results; i++) {
if (seconds_to_fill <= 0) break;
if (disk_available <= 0) break;
if (wreq.seconds_to_fill <= 0) break;
if (wreq.disk_available <= 0) break;
WU_RESULT& wu_result = ss.wu_results[i];
@ -666,13 +677,13 @@ static void scan_work_array(
continue;
}
if (wu_result.workunit.rsc_disk_bound > disk_available) {
insufficient_disk = true;
if (wu_result.workunit.rsc_disk_bound > wreq.disk_available) {
wreq.insufficient_disk = true;
wu_result.infeasible_count++;
continue;
}
if (infeasible_only && wu_result.infeasible_count==0) {
if (wreq.infeasible_only && wu_result.infeasible_count==0) {
continue;
}
@ -713,7 +724,7 @@ static void scan_work_array(
// don't send if host can't handle it
//
wu = wu_result.workunit;
if (!wu_is_feasible(wu, reply.host)) {
if (!wu_is_feasible(wu, reply.host, wreq)) {
log_messages.printf(
SchedMessages::DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n",
reply.host.id, wu.id, wu.name
@ -726,7 +737,7 @@ static void scan_work_array(
// for the client's platform.
// Treat the same as the WU being infeasible
//
retval = add_wu_to_reply(wu, reply, platform, ss);
retval = add_wu_to_reply(wu, reply, platform, ss, wreq);
if (retval) {
wu_result.infeasible_count++;
continue;
@ -749,7 +760,7 @@ static void scan_work_array(
// ****** HERE WE'VE COMMITTED TO SENDING THIS RESULT TO HOST ******
//
disk_available -= wu.rsc_disk_bound;
wreq.disk_available -= wu.rsc_disk_bound;
// update the result in DB
//
@ -793,10 +804,10 @@ static void scan_work_array(
}
reply.insert_result(result);
seconds_to_fill -= wu_seconds_filled;
wreq.seconds_to_fill -= wu_seconds_filled;
nresults++;
if (nresults == MAX_WUS_TO_SEND) break;
wreq.nresults++;
if (wreq.nresults == MAX_WUS_TO_SEND) break;
}
}
@ -805,49 +816,63 @@ int send_work(
SCHED_SHMEM& ss
) {
int nresults = 0;
double seconds_to_fill;
double disk_available;
bool insufficient_disk = false;
WORK_REQ wreq;
disk_available = max_allowable_disk(reply.user, sreq);
wreq.disk_available = max_allowable_disk(reply.user, sreq);
wreq.insufficient_disk = false;
wreq.insufficient_mem = false;
wreq.insufficient_speed = false;
wreq.no_app_version = false;
log_messages.printf(
SchedMessages::NORMAL,
"[HOST#%d] got request for %d seconds of work; available disk %f GB\n",
reply.host.id, sreq.work_req_seconds, disk_available/1e9
reply.host.id, sreq.work_req_seconds, wreq.disk_available/1e9
);
if (sreq.work_req_seconds <= 0) return 0;
seconds_to_fill = sreq.work_req_seconds;
if (seconds_to_fill > MAX_SECONDS_TO_SEND) {
seconds_to_fill = MAX_SECONDS_TO_SEND;
wreq.seconds_to_fill = sreq.work_req_seconds;
if (wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
}
if (seconds_to_fill < MIN_SECONDS_TO_SEND) {
seconds_to_fill = MIN_SECONDS_TO_SEND;
if (wreq.seconds_to_fill < MIN_SECONDS_TO_SEND) {
wreq.seconds_to_fill = MIN_SECONDS_TO_SEND;
}
// give priority to results that were infeasible for some other host
//
scan_work_array(
true, seconds_to_fill, disk_available,
nresults, sreq, reply, platform, ss, insufficient_disk
);
scan_work_array(
false, seconds_to_fill, disk_available,
nresults, sreq, reply, platform, ss, insufficient_disk
);
wreq.infeasible_only = true;
scan_work_array(wreq, sreq, reply, platform, ss);
wreq.infeasible_only = false;
scan_work_array(wreq, sreq, reply, platform, ss);
log_messages.printf(
SchedMessages::NORMAL, "[HOST#%d] Sent %d results\n",
reply.host.id, nresults
);
if (nresults == 0) {
if (wreq.nresults == 0) {
strcpy(reply.message, "No work available");
if (insufficient_disk) {
if (wreq.no_app_version) {
strcat(reply.message,
" (you may need to increase disk limits in global prefs)"
" (there was work for other platforms)"
);
}
if (wreq.insufficient_disk) {
strcat(reply.message,
" (there was work but you don't have enough disk space allocated)"
);
}
if (wreq.insufficient_mem) {
strcat(reply.message,
" (there was work but your computer doesn't have enough memory)"
);
}
if (wreq.insufficient_mem) {
strcat(reply.message,
" (there was work but your computer would not finish it before it is due"
);
}
strcpy(reply.message_priority, "low");

View File

@ -57,11 +57,12 @@ int main() {
FILE* fin, *fout;
int i, retval, pid;
char req_path[256], reply_path[256], path[256];
SCHED_SHMEM* ssp;
SCHED_SHMEM* ssp=0;
void* p;
unsigned int counter=0;
char* code_sign_key;
bool found;
bool project_stopped = false;
if (!freopen(STDERR_FILENAME, "a", stderr)) {
fprintf(stderr, "Can't redirect stderr\n");
@ -79,45 +80,52 @@ int main() {
sprintf(path, "%s/code_sign_public", config.key_dir);
retval = read_file_malloc(path, code_sign_key);
if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "Can't read code sign key file (%s)\n", path);
log_messages.printf(SchedMessages::CRITICAL,
"Can't read code sign key file (%s)\n", path
);
exit(1);
}
retval = attach_shmem(config.shmem_key, &p);
if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "Can't attach shmem (feeder not running?)\n");
exit(1);
}
ssp = (SCHED_SHMEM*)p;
retval = ssp->verify();
if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "shmem has wrong struct sizes - recompile\n");
exit(1);
}
log_messages.printf(SchedMessages::CRITICAL,
"Can't attach shmem (feeder not running?)\n"
);
project_stopped = true;
} else {
ssp = (SCHED_SHMEM*)p;
retval = ssp->verify();
if (retval) {
log_messages.printf(SchedMessages::CRITICAL,
"shmem has wrong struct sizes - recompile\n"
);
exit(1);
}
for (i=0; i<10; i++) {
if (ssp->ready) break;
log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n");
sleep(1);
}
if (!ssp->ready) {
log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n");
exit(1);
for (i=0; i<10; i++) {
if (ssp->ready) break;
log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n");
sleep(1);
}
if (!ssp->ready) {
log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n");
exit(1);
}
}
retval = boinc_db.open(config.db_name, config.db_passwd);
if (retval) {
log_messages.printf(SchedMessages::CRITICAL, "can't open database\n");
exit(1);
}
found = false;
while (!gproject.enumerate("")) {
found = true;
}
if (!found) {
log_messages.printf(SchedMessages::CRITICAL, "can't find project\n");
exit(1);
project_stopped = true;
} else {
found = false;
while (!gproject.enumerate("")) {
found = true;
}
if (!found) {
log_messages.printf(SchedMessages::CRITICAL, "can't find project\n");
exit(1);
}
}
pid = getpid();
@ -126,6 +134,14 @@ int main() {
counter++;
#endif
printf("Content-type: text/plain\n\n");
if (project_stopped) {
printf(
"<scheduler_reply>\n"
" <message priority=\"low\">Project is temporarily shut down for maintenance</message>\n"
"</scheduler_reply>\n"
);
goto done;
}
if (use_files) {
// the code below is convoluted because,
// instead of going from stdin to stdout directly,
@ -167,6 +183,7 @@ int main() {
} else {
handle_request(stdin, stdout, *ssp, code_sign_key);
}
done:
#ifdef _USING_FCGI_
}
#endif