mirror of https://github.com/BOINC/boinc.git
*** empty log message ***
svn path=/trunk/boinc/; revision=2660
This commit is contained in:
parent
032644aaaf
commit
133a8d8db7
|
@ -7695,3 +7695,23 @@ David 11 Nov 2003
|
|||
sched_shmem.C
|
||||
tools/
|
||||
backend_lib.C
|
||||
|
||||
David 11 Nov 2003
|
||||
- modify the scheduler so that if a client requests work,
|
||||
and there are unsent results, but none can be sent,
|
||||
then the client gets an explanatory message: either
|
||||
- no app version available
|
||||
- insufficient disk
|
||||
- insufficient memory
|
||||
- host is too slow
|
||||
This involved creating a WORK_REQ structure that gets passed
|
||||
around, to avoid huge numbers of arguments
|
||||
- modify the scheduler so that if either
|
||||
1) the shared-mem segment is absent or
|
||||
2) the DB can't be opened
|
||||
then it writes a "project temporarily down for maintenance"
|
||||
message instead of just exiting
|
||||
|
||||
sched/
|
||||
handle_request.C
|
||||
main.C
|
||||
|
|
|
@ -47,6 +47,21 @@ const int MAX_WUS_TO_SEND = 10;
|
|||
|
||||
const double COBBLESTONE_FACTOR = 300.0;
|
||||
|
||||
// Bundles the state of one client work request so it can be passed
// between the scheduling functions as a single argument.
//
struct WORK_REQ {
    // when true, a scan skips results whose infeasible_count is zero
    bool infeasible_only;
    // seconds of work still to be assigned; scanning stops at <= 0
    double seconds_to_fill;
    // disk budget remaining; reduced by each sent WU's rsc_disk_bound
    double disk_available;
    // number of results added to the reply so far
    int nresults;

    // the following flags are set whenever a result is infeasible;
    // used to construct explanatory message to user
    //
    bool insufficient_disk;
    bool insufficient_mem;
    bool insufficient_speed;
    bool no_app_version;

    // Start every request from a known-empty state.
    // Without this, a plain "WORK_REQ wreq;" leaves nresults (and any
    // field the caller forgets to assign) indeterminate, so the
    // "nresults == 0" / "nresults == MAX_WUS_TO_SEND" checks would
    // read garbage.
    //
    WORK_REQ() :
        infeasible_only(false),
        seconds_to_fill(0),
        disk_available(0),
        nresults(0),
        insufficient_disk(false),
        insufficient_mem(false),
        insufficient_speed(false),
        no_app_version(false)
    {}
};
|
||||
|
||||
// compute the max disk usage we can request of the host
|
||||
//
|
||||
double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) {
|
||||
|
@ -58,7 +73,7 @@ double max_allowable_disk(USER& user, SCHEDULER_REQUEST& req) {
|
|||
|
||||
// fill in default values for missing prefs
|
||||
//
|
||||
if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.01; // 10 MB
|
||||
if (prefs.disk_max_used_gb == 0) prefs.disk_max_used_gb = 0.1; // 100 MB
|
||||
if (prefs.disk_max_used_pct == 0) prefs.disk_max_used_pct = 10;
|
||||
// min_free_gb can be zero
|
||||
|
||||
|
@ -124,19 +139,13 @@ inline double estimate_wallclock_duration(WORKUNIT& wu, HOST& host) {
|
|||
|
||||
// return true if the WU can be executed on the host
|
||||
//
|
||||
bool wu_is_feasible(WORKUNIT& wu, HOST& host) {
|
||||
if(host.d_free && wu.rsc_disk_bound > host.d_free) {
|
||||
log_messages.printf(
|
||||
SchedMessages::DEBUG, "[WU#%d %s] needs %f disk; [HOST#%d] has %f\n",
|
||||
wu.id, wu.name, wu.rsc_disk_bound, host.id, host.d_free
|
||||
);
|
||||
return false;
|
||||
}
|
||||
bool wu_is_feasible(WORKUNIT& wu, HOST& host, WORK_REQ& wreq) {
|
||||
if (host.m_nbytes && wu.rsc_memory_bound > host.m_nbytes) {
|
||||
log_messages.printf(
|
||||
SchedMessages::DEBUG, "[WU#%d %s] needs %f mem; [HOST#%d] has %f\n",
|
||||
wu.id, wu.name, wu.rsc_memory_bound, host.id, host.m_nbytes
|
||||
);
|
||||
wreq.insufficient_mem = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -148,6 +157,7 @@ bool wu_is_feasible(WORKUNIT& wu, HOST& host) {
|
|||
SchedMessages::DEBUG, "[WU#%d %s] needs requires %d seconds on [HOST#%d]; delay_bound is %d\n",
|
||||
wu.id, wu.name, (int)wu_wallclock_time, host.id, wu.delay_bound
|
||||
);
|
||||
wreq.insufficient_speed = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -204,7 +214,8 @@ int insert_wu_tags(WORKUNIT& wu, APP& app) {
|
|||
// Add the app and app_version to the reply also.
|
||||
//
|
||||
int add_wu_to_reply(
|
||||
WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss
|
||||
WORKUNIT& wu, SCHEDULER_REPLY& reply, PLATFORM& platform, SCHED_SHMEM& ss,
|
||||
WORK_REQ& wreq
|
||||
) {
|
||||
APP* app;
|
||||
APP_VERSION* avp;
|
||||
|
@ -221,10 +232,11 @@ int add_wu_to_reply(
|
|||
avp = ss.lookup_app_version(app->id, platform.id, app->min_version);
|
||||
if (!avp) {
|
||||
log_messages.printf(
|
||||
SchedMessages::CRITICAL,
|
||||
"Can't find app version: APP#%d PLATFORM#%d min_version %d\n",
|
||||
SchedMessages::DEBUG,
|
||||
"no app version available: APP#%d PLATFORM#%d min_version %d\n",
|
||||
app->id, platform.id, app->min_version
|
||||
);
|
||||
wreq.no_app_version = true;
|
||||
return ERR_NULL;
|
||||
}
|
||||
|
||||
|
@ -640,10 +652,9 @@ static bool already_in_reply(WU_RESULT& wu_result, SCHEDULER_REPLY& reply) {
|
|||
// previously infeasible for some host
|
||||
//
|
||||
static void scan_work_array(
|
||||
bool infeasible_only, double& seconds_to_fill, double& disk_available,
|
||||
int& nresults,
|
||||
WORK_REQ& wreq,
|
||||
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM& platform,
|
||||
SCHED_SHMEM& ss, bool& insufficient_disk
|
||||
SCHED_SHMEM& ss
|
||||
) {
|
||||
int i, retval, n;
|
||||
WORKUNIT wu;
|
||||
|
@ -651,12 +662,12 @@ static void scan_work_array(
|
|||
double wu_seconds_filled;
|
||||
char buf[256];
|
||||
|
||||
if (disk_available < 0) insufficient_disk = true;
|
||||
if (wreq.disk_available < 0) wreq.insufficient_disk = true;
|
||||
|
||||
for (i=0; i<ss.nwu_results; i++) {
|
||||
|
||||
if (seconds_to_fill <= 0) break;
|
||||
if (disk_available <= 0) break;
|
||||
if (wreq.seconds_to_fill <= 0) break;
|
||||
if (wreq.disk_available <= 0) break;
|
||||
|
||||
WU_RESULT& wu_result = ss.wu_results[i];
|
||||
|
||||
|
@ -666,13 +677,13 @@ static void scan_work_array(
|
|||
continue;
|
||||
}
|
||||
|
||||
if (wu_result.workunit.rsc_disk_bound > disk_available) {
|
||||
insufficient_disk = true;
|
||||
if (wu_result.workunit.rsc_disk_bound > wreq.disk_available) {
|
||||
wreq.insufficient_disk = true;
|
||||
wu_result.infeasible_count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (infeasible_only && wu_result.infeasible_count==0) {
|
||||
if (wreq.infeasible_only && wu_result.infeasible_count==0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -713,7 +724,7 @@ static void scan_work_array(
|
|||
// don't send if host can't handle it
|
||||
//
|
||||
wu = wu_result.workunit;
|
||||
if (!wu_is_feasible(wu, reply.host)) {
|
||||
if (!wu_is_feasible(wu, reply.host, wreq)) {
|
||||
log_messages.printf(
|
||||
SchedMessages::DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n",
|
||||
reply.host.id, wu.id, wu.name
|
||||
|
@ -726,7 +737,7 @@ static void scan_work_array(
|
|||
// for the client's platform.
|
||||
// Treat the same as the WU being infeasible
|
||||
//
|
||||
retval = add_wu_to_reply(wu, reply, platform, ss);
|
||||
retval = add_wu_to_reply(wu, reply, platform, ss, wreq);
|
||||
if (retval) {
|
||||
wu_result.infeasible_count++;
|
||||
continue;
|
||||
|
@ -749,7 +760,7 @@ static void scan_work_array(
|
|||
// ****** HERE WE'VE COMMITTED TO SENDING THIS RESULT TO HOST ******
|
||||
//
|
||||
|
||||
disk_available -= wu.rsc_disk_bound;
|
||||
wreq.disk_available -= wu.rsc_disk_bound;
|
||||
|
||||
// update the result in DB
|
||||
//
|
||||
|
@ -793,10 +804,10 @@ static void scan_work_array(
|
|||
}
|
||||
reply.insert_result(result);
|
||||
|
||||
seconds_to_fill -= wu_seconds_filled;
|
||||
wreq.seconds_to_fill -= wu_seconds_filled;
|
||||
|
||||
nresults++;
|
||||
if (nresults == MAX_WUS_TO_SEND) break;
|
||||
wreq.nresults++;
|
||||
if (wreq.nresults == MAX_WUS_TO_SEND) break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -805,49 +816,63 @@ int send_work(
|
|||
SCHED_SHMEM& ss
|
||||
) {
|
||||
int nresults = 0;
|
||||
double seconds_to_fill;
|
||||
double disk_available;
|
||||
bool insufficient_disk = false;
|
||||
WORK_REQ wreq;
|
||||
|
||||
disk_available = max_allowable_disk(reply.user, sreq);
|
||||
wreq.disk_available = max_allowable_disk(reply.user, sreq);
|
||||
wreq.insufficient_disk = false;
|
||||
wreq.insufficient_mem = false;
|
||||
wreq.insufficient_speed = false;
|
||||
wreq.no_app_version = false;
|
||||
|
||||
log_messages.printf(
|
||||
SchedMessages::NORMAL,
|
||||
"[HOST#%d] got request for %d seconds of work; available disk %f GB\n",
|
||||
reply.host.id, sreq.work_req_seconds, disk_available/1e9
|
||||
reply.host.id, sreq.work_req_seconds, wreq.disk_available/1e9
|
||||
);
|
||||
|
||||
if (sreq.work_req_seconds <= 0) return 0;
|
||||
|
||||
seconds_to_fill = sreq.work_req_seconds;
|
||||
if (seconds_to_fill > MAX_SECONDS_TO_SEND) {
|
||||
seconds_to_fill = MAX_SECONDS_TO_SEND;
|
||||
wreq.seconds_to_fill = sreq.work_req_seconds;
|
||||
if (wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
|
||||
wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
|
||||
}
|
||||
if (seconds_to_fill < MIN_SECONDS_TO_SEND) {
|
||||
seconds_to_fill = MIN_SECONDS_TO_SEND;
|
||||
if (wreq.seconds_to_fill < MIN_SECONDS_TO_SEND) {
|
||||
wreq.seconds_to_fill = MIN_SECONDS_TO_SEND;
|
||||
}
|
||||
|
||||
// give priority to results that were infeasible for some other host
|
||||
//
|
||||
scan_work_array(
|
||||
true, seconds_to_fill, disk_available,
|
||||
nresults, sreq, reply, platform, ss, insufficient_disk
|
||||
);
|
||||
scan_work_array(
|
||||
false, seconds_to_fill, disk_available,
|
||||
nresults, sreq, reply, platform, ss, insufficient_disk
|
||||
);
|
||||
wreq.infeasible_only = true;
|
||||
scan_work_array(wreq, sreq, reply, platform, ss);
|
||||
|
||||
wreq.infeasible_only = false;
|
||||
scan_work_array(wreq, sreq, reply, platform, ss);
|
||||
|
||||
// Log how many results this request actually received.
// The count lives in wreq.nresults (incremented by scan_work_array);
// the former local "nresults" is no longer updated.
//
log_messages.printf(
    SchedMessages::NORMAL, "[HOST#%d] Sent %d results\n",
    reply.host.id, wreq.nresults
);
|
||||
|
||||
if (nresults == 0) {
|
||||
if (wreq.nresults == 0) {
|
||||
strcpy(reply.message, "No work available");
|
||||
if (insufficient_disk) {
|
||||
if (wreq.no_app_version) {
|
||||
strcat(reply.message,
|
||||
" (you may need to increase disk limits in global prefs)"
|
||||
" (there was work for other platforms)"
|
||||
);
|
||||
}
|
||||
if (wreq.insufficient_disk) {
|
||||
strcat(reply.message,
|
||||
" (there was work but you don't have enough disk space allocated)"
|
||||
);
|
||||
}
|
||||
if (wreq.insufficient_mem) {
|
||||
strcat(reply.message,
|
||||
" (there was work but your computer doesn't have enough memory)"
|
||||
);
|
||||
}
|
||||
// Host was judged too slow to finish some WU within its delay bound
// (flag set by wu_is_feasible). This must test insufficient_speed,
// not insufficient_mem, which is already reported just above.
//
if (wreq.insufficient_speed) {
    strcat(reply.message,
        " (there was work but your computer would not finish it before it is due)"
    );
}
|
||||
strcpy(reply.message_priority, "low");
|
||||
|
|
75
sched/main.C
75
sched/main.C
|
@ -57,11 +57,12 @@ int main() {
|
|||
FILE* fin, *fout;
|
||||
int i, retval, pid;
|
||||
char req_path[256], reply_path[256], path[256];
|
||||
SCHED_SHMEM* ssp;
|
||||
SCHED_SHMEM* ssp=0;
|
||||
void* p;
|
||||
unsigned int counter=0;
|
||||
char* code_sign_key;
|
||||
bool found;
|
||||
bool project_stopped = false;
|
||||
|
||||
if (!freopen(STDERR_FILENAME, "a", stderr)) {
|
||||
fprintf(stderr, "Can't redirect stderr\n");
|
||||
|
@ -79,45 +80,52 @@ int main() {
|
|||
sprintf(path, "%s/code_sign_public", config.key_dir);
|
||||
retval = read_file_malloc(path, code_sign_key);
|
||||
if (retval) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "Can't read code sign key file (%s)\n", path);
|
||||
log_messages.printf(SchedMessages::CRITICAL,
|
||||
"Can't read code sign key file (%s)\n", path
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
retval = attach_shmem(config.shmem_key, &p);
|
||||
if (retval) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "Can't attach shmem (feeder not running?)\n");
|
||||
exit(1);
|
||||
}
|
||||
ssp = (SCHED_SHMEM*)p;
|
||||
retval = ssp->verify();
|
||||
if (retval) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "shmem has wrong struct sizes - recompile\n");
|
||||
exit(1);
|
||||
}
|
||||
log_messages.printf(SchedMessages::CRITICAL,
|
||||
"Can't attach shmem (feeder not running?)\n"
|
||||
);
|
||||
project_stopped = true;
|
||||
} else {
|
||||
ssp = (SCHED_SHMEM*)p;
|
||||
retval = ssp->verify();
|
||||
if (retval) {
|
||||
log_messages.printf(SchedMessages::CRITICAL,
|
||||
"shmem has wrong struct sizes - recompile\n"
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (i=0; i<10; i++) {
|
||||
if (ssp->ready) break;
|
||||
log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n");
|
||||
sleep(1);
|
||||
}
|
||||
if (!ssp->ready) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n");
|
||||
exit(1);
|
||||
for (i=0; i<10; i++) {
|
||||
if (ssp->ready) break;
|
||||
log_messages.printf(SchedMessages::DEBUG, "waiting for ready flag\n");
|
||||
sleep(1);
|
||||
}
|
||||
if (!ssp->ready) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "feeder doesn't seem to be running\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
retval = boinc_db.open(config.db_name, config.db_passwd);
|
||||
if (retval) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "can't open database\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
found = false;
|
||||
while (!gproject.enumerate("")) {
|
||||
found = true;
|
||||
}
|
||||
if (!found) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "can't find project\n");
|
||||
exit(1);
|
||||
project_stopped = true;
|
||||
} else {
|
||||
found = false;
|
||||
while (!gproject.enumerate("")) {
|
||||
found = true;
|
||||
}
|
||||
if (!found) {
|
||||
log_messages.printf(SchedMessages::CRITICAL, "can't find project\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
pid = getpid();
|
||||
|
@ -126,6 +134,14 @@ int main() {
|
|||
counter++;
|
||||
#endif
|
||||
printf("Content-type: text/plain\n\n");
|
||||
if (project_stopped) {
|
||||
printf(
|
||||
"<scheduler_reply>\n"
|
||||
" <message priority=\"low\">Project is temporarily shut down for maintenance</message>\n"
|
||||
"</scheduler_reply>\n"
|
||||
);
|
||||
goto done;
|
||||
}
|
||||
if (use_files) {
|
||||
// the code below is convoluted because,
|
||||
// instead of going from stdin to stdout directly,
|
||||
|
@ -167,6 +183,7 @@ int main() {
|
|||
} else {
|
||||
handle_request(stdin, stdout, *ssp, code_sign_key);
|
||||
}
|
||||
done:
|
||||
#ifdef _USING_FCGI_
|
||||
}
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue