- scheduler:

- if WU is infeasible, print message instead of number
    - remove useless messages
    - remove EDF simulation printfs
    - don't update nresults_on_host in resend_lost_work()
        (it's already done in add_result_to_reply())

svn path=/trunk/boinc/; revision=14336
This commit is contained in:
David Anderson 2007-11-30 23:02:55 +00:00
parent 6b2c7f38a2
commit df8cbdb294
7 changed files with 159 additions and 139 deletions

View File

@ -11964,3 +11964,18 @@ Rom 30 Nov 2007
boinctray.vcproj
win_build/installerv2/
BOINC.ism
David 30 Nov 2007
- scheduler:
- if WU is infeasible, print message instead of number
- remove useless messages
- remove EDF simulation printfs
- don't update nresults_on_host in resend_lost_work()
(it's already done in add_result_to_reply())
sched/
edf_sim.C
handle_request.C
sched_array.C
sched_resend.C
sched_send.C,h

View File

@ -25,16 +25,6 @@
using std::vector;
//#define TEST
//#define DEBUG
#ifdef DEBUG
#define INFO0
#define INFO1
#define INFO2
#else
#define INFO0 //
#define INFO1 //
#define INFO2 //
#endif
// 0 shows initial workload and candidate decisions
// 1 shows function calls and results of sim: make/miss deadline
@ -53,7 +43,9 @@ void mark_edf_misses (int ncpus, vector<IP_RESULT>& ip_results){
double booked_to[128];
int j;
INFO1 fprintf(stderr, "mark_edf_misses\n");
#ifdef DEBUG
fprintf(stderr, "mark_edf_misses\n");
#endif
// keeps track of when each cpu is next free
//
@ -82,20 +74,26 @@ void mark_edf_misses (int ncpus, vector<IP_RESULT>& ip_results){
}
booked_to[lowest_booked_cpu] += r.cpu_time_remaining;
INFO2 fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
#ifdef DEBUG
fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu]
);
#endif
if (booked_to[lowest_booked_cpu] > r.computation_deadline) {
r.misses_deadline = true;
r.estimated_completion_time = booked_to[lowest_booked_cpu];
INFO1 fprintf(stderr, " %s misses_deadline; est completion %f\n",
#ifdef DEBUG
fprintf(stderr, " %s misses_deadline; est completion %f\n",
r.name, booked_to[lowest_booked_cpu]
);
#endif
} else {
r.misses_deadline = false;
INFO1 fprintf(stderr, " %s makes deadline; est completion %f\n",
#ifdef DEBUG
fprintf(stderr, " %s makes deadline; est completion %f\n",
r.name, booked_to[lowest_booked_cpu]
);
#endif
// if result doesn't miss its deadline,
// then the estimated_completion_time is of no use
}
@ -112,13 +110,17 @@ void init_ip_results(
){
unsigned int i;
INFO0 fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus);
#ifdef DEBUG
fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus);
#endif
for (i=0; i<ip_results.size(); i++) {
IP_RESULT& r = ip_results[i];
r.computation_deadline = r.report_deadline - work_buf_min;
INFO0 fprintf(stderr, " %s: deadline %.2f cpu %.2f\n",
#ifdef DEBUG
fprintf(stderr, " %s: deadline %.2f cpu %.2f\n",
r.name, r.computation_deadline, r.cpu_time_remaining
);
#endif
}
// run edf simulation to determine whether any results miss their deadline
@ -189,10 +191,12 @@ bool check_candidate (
double booked_to[128]; // keeps track of when each cpu is free
int j;
INFO0 fprintf(stderr, "check_candidate %s: dl %f cpu %f\n",
#ifdef DEBUG
fprintf(stderr, "check_candidate %s: dl %f cpu %f\n",
candidate.name, candidate.computation_deadline,
candidate.cpu_time_remaining
);
#endif
for (j=0; j<ncpus; j++) {
booked_to[j] = 0;
@ -219,9 +223,11 @@ bool check_candidate (
}
}
booked_to[lowest_booked_cpu] += r.cpu_time_remaining;
INFO2 fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
#ifdef DEBUG
fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu]
);
#endif
// return false if completion time is > computation_deadline AND
// result would not have missed deadline to begin with
@ -229,9 +235,11 @@ bool check_candidate (
if (booked_to[lowest_booked_cpu] > r.computation_deadline
&& !r.misses_deadline
) {
INFO0 fprintf(stderr, " cand. fails; %s now misses deadline: %f\n",
#ifdef DEBUG
fprintf(stderr, " cand. fails; %s now misses deadline: %f\n",
r.name, booked_to[lowest_booked_cpu]
);
#endif
return false;
}
// check a late result (i.e., one that would have missed its
@ -240,11 +248,15 @@ bool check_candidate (
if (r.misses_deadline
&& booked_to[lowest_booked_cpu] > r.estimated_completion_time
){
INFO1 fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name);
#ifdef DEBUG
fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name);
#endif
return false;
}
}
INFO1 fprintf(stderr, " cand. succeeds\n");
#ifdef DEBUG
fprintf(stderr, " cand. succeeds\n");
#endif
return true;
}

View File

@ -530,7 +530,7 @@ static int update_host_record(HOST& initial_host, HOST& xhost, USER& user) {
return 0;
}
// Figure out which of the results the user currently has
// Figure out which of the results the host currently has
// should be aborted outright, or aborted if not started yet
//
int send_result_abort(
@ -556,10 +556,6 @@ int send_result_abort(
result_names.append("'");
result_names.append(orp.name);
result_names.append("'");
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"Result is on [HOST#%d]: %s\n",
reply.host.id, orp.name.c_str()
);
}
// query the db for the results and set the appropriate flag
@ -1426,14 +1422,15 @@ void process_request(
handle_results(sreq, reply);
reply.wreq.nresults_on_host = sreq.other_results.size();
if (config.resend_lost_results && sreq.have_other_results_list) {
if (resend_lost_work(sreq, reply, platforms, ss)) {
ok_to_send_work = false;
if (sreq.have_other_results_list) {
if (config.resend_lost_results) {
if (resend_lost_work(sreq, reply, platforms, ss)) {
ok_to_send_work = false;
}
}
if (config.send_result_abort) {
send_result_abort(sreq, reply, ss);
}
}
if (config.send_result_abort && sreq.have_other_results_list) {
send_result_abort(sreq, reply, ss);
}
// if last RPC was within config.min_sendwork_interval, don't send work

View File

@ -116,8 +116,8 @@ void scan_work_array(
retval = wu_is_infeasible(wu, sreq, reply, *app);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %d\n",
reply.host.id, wu.id, wu.name, retval
SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
reply.host.id, wu.id, wu.name, infeasible_string(retval)
);
continue;
}

View File

@ -54,19 +54,19 @@ static int possibly_give_result_new_deadline(
DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REPLY& reply
) {
const double resend_frac = 0.5; // range [0, 1)
int result_sent_time = time(0);
int result_report_deadline = result_sent_time + (int)(resend_frac*(result.report_deadline - result.sent_time));
int now = time(0);
int result_report_deadline = now + (int)(resend_frac*(result.report_deadline - result.sent_time));
if (result_report_deadline < result.report_deadline) {
result_report_deadline = result.report_deadline;
}
if (result_report_deadline > result_sent_time + wu.delay_bound) {
result_report_deadline = result_sent_time + wu.delay_bound;
if (result_report_deadline > now + wu.delay_bound) {
result_report_deadline = now + wu.delay_bound;
}
// If infeasible, return without modifying result
//
if (estimate_cpu_duration(wu, reply) > result_report_deadline-result_sent_time) {
if (estimate_cpu_duration(wu, reply) > result_report_deadline-now) {
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[RESULT#%d] [HOST#%d] not resending lost result: can't complete in time\n",
@ -83,11 +83,17 @@ static int possibly_give_result_new_deadline(
result.id, reply.host.id,
result_report_deadline==result.report_deadline?"NO update to":"Updated"
);
result.sent_time = result_sent_time;
result.sent_time = now;
result.report_deadline = result_report_deadline;
return 0;
}
// resend any jobs that:
// 1) we already sent to this host;
// 2) are still in progress (i.e. haven't timed out) and
// 3) aren't present on the host
// Return true if there were any such jobs
//
bool resend_lost_work(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply,
PLATFORM_LIST& platforms, SCHED_SHMEM& ss
@ -96,116 +102,99 @@ bool resend_lost_work(
std::vector<DB_RESULT>results;
unsigned int i;
char buf[256];
char warning_msg[256];
bool did_any = false;
int num_to_resend=0;
int num_eligible_to_resend=0;
int num_resent=0;
int num_on_host=0;
APP* app;
APP_VERSION* avp;
int retval;
// print list of results on host
//
for (i=0; i<sreq.other_results.size(); i++) {
OTHER_RESULT& orp=sreq.other_results[i];
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"Result is on [HOST#%d]: %s\n",
reply.host.id, orp.name.c_str()
);
}
sprintf(buf, " where hostid=%d and server_state=%d ",
reply.host.id, RESULT_SERVER_STATE_IN_PROGRESS
);
while (!result.enumerate(buf)) {
bool found = false;
num_on_host++;
for (i=0; i<sreq.other_results.size(); i++) {
OTHER_RESULT& orp = sreq.other_results[i];
if (!strcmp(orp.name.c_str(), result.name)) {
found = true;
break;
}
}
if (!found) {
num_to_resend++;
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] found lost [RESULT#%d]: %s\n",
reply.host.id, result.id, result.name
if (found) continue;
num_eligible_to_resend++;
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] found lost [RESULT#%d]: %s\n",
reply.host.id, result.id, result.name
);
DB_WORKUNIT wu;
retval = wu.lookup_id(result.workunitid);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] WU not found for [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}
DB_WORKUNIT wu;
retval = wu.lookup_id(result.workunitid);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] WU not found for [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}
reply.wreq.core_client_version =
sreq.core_client_major_version*100 + sreq.core_client_minor_version;
reply.wreq.core_client_version =
sreq.core_client_major_version*100 + sreq.core_client_minor_version;
retval = get_app_version(
wu, app, avp, sreq, reply, platforms, ss
retval = get_app_version(
wu, app, avp, sreq, reply, platforms, ss
);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] no app version [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}
// If time is too close to the deadline,
// or we already have a canonical result,
// or WU error flag is set,
// then don't bother to resend this result.
// Instead make it time out right away
// so that the transitioner does 'the right thing'.
//
if (
wu.error_mask ||
wu.canonical_resultid ||
possibly_give_result_new_deadline(result, wu, reply)
) {
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
reply.host.id, result.id
);
result.report_deadline = time(0)-1;
retval = result.mark_as_sent(result.server_state);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] no app version [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}
// If time is too close to the deadline,
// or we already have a canonical result,
// or WU error flag is set,
// then don't bother to resend this result.
// Instead make it time out right away
// so that the transitioner does 'the right thing'.
//
char warning_msg[256];
if (
wu.error_mask ||
wu.canonical_resultid ||
possibly_give_result_new_deadline(result, wu, reply)
) {
result.report_deadline = time(0);
retval = result.mark_as_sent(result.server_state);
if (retval==ERR_DB_NOT_FOUND) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
result.id, reply.host.id
);
} else if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_result: can't update result deadline: %d\n", retval
);
}
if (retval) continue;
retval = update_wu_transition_time(wu, result.report_deadline);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_result: can't update WU transition time: %d\n", retval
);
continue;
}
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
reply.host.id, result.id
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_work: can't update result deadline: %d\n", retval
);
sprintf(warning_msg, "Didn't resend lost result %s (expired)", result.name);
USER_MESSAGE um(warning_msg, "high");
reply.insert_message(um);
continue;
}
retval = update_wu_transition_time(wu, result.report_deadline);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_result: can't update WU transition time: %d\n", retval
);
continue;
}
sprintf(warning_msg,
"Didn't resend lost result %s (expired)", result.name
);
USER_MESSAGE um(warning_msg, "high");
reply.insert_message(um);
} else {
retval = add_result_to_reply(
result, wu, sreq, reply, platforms, app, avp
);
@ -224,15 +213,9 @@ bool resend_lost_work(
}
}
reply.wreq.nresults_on_host = num_on_host;
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] %d results in progress, set for later checking\n",
reply.host.id, num_on_host
);
if (num_to_resend) {
if (num_eligible_to_resend) {
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_to_resend, num_resent
"[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_eligible_to_resend, num_resent
);
}

View File

@ -56,6 +56,19 @@ using namespace std;
#define FCGI_ToFILE(x) (x)
#endif
// Map an INFEASIBLE_* code to a short human-readable reason,
// for use in log messages (e.g. the "WU is infeasible" message
// printed with the return value of wu_is_infeasible()).
// Any code not listed below yields "Unknown".
// The returned pointer refers to a string literal; callers must not free it.
//
const char* infeasible_string(int code) {
    if (code == INFEASIBLE_MEM) {
        return "Not enough memory";
    }
    if (code == INFEASIBLE_DISK) {
        return "Not enough disk";
    }
    if (code == INFEASIBLE_CPU) {
        return "CPU too slow";
    }
    if (code == INFEASIBLE_APP_SETTING) {
        return "App not selected";
    }
    if (code == INFEASIBLE_WORKLOAD) {
        return "Existing workload";
    }
    if (code == INFEASIBLE_DUP) {
        return "Already in reply";
    }
    if (code == INFEASIBLE_HR) {
        return "Homogeneous redundancy";
    }
    return "Unknown";
}
const int MIN_SECONDS_TO_SEND = 0;
const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);
const int MAX_CPUS = 8;
@ -784,6 +797,8 @@ int add_result_to_reply(
int delay_bound = wu.delay_bound;
if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
// We are sending this result for the first time
//
// If the workunit needs reliable and is being sent to a reliable host,
// then shorten the delay bound by the percent specified
//
@ -793,18 +808,16 @@ int add_result_to_reply(
}
}
// We are sending this result for the first time
//
result.report_deadline = result.sent_time + delay_bound;
result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
} else {
// Result was ALREADY sent to this host but never arrived.
// So we are resending it.
// result.report_deadline and time_sent
// have already been updated before this function was called.
// Result was already sent to this host but was lost,
// so we are resending it.
//
resent_result = true;
// TODO: explain the following
//
if (result.report_deadline < result.sent_time) {
result.report_deadline = result.sent_time + 10;
}

View File

@ -66,4 +66,4 @@ extern int update_wu_transition_time(WORKUNIT wu, time_t x);
extern void lock_sema();
extern void unlock_sema();
extern const char* infeasible_string(int);