mirror of https://github.com/BOINC/boinc.git
- scheduler:
    - if WU is infeasible, print message instead of number
    - remove useless messages
    - remove EDF simulation printfs
    - don't update nresults_on_host in resend_lost_work()
      (it's already done in add_result_to_reply())

svn path=/trunk/boinc/; revision=14336
This commit is contained in:
parent 6b2c7f38a2
commit df8cbdb294
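The first bullet ("print message instead of number") comes down to translating the scheduler's numeric infeasibility codes into readable text at the logging call site. A minimal self-contained sketch of that pattern follows; the enum values and main() below are illustrative stand-ins (the real INFEASIBLE_* constants live in the scheduler headers), while the switch mirrors the infeasible_string() helper that this commit adds to sched/sched_send.C:

    #include <cstdio>

    // Stand-in codes for illustration only; the scheduler defines the real
    // INFEASIBLE_* constants elsewhere.
    enum {
        INFEASIBLE_MEM = 1,
        INFEASIBLE_DISK,
        INFEASIBLE_CPU,
        INFEASIBLE_APP_SETTING,
        INFEASIBLE_WORKLOAD,
        INFEASIBLE_DUP,
        INFEASIBLE_HR
    };

    // Same shape as the infeasible_string() helper added in sched/sched_send.C:
    // map a numeric reason code to a log-friendly message.
    const char* infeasible_string(int code) {
        switch (code) {
        case INFEASIBLE_MEM: return "Not enough memory";
        case INFEASIBLE_DISK: return "Not enough disk";
        case INFEASIBLE_CPU: return "CPU too slow";
        case INFEASIBLE_APP_SETTING: return "App not selected";
        case INFEASIBLE_WORKLOAD: return "Existing workload";
        case INFEASIBLE_DUP: return "Already in reply";
        case INFEASIBLE_HR: return "Homogeneous redundancy";
        }
        return "Unknown";
    }

    int main() {
        // A log line now reads "WU is infeasible: CPU too slow" instead of "...: 3".
        std::printf("WU is infeasible: %s\n", infeasible_string(INFEASIBLE_CPU));
        return 0;
    }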
checkin_notes
@@ -11964,3 +11964,18 @@ Rom 30 Nov 2007
         boinctray.vcproj
     win_build/installerv2/
         BOINC.ism
+
+David 30 Nov 2007
+    - scheduler:
+        - if WU is infeasible, print message instead of number
+        - remove useless messages
+        - remove EDF simulations printfs
+        - don't update nresults_on_host in resend_lost_work()
+          (it's already done in add_result_to_reply())
+
+    sched/
+        edf_sim.C
+        handle_request.C
+        sched_array.C
+        sched_resend.C
+        sched_send.C,h

sched/edf_sim.C
@@ -25,16 +25,6 @@
 using std::vector;
 
-//#define TEST
 //#define DEBUG
-#ifdef DEBUG
-#define INFO0
-#define INFO1
-#define INFO2
-#else
-#define INFO0 //
-#define INFO1 //
-#define INFO2 //
-#endif
 
 // 0 shows initial workload and candidate decisions
 // 1 shows function calls results of sim: make/miss deadline
@@ -53,7 +43,9 @@ void mark_edf_misses (int ncpus, vector<IP_RESULT>& ip_results){
     double booked_to[128];
     int j;
 
-    INFO1 fprintf(stderr, "mark_edf_misses\n");
+#ifdef DEBUG
+    fprintf(stderr, "mark_edf_misses\n");
+#endif
 
     // keeps track of when each cpu is next free
     //
@@ -82,20 +74,26 @@ void mark_edf_misses (int ncpus, vector<IP_RESULT>& ip_results){
         }
 
         booked_to[lowest_booked_cpu] += r.cpu_time_remaining;
-        INFO2 fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
+#ifdef DEBUG
+        fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
             r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu]
         );
+#endif
         if (booked_to[lowest_booked_cpu] > r.computation_deadline) {
             r.misses_deadline = true;
             r.estimated_completion_time = booked_to[lowest_booked_cpu];
-            INFO1 fprintf(stderr, " %s misses_deadline; est completion %f\n",
+#ifdef DEBUG
+            fprintf(stderr, " %s misses_deadline; est completion %f\n",
                 r.name, booked_to[lowest_booked_cpu]
             );
+#endif
         } else {
             r.misses_deadline = false;
-            INFO1 fprintf(stderr, " %s makes deadline; est completion %f\n",
+#ifdef DEBUG
+            fprintf(stderr, " %s makes deadline; est completion %f\n",
                 r.name, booked_to[lowest_booked_cpu]
             );
+#endif
             // if result doesn't miss its deadline,
             // then the estimated_completion_time is of no use
         }
@@ -112,13 +110,17 @@ void init_ip_results(
 ){
     unsigned int i;
 
-    INFO0 fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus);
+#ifdef DEBUG
+    fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus);
+#endif
     for (i=0; i<ip_results.size(); i++) {
         IP_RESULT& r = ip_results[i];
         r.computation_deadline = r.report_deadline - work_buf_min;
-        INFO0 fprintf(stderr, " %s: deadline %.2f cpu %.2f\n",
+#ifdef DEBUG
+        fprintf(stderr, " %s: deadline %.2f cpu %.2f\n",
             r.name, r.computation_deadline, r.cpu_time_remaining
         );
+#endif
     }
 
     // run edf simulation to determine whether any results miss their deadline
@@ -189,10 +191,12 @@ bool check_candidate (
     double booked_to[128]; // keeps track of when each cpu is free
     int j;
 
-    INFO0 fprintf(stderr, "check_candidate %s: dl %f cpu %f\n",
+#ifdef DEBUG
+    fprintf(stderr, "check_candidate %s: dl %f cpu %f\n",
         candidate.name, candidate.computation_deadline,
         candidate.cpu_time_remaining
     );
+#endif
 
     for (j=0; j<ncpus; j++) {
         booked_to[j] = 0;
@@ -219,9 +223,11 @@ bool check_candidate (
             }
         }
         booked_to[lowest_booked_cpu] += r.cpu_time_remaining;
-        INFO2 fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
+#ifdef DEBUG
+        fprintf(stderr, " running %s on cpu %d; finishes at %f\n",
             r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu]
         );
+#endif
 
         // return false if completion time if > computation_deadline AND
         // result would not have missed deadline to begin with
@@ -229,9 +235,11 @@ bool check_candidate (
         if (booked_to[lowest_booked_cpu] > r.computation_deadline
             && !r.misses_deadline
         ) {
-            INFO0 fprintf(stderr, " cand. fails; %s now misses deadline: %f\n",
+#ifdef DEBUG
+            fprintf(stderr, " cand. fails; %s now misses deadline: %f\n",
                 r.name, booked_to[lowest_booked_cpu]
             );
+#endif
             return false;
         }
         // check a late result (i.e., one that would have missed its
|
@ -240,11 +248,15 @@ bool check_candidate (
|
|||
if (r.misses_deadline
|
||||
&& booked_to[lowest_booked_cpu] > r.estimated_completion_time
|
||||
){
|
||||
INFO1 fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name);
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, " cand. fails; late result %s to be returned even later\n", r.name);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
}
|
||||
INFO1 fprintf(stderr, " cand. succeeds\n");
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, " cand. succeeds\n");
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@@ -530,7 +530,7 @@ static int update_host_record(HOST& initial_host, HOST& xhost, USER& user) {
     return 0;
 }
 
-// Figure out which of the results the user currently has
+// Figure out which of the results the host currently has
 // should be aborted outright, or aborted if not started yet
 //
 int send_result_abort(
@@ -556,10 +556,6 @@ int send_result_abort(
         result_names.append("'");
         result_names.append(orp.name);
         result_names.append("'");
-        log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-            "Result is on [HOST#%d]: %s\n",
-            reply.host.id, orp.name.c_str()
-        );
     }
 
     // query the db for the results and set the appropriate flag
@@ -1426,14 +1422,15 @@ void process_request(
     handle_results(sreq, reply);
 
-    reply.wreq.nresults_on_host = sreq.other_results.size();
-    if (config.resend_lost_results && sreq.have_other_results_list) {
-        if (resend_lost_work(sreq, reply, platforms, ss)) {
-            ok_to_send_work = false;
+    if (sreq.have_other_results_list) {
+        if (config.resend_lost_results) {
+            if (resend_lost_work(sreq, reply, platforms, ss)) {
+                ok_to_send_work = false;
         }
     }
+        if (config.send_result_abort) {
+            send_result_abort(sreq, reply, ss);
+        }
+    }
 
-    if (config.send_result_abort && sreq.have_other_results_list) {
-        send_result_abort(sreq, reply, ss);
-    }
 
     // if last RPC was within config.min_sendwork_interval, don't send work

sched/sched_array.C
@@ -116,8 +116,8 @@ void scan_work_array(
             retval = wu_is_infeasible(wu, sreq, reply, *app);
             if (retval) {
                 log_messages.printf(
-                    SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %d\n",
-                    reply.host.id, wu.id, wu.name, retval
+                    SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
+                    reply.host.id, wu.id, wu.name, infeasible_string(retval)
                 );
                 continue;
             }

sched/sched_resend.C
@@ -54,19 +54,19 @@ static int possibly_give_result_new_deadline(
     DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REPLY& reply
 ) {
     const double resend_frac = 0.5; // range [0, 1)
-    int result_sent_time = time(0);
-    int result_report_deadline = result_sent_time + (int)(resend_frac*(result.report_deadline - result.sent_time));
+    int now = time(0);
+    int result_report_deadline = now + (int)(resend_frac*(result.report_deadline - result.sent_time));
 
     if (result_report_deadline < result.report_deadline) {
         result_report_deadline = result.report_deadline;
     }
-    if (result_report_deadline > result_sent_time + wu.delay_bound) {
-        result_report_deadline = result_sent_time + wu.delay_bound;
+    if (result_report_deadline > now + wu.delay_bound) {
+        result_report_deadline = now + wu.delay_bound;
     }
 
     // If infeasible, return without modifying result
     //
-    if (estimate_cpu_duration(wu, reply) > result_report_deadline-result_sent_time) {
+    if (estimate_cpu_duration(wu, reply) > result_report_deadline-now) {
         log_messages.printf(
             SCHED_MSG_LOG::MSG_DEBUG,
             "[RESULT#%d] [HOST#%d] not resending lost result: can't complete in time\n",
@@ -83,11 +83,17 @@ static int possibly_give_result_new_deadline(
         result.id, reply.host.id,
         result_report_deadline==result.report_deadline?"NO update to":"Updated"
     );
-    result.sent_time = result_sent_time;
+    result.sent_time = now;
     result.report_deadline = result_report_deadline;
     return 0;
 }
 
+// resend any jobs that:
+// 1) we already sent to this host;
+// 2) are still in progress (i.e. haven't timed out) and
+// 3) aren't present on the host
+// Return true if there were any such jobs
+//
 bool resend_lost_work(
     SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply,
     PLATFORM_LIST& platforms, SCHED_SHMEM& ss
@@ -96,116 +102,99 @@ bool resend_lost_work(
std::vector<DB_RESULT>results;
unsigned int i;
char buf[256];
char warning_msg[256];
bool did_any = false;
int num_to_resend=0;
int num_eligible_to_resend=0;
int num_resent=0;
int num_on_host=0;
APP* app;
APP_VERSION* avp;
int retval;

// print list of results on host
//
for (i=0; i<sreq.other_results.size(); i++) {
OTHER_RESULT& orp=sreq.other_results[i];
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"Result is on [HOST#%d]: %s\n",
reply.host.id, orp.name.c_str()
);
}

sprintf(buf, " where hostid=%d and server_state=%d ",
reply.host.id, RESULT_SERVER_STATE_IN_PROGRESS
);
while (!result.enumerate(buf)) {
bool found = false;
num_on_host++;
for (i=0; i<sreq.other_results.size(); i++) {
OTHER_RESULT& orp = sreq.other_results[i];
if (!strcmp(orp.name.c_str(), result.name)) {
found = true;
break;
}
}
if (!found) {
num_to_resend++;
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] found lost [RESULT#%d]: %s\n",
reply.host.id, result.id, result.name
if (found) continue;

num_eligible_to_resend++;
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] found lost [RESULT#%d]: %s\n",
reply.host.id, result.id, result.name
);

DB_WORKUNIT wu;
retval = wu.lookup_id(result.workunitid);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] WU not found for [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}

DB_WORKUNIT wu;
retval = wu.lookup_id(result.workunitid);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] WU not found for [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}
reply.wreq.core_client_version =
sreq.core_client_major_version*100 + sreq.core_client_minor_version;

reply.wreq.core_client_version =
sreq.core_client_major_version*100 + sreq.core_client_minor_version;

retval = get_app_version(
wu, app, avp, sreq, reply, platforms, ss
retval = get_app_version(
wu, app, avp, sreq, reply, platforms, ss
);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] no app version [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}

// If time is too close to the deadline,
// or we already have a canonical result,
// or WU error flag is set,
// then don't bother to resend this result.
// Instead make it time out right away
// so that the transitioner does 'the right thing'.
//
if (
wu.error_mask ||
wu.canonical_resultid ||
possibly_give_result_new_deadline(result, wu, reply)
) {
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
reply.host.id, result.id
);
result.report_deadline = time(0)-1;
retval = result.mark_as_sent(result.server_state);
if (retval) {
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
"[HOST#%d] no app version [RESULT#%d]\n",
reply.host.id, result.id
);
continue;
}

// If time is too close to the deadline,
// or we already have a canonical result,
// or WU error flag is set,
// then don't bother to resend this result.
// Instead make it time out right away
// so that the transitioner does 'the right thing'.
//
char warning_msg[256];
if (
wu.error_mask ||
wu.canonical_resultid ||
possibly_give_result_new_deadline(result, wu, reply)
) {
result.report_deadline = time(0);
retval = result.mark_as_sent(result.server_state);
if (retval==ERR_DB_NOT_FOUND) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
result.id, reply.host.id
);
} else if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_result: can't update result deadline: %d\n", retval
);
}
if (retval) continue;

retval = update_wu_transition_time(wu, result.report_deadline);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_result: can't update WU transition time: %d\n", retval
);
continue;
}
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
reply.host.id, result.id
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_work: can't update result deadline: %d\n", retval
);
sprintf(warning_msg, "Didn't resend lost result %s (expired)", result.name);
USER_MESSAGE um(warning_msg, "high");
reply.insert_message(um);
continue;
}

retval = update_wu_transition_time(wu, result.report_deadline);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::MSG_CRITICAL,
"resend_lost_result: can't update WU transition time: %d\n", retval
);
continue;
}
sprintf(warning_msg,
"Didn't resend lost result %s (expired)", result.name
);
USER_MESSAGE um(warning_msg, "high");
reply.insert_message(um);
} else {
retval = add_result_to_reply(
result, wu, sreq, reply, platforms, app, avp
);
@@ -224,15 +213,9 @@ bool resend_lost_work(
         }
     }
 
-    reply.wreq.nresults_on_host = num_on_host;
-    log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-        "[HOST#%d] %d results in progress, set for later checking\n",
-        reply.host.id, num_on_host
-    );
-
-    if (num_to_resend) {
+    if (num_eligible_to_resend) {
         log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-            "[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_to_resend, num_resent
+            "[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_eligible_to_resend, num_resent
         );
     }
 
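For reference, the clamping rule used by possibly_give_result_new_deadline() in the sched_resend.C changes above can be summarized as: start from now + resend_frac * (original turnaround), never shorten the existing report deadline, and never exceed now + the workunit's delay bound. A standalone sketch of just that arithmetic follows; FakeResult, new_resend_deadline() and the sample values are hypothetical stand-ins, not the scheduler's real types:

    #include <algorithm>
    #include <cstdio>
    #include <ctime>

    // Illustrative stand-in for the fields the deadline logic actually reads.
    struct FakeResult {
        time_t sent_time;        // when the result was originally sent
        time_t report_deadline;  // its current report deadline
    };

    time_t new_resend_deadline(const FakeResult& r, time_t now, time_t delay_bound) {
        const double resend_frac = 0.5;  // same constant as in sched_resend.C
        // Give back a fraction of the original turnaround time, measured from now.
        time_t d = now + (time_t)(resend_frac * (r.report_deadline - r.sent_time));
        // Never shorten the existing deadline...
        d = std::max(d, r.report_deadline);
        // ...but never extend past now + the workunit's delay bound.
        d = std::min(d, now + delay_bound);
        return d;
    }

    int main() {
        time_t now = time(0);
        FakeResult r = { now - 5*86400, now + 2*86400 };  // sent 5 days ago, due in 2 days
        std::printf("new deadline in %ld seconds\n",
            (long)(new_resend_deadline(r, now, 7*86400) - now));
        return 0;
    }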
sched/sched_send.C
@@ -56,6 +56,19 @@ using namespace std;
 #define FCGI_ToFILE(x) (x)
 #endif
 
+const char* infeasible_string(int code) {
+    switch (code) {
+    case INFEASIBLE_MEM: return "Not enough memory";
+    case INFEASIBLE_DISK: return "Not enough disk";
+    case INFEASIBLE_CPU: return "CPU too slow";
+    case INFEASIBLE_APP_SETTING: return "App not selected";
+    case INFEASIBLE_WORKLOAD: return "Existing workload";
+    case INFEASIBLE_DUP: return "Already in reply";
+    case INFEASIBLE_HR: return "Homogeneous redundancy";
+    }
+    return "Unknown";
+}
+
 const int MIN_SECONDS_TO_SEND = 0;
 const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);
 const int MAX_CPUS = 8;
@@ -784,6 +797,8 @@ int add_result_to_reply(
 
     int delay_bound = wu.delay_bound;
     if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
+        // We are sending this result for the first time
+        //
         // If the workunit needs reliable and is being sent to a reliable host,
         // then shorten the delay bound by the percent specified
         //
@@ -793,18 +808,16 @@ int add_result_to_reply(
         }
     }
 
-        // We are sending this result for the first time
-        //
         result.report_deadline = result.sent_time + delay_bound;
         result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
     } else {
-        // Result was ALREADY sent to this host but never arrived.
-        // So we are resending it.
-        // result.report_deadline and time_sent
-        // have already been updated before this function was called.
+        // Result was already sent to this host but was lost,
+        // so we are resending it.
         //
         resent_result = true;
 
         // TODO: explain the following
         //
         if (result.report_deadline < result.sent_time) {
             result.report_deadline = result.sent_time + 10;
         }

sched/sched_send.h
@@ -66,4 +66,4 @@ extern int update_wu_transition_time(WORKUNIT wu, time_t x);
 
 extern void lock_sema();
 extern void unlock_sema();
-
+extern const char* infeasible_string(int);