- client: use project STD, rather than arrival time,

as the major criterion in choosing non-EDF GPU jobs.
    GPU scheduling now respects resource share,
    and as a result STD should no longer diverge.
- client simulator: various improvements, most notably
    that we now generate gnuplot graphs of all debt types

NOTE: the client problem was found and fixed using the simulator!


svn path=/trunk/boinc/; revision=22536
This commit is contained in:
David Anderson 2010-10-15 20:16:00 +00:00
parent d3d1b17668
commit c9be64765c
9 changed files with 254 additions and 161 deletions

View File

@ -7365,3 +7365,23 @@ Rom 15 Oct 2010
clientgui/
AdvancedFrame.cpp
David 15 Oct 2010
- client: use project STD, rather than arrival time,
as the major criterion in choosing non-EDF GPU jobs.
GPU scheduling now respects resource share,
and as a result STD should no longer diverge.
- client simulator: various improvements, most notably
that we now generate gnuplot graphs of all debt types
NOTE: the client problem was found and fixed using the simulator!
client/
client_state.h
client_types.h
cpu_sched.cpp
sim.cpp,h
sim_util.cpp
work_fetch.cpp
lib/
coproc.h

View File

@ -261,7 +261,6 @@ struct CLIENT_STATE {
bool must_check_work_fetch;
void assign_results_to_projects();
RESULT* largest_debt_project_best_result();
RESULT* earliest_deadline_result(bool coproc_only);
void reset_debt_accounting();
bool possibly_schedule_cpus();
void schedule_cpus();

View File

@ -403,6 +403,13 @@ struct PROJECT : PROJ_AM {
}
return cpu_pwf.deadlines_missed;
}
// Short-term debt of this project for the given resource type;
// falls back to the CPU figure for any non-GPU type.
inline double short_term_debt(int rsc_type) {
    if (rsc_type == RSC_TYPE_CUDA) {
        return cuda_pwf.short_term_debt;
    }
    if (rsc_type == RSC_TYPE_ATI) {
        return ati_pwf.short_term_debt;
    }
    return cpu_pwf.short_term_debt;
}
int nresults_returned;
// # of results being returned in current scheduler op
@ -661,6 +668,11 @@ struct RESULT {
if (avp->natis > 0) return true;
return false;
}
// Which processing resource this result's app version runs on.
// CUDA is checked first, then ATI; anything else is a CPU job.
inline int resource_type() {
    return uses_cuda() ? RSC_TYPE_CUDA
        : (uses_ati() ? RSC_TYPE_ATI : RSC_TYPE_CPU);
}
// temporaries used in CLIENT_STATE::rr_simulation():
double rrsim_flops_left;

View File

@ -81,8 +81,11 @@ struct PROC_RESOURCES {
return ncpus_used >= ncpus;
}
inline bool stop_scan_coproc() {
return coprocs.fully_used();
// True if every instance of the given coprocessor type is already
// in use, so scanning for more jobs of that type is pointless.
inline bool stop_scan_coproc(int rsc_type) {
    return (rsc_type == RSC_TYPE_CUDA)
        ? (coprocs.cuda.used >= coprocs.cuda.count)
        : (coprocs.ati.used >= coprocs.ati.count);
}
// should we consider scheduling this job?
@ -343,29 +346,49 @@ RESULT* CLIENT_STATE::largest_debt_project_best_result() {
return rp;
}
// Return coproc jobs in FIFO order
// Give priority to already-started jobs because of the following scenario:
// - client gets several jobs in a sched reply and starts downloading files
// - a job with a later name happens to finish downloading first, and starts
// - a job with an earlier name finishes downloading and preempts
// Return a job of the given type according to the following criteria
// (desc priority):
// - from project with higher STD for that resource
// - already-started job
// - earlier received_time
// - lexicographically earlier name
//
RESULT* first_coproc_result() {
// Give priority to already-started jobs because of the following scenario:
// - client gets several jobs in a sched reply and starts downloading files
// - a later job finishes downloading and starts
// - an earlier job finishes downloading and preempts it
//
RESULT* first_coproc_result(int rsc_type) {
unsigned int i;
RESULT* best = NULL;
double best_std=0;
for (i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
if (rp->resource_type() != rsc_type) continue;
if (!rp->runnable()) continue;
if (rp->project->non_cpu_intensive) continue;
if (rp->already_selected) continue;
if (!rp->uses_coprocs()) continue;
double std = rp->project->short_term_debt(rsc_type);
if (!best) {
best = rp;
best_std = std;
continue;
}
if (std < best_std) {
continue;
}
if (std > best_std) {
best = rp;
best_std = std;
continue;
}
bool bs = !best->not_started();
bool rs = !rp->not_started();
if (rs && !bs) {
best = rp;
best_std = std;
continue;
}
if (!rs && bs) {
@ -373,32 +396,31 @@ RESULT* first_coproc_result() {
}
if (rp->received_time < best->received_time) {
best = rp;
best_std = std;
} else if (rp->received_time == best->received_time) {
// make it deterministic by looking at name
//
if (strcmp(rp->name, best->name) > 0) {
best = rp;
best_std = std;
}
}
}
return best;
}
// Return earliest-deadline result.
// if coproc_only:
// return only coproc jobs, and only if project misses deadlines for that coproc
// otherwise:
// return only CPU jobs, and only from a project with deadlines_missed>0
// Return earliest-deadline result for given resource type.
//
RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) {
static RESULT* earliest_deadline_result(int rsc_type) {
RESULT *best_result = NULL;
ACTIVE_TASK* best_atp = NULL;
unsigned int i;
for (i=0; i<results.size(); i++) {
RESULT* rp = results[i];
if (!rp->runnable()) continue;
for (i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
if (rp->resource_type() != rsc_type) continue;
if (rp->already_selected) continue;
if (!rp->runnable()) continue;
PROJECT* p = rp->project;
if (p->non_cpu_intensive) continue;
@ -406,34 +428,23 @@ RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) {
// treat projects with DCF>90 as if they had deadline misses
//
if (coproc_only) {
if (!rp->uses_coprocs()) continue;
if (rp->avp->ncudas) {
if (p->duration_correction_factor < 90.0) {
if (!p->cuda_pwf.deadlines_missed_copy) {
continue;
}
} else {
only_deadline_misses = false;
}
} else if (rp->avp->natis) {
if (p->duration_correction_factor < 90.0) {
if (!p->ati_pwf.deadlines_missed_copy) {
continue;
}
} else {
only_deadline_misses = false;
}
if (p->duration_correction_factor < 90.0) {
int d;
switch (rsc_type) {
case RSC_TYPE_CUDA:
d = p->cuda_pwf.deadlines_missed_copy;
break;
case RSC_TYPE_ATI:
d = p->ati_pwf.deadlines_missed_copy;
break;
default:
d = p->cpu_pwf.deadlines_missed_copy;
}
if (!d) {
continue;
}
} else {
if (rp->uses_coprocs()) continue;
if (p->duration_correction_factor < 90.0) {
if (!p->cpu_pwf.deadlines_missed_copy) {
continue;
}
} else {
only_deadline_misses = false;
}
only_deadline_misses = false;
}
if (only_deadline_misses && !rp->rr_sim_misses_deadline) {
@ -449,7 +460,7 @@ RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) {
}
if (new_best) {
best_result = rp;
best_atp = lookup_active_task_by_result(rp);
best_atp = gstate.lookup_active_task_by_result(rp);
continue;
}
if (rp->report_deadline > best_result->report_deadline) {
@ -459,7 +470,7 @@ RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) {
// If there's a tie, pick the job with the least remaining time
// (but don't pick an unstarted job over one that's started)
//
ACTIVE_TASK* atp = lookup_active_task_by_result(rp);
ACTIVE_TASK* atp = gstate.lookup_active_task_by_result(rp);
if (best_atp && !atp) continue;
if (rp->estimated_time_remaining(false)
< best_result->estimated_time_remaining(false)
@ -667,6 +678,48 @@ static void promote_once_ran_edf() {
}
}
// Append runnable jobs of the given coprocessor type (CUDA or ATI)
// to gstate.ordered_scheduled_results until that coprocessor is
// fully used (see PROC_RESOURCES::stop_scan_coproc()).
// Two passes:
// 1) EDF: jobs from projects with deadline misses for this resource
//    (earliest_deadline_result());
// 2) remaining jobs, chosen by first_coproc_result().
// Every candidate considered is marked already_selected so neither
// loop (nor a later scan) examines it twice.
//
void add_coproc_jobs(int rsc_type, PROC_RESOURCES& proc_rsc) {
ACTIVE_TASK* atp;
RESULT* rp;
bool can_run;
// choose coproc jobs from projects with coproc deadline misses
//
while (!proc_rsc.stop_scan_coproc(rsc_type)) {
rp = earliest_deadline_result(rsc_type);
if (!rp) break;
rp->already_selected = true;
if (!proc_rsc.can_schedule(rp)) continue;
atp = gstate.lookup_active_task_by_result(rp);
can_run = schedule_if_possible(
rp, atp, proc_rsc, "coprocessor job, EDF"
);
if (!can_run) continue;
// one fewer pending deadline miss for the owning project
if (rsc_type == RSC_TYPE_CUDA) {
rp->project->cuda_pwf.deadlines_missed_copy--;
} else {
rp->project->ati_pwf.deadlines_missed_copy--;
}
rp->edf_scheduled = true;
gstate.ordered_scheduled_results.push_back(rp);
}
// then the remaining coproc jobs, picked by project STD
// (and started-first/FIFO as tiebreakers; see first_coproc_result())
//
while (!proc_rsc.stop_scan_coproc(rsc_type)) {
rp = first_coproc_result(rsc_type);
if (!rp) break;
rp->already_selected = true;
if (!proc_rsc.can_schedule(rp)) continue;
atp = gstate.lookup_active_task_by_result(rp);
can_run = schedule_if_possible(
rp, atp, proc_rsc, "coprocessor job, FIFO"
);
if (!can_run) continue;
gstate.ordered_scheduled_results.push_back(rp);
}
}
// CPU scheduler - decide which results to run.
// output: sets ordered_scheduled_results.
//
@ -730,41 +783,8 @@ void CLIENT_STATE::schedule_cpus() {
ordered_scheduled_results.clear();
// choose coproc jobs from projects with coproc deadline misses
//
while (!proc_rsc.stop_scan_coproc()) {
rp = earliest_deadline_result(true);
if (!rp) break;
rp->already_selected = true;
if (!proc_rsc.can_schedule(rp)) continue;
atp = lookup_active_task_by_result(rp);
can_run = schedule_if_possible(
rp, atp, proc_rsc, "coprocessor job, EDF"
);
if (!can_run) continue;
if (rp->avp->ncudas) {
rp->project->cuda_pwf.deadlines_missed_copy--;
} else if (rp->avp->natis) {
rp->project->ati_pwf.deadlines_missed_copy--;
}
rp->edf_scheduled = true;
ordered_scheduled_results.push_back(rp);
}
// then coproc jobs in FIFO order
//
while (!proc_rsc.stop_scan_coproc()) {
rp = first_coproc_result();
if (!rp) break;
rp->already_selected = true;
if (!proc_rsc.can_schedule(rp)) continue;
atp = lookup_active_task_by_result(rp);
can_run = schedule_if_possible(
rp, atp, proc_rsc, "coprocessor job, FIFO"
);
if (!can_run) continue;
ordered_scheduled_results.push_back(rp);
}
add_coproc_jobs(RSC_TYPE_CUDA, proc_rsc);
add_coproc_jobs(RSC_TYPE_ATI, proc_rsc);
// choose CPU jobs from projects with CPU deadline misses
//
@ -772,7 +792,7 @@ void CLIENT_STATE::schedule_cpus() {
if (!cpu_sched_rr_only) {
#endif
while (!proc_rsc.stop_scan_cpu()) {
rp = earliest_deadline_result(false);
rp = earliest_deadline_result(RSC_TYPE_CPU);
if (!rp) break;
rp->already_selected = true;
if (!proc_rsc.can_schedule(rp)) continue;

View File

@ -32,6 +32,12 @@
// log.txt
// summary.xml
// debt.dat
// debt_overall.png
// debt_cpu_std.png
// debt_cpu_ltd.png
// debt_nvidia_std.png
// debt_nvidia_ltd.png
// ...
//
// Simulation params:
// [--duration x]
@ -254,6 +260,7 @@ void decrement_request_rsc(
void decrement_request(RESULT* rp) {
APP_VERSION* avp = rp->avp;
double est_runtime = rp->wup->rsc_fpops_est/avp->flops;
est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac);
decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime);
decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime);
decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime);
@ -264,7 +271,7 @@ void decrement_request(RESULT* rp) {
//
bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
char buf[256], buf2[256];
static double last_time=-1e9;
static double last_time=0;
vector<IP_RESULT> ip_results;
int infeasible_count = 0;
vector<RESULT*> new_results;
@ -334,6 +341,8 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
}
msg_printf(0, MSG_INFO, "Got %d tasks", new_results.size());
sprintf(buf, "got %d tasks<br>", new_results.size());
html_msg += buf;
SCHEDULER_REPLY sr;
cpu_work_fetch.req_secs = save_cpu_req_secs;
@ -611,39 +620,41 @@ const char* colors[] = {
"#000088",
};
void show_cuda() {
fprintf(html_out, "<td>");
bool found = false;
for (unsigned int k=0; k<gstate.active_tasks.active_tasks.size(); k++) {
ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[k];
RESULT* rp = atp->result;
if (!rp->avp->ncudas) continue;
if (atp->task_state() != PROCESS_EXECUTING) continue;
PROJECT* p = rp->project;
fprintf(html_out, "%.2f: <font color=%s>%s%s: %.2f</font><br>",
rp->avp->ncudas,
colors[p->index],
atp->result->rr_sim_misses_deadline?"*":"",
atp->result->name,
atp->cpu_time_left
);
found = true;
// Count project p's results of the given resource type that have not
// progressed past RESULT_FILES_DOWNLOADED (i.e. still downloading,
// or downloaded and waiting to run).
int njobs_in_progress(PROJECT* p, int rsc_type) {
    int count = 0;
    for (unsigned int k = 0; k < gstate.results.size(); k++) {
        RESULT* r = gstate.results[k];
        if (r->project != p) continue;
        if (r->resource_type() != rsc_type) continue;
        if (r->state() > RESULT_FILES_DOWNLOADED) continue;
        count++;
    }
    return count;
}
void show_ati() {
void show_resource(int rsc_type) {
unsigned int i;
fprintf(html_out, "<td>");
bool found = false;
for (unsigned int k=0; k<gstate.active_tasks.active_tasks.size(); k++) {
ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[k];
for (i=0; i<gstate.active_tasks.active_tasks.size(); i++) {
ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i];
RESULT* rp = atp->result;
if (!rp->avp->natis) continue;
if (rp->resource_type() != rsc_type) continue;
if (atp->task_state() != PROCESS_EXECUTING) continue;
PROJECT* p = rp->project;
double ninst;
if (rsc_type == RSC_TYPE_CUDA) {
ninst = rp->avp->ncudas;
} else if (rsc_type == RSC_TYPE_ATI) {
ninst = rp->avp->natis;
} else {
ninst = rp->avp->avg_ncpus;
}
fprintf(html_out, "%.2f: <font color=%s>%s%s: %.2f</font><br>",
rp->avp->natis,
ninst,
colors[p->index],
atp->result->rr_sim_misses_deadline?"*":"",
atp->result->name,
@ -652,6 +663,11 @@ void show_ati() {
found = true;
}
if (!found) fprintf(html_out, "IDLE");
for (i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
int n = njobs_in_progress(p, rsc_type);
fprintf(html_out, "<br>%s: %d jobs in progress\n", p->project_name, n);
}
fprintf(html_out, "</td>");
}
@ -674,11 +690,11 @@ void html_start() {
fprintf(html_out,
"<th>CPU<br><font size=-2>Job name and estimated time left<br>color denotes project<br>* means EDF mode</font></th>"
);
if (gstate.host_info.coprocs.cuda.count) {
if (gstate.host_info.have_cuda()) {
fprintf(html_out, "<th>NVIDIA GPU</th>");
nproc_types++;
}
if (gstate.host_info.coprocs.ati.count) {
if (gstate.host_info.have_ati()) {
fprintf(html_out, "<th>ATI GPU</th>");
nproc_types++;
}
@ -686,45 +702,9 @@ void html_start() {
}
void html_rec() {
fprintf(html_out, "<tr><td>%s</td>", time_to_string(gstate.now));
if (!running) {
fprintf(html_out, "<td bgcolor=#aaaaaa>OFF</td>");
if (gstate.host_info.coprocs.cuda.count) {
fprintf(html_out, "<td bgcolor=#aaaaaa>OFF</td>");
}
if (gstate.host_info.coprocs.ati.count) {
fprintf(html_out, "<td bgcolor=#aaaaaa>OFF</td>");
}
} else {
fprintf(html_out, "<td>");
double x=0;
for (unsigned int i=0; i<gstate.active_tasks.active_tasks.size(); i++) {
ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i];
if (atp->task_state() == PROCESS_EXECUTING) {
PROJECT* p = atp->result->project;
fprintf(html_out, "(%.2f) <font color=%s>%s%s: %.2f</font><br>",
atp->result->avp->avg_ncpus,
colors[p->index],
atp->result->rr_sim_misses_deadline?"*":"",
atp->result->name,
atp->cpu_time_left
);
x += atp->result->avp->avg_ncpus;
}
}
if (x<gstate.ncpus) {
fprintf(html_out, "IDLE: %.2f", gstate.ncpus-x);
}
if (gstate.host_info.coprocs.cuda.count) {
show_cuda();
}
if (gstate.host_info.coprocs.ati.count) {
show_ati();
}
}
if (html_msg.size()) {
fprintf(html_out, "<tr><td>%s</td>", time_to_string(gstate.now));
//fprintf(html_out, "<tr><td>%s</td>", time_to_string(gstate.now));
fprintf(html_out, "<tr><td>%f</td>", gstate.now);
fprintf(html_out,
"<td colspan=%d><font size=-2>%s</font></td></tr>\n",
nproc_types,
@ -732,6 +712,26 @@ void html_rec() {
);
html_msg = "";
}
//fprintf(html_out, "<tr><td>%s</td>", time_to_string(gstate.now));
fprintf(html_out, "<tr><td>%f</td>", gstate.now);
if (!running) {
fprintf(html_out, "<td bgcolor=#aaaaaa>OFF</td>");
if (gstate.host_info.have_cuda()) {
fprintf(html_out, "<td bgcolor=#aaaaaa>OFF</td>");
}
if (gstate.host_info.have_ati()) {
fprintf(html_out, "<td bgcolor=#aaaaaa>OFF</td>");
}
} else {
show_resource(RSC_TYPE_CPU);
if (gstate.host_info.have_cuda()) {
show_resource(RSC_TYPE_CUDA);
}
if (gstate.host_info.have_ati()) {
show_resource(RSC_TYPE_ATI);
}
}
}
void html_end() {
@ -749,10 +749,10 @@ void html_end() {
// per project:
// overall LTD
// CPU LTD
// [NVIDIA LTD]
// [ATI LTD]
// CPU STD
// [NVIDIA LTD]
// [NVIDIA STD]
// [ATI LTD]
// [ATI STD]
//
void write_debts() {
@ -764,13 +764,13 @@ void write_debts() {
p->cpu_pwf.long_term_debt,
p->cpu_pwf.short_term_debt
);
if (gstate.host_info.coprocs.cuda.count) {
if (gstate.host_info.have_cuda()) {
fprintf(debt_file, "%f %f ",
p->cuda_pwf.long_term_debt,
p->cuda_pwf.short_term_debt
);
}
if (gstate.host_info.coprocs.ati.count) {
if (gstate.host_info.have_ati()) {
fprintf(debt_file, "%f %f",
p->ati_pwf.long_term_debt,
p->ati_pwf.short_term_debt
@ -780,6 +780,49 @@ void write_debts() {
fprintf(debt_file, "\n");
}
// generate a bunch of debt graphs
//
// Write a gnuplot script plotting one debt column per project from
// <output_file_prefix>debt.dat, then run gnuplot to produce
// <output_file_prefix><fname>.png.
//
// title:   graph title
// fname:   base name (no extension) of the .gp/.png files
// field:   0-based offset of the column within each project's group
// nfields: number of debt columns per project in debt.dat
//
void make_graph(const char* title, const char* fname, int field, int nfields) {
    // cmd holds both file names plus the command text, so it must be
    // larger than the name buffers; snprintf guards against overflow
    // (the old sprintf into char[256] could overrun).
    char gp_fname[256], cmd[1024], png_fname[256];

    snprintf(gp_fname, sizeof(gp_fname), "%s%s.gp", output_file_prefix, fname);
    FILE* f = fopen(gp_fname, "w");
    if (!f) {
        // can't create the script; skip this graph rather than crash
        fprintf(stderr, "make_graph: can't open %s\n", gp_fname);
        return;
    }
    fprintf(f,
        "set terminal png small size 1024, 768\n"
        "set title \"%s\"\n"
        "plot ",
        title
    );
    for (unsigned int i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        // column 1 is the x value; each project contributes nfields
        // columns starting at 2 + i*nfields
        fprintf(f, "\"%sdebt.dat\" using 1:%d title \"%s\" with lines%s",
            output_file_prefix, 2+field+i*nfields, p->project_name,
            (i==gstate.projects.size()-1)?"\n":", \\\n"
        );
    }
    fclose(f);
    snprintf(png_fname, sizeof(png_fname), "%s%s.png", output_file_prefix, fname);
    snprintf(cmd, sizeof(cmd), "gnuplot < %s > %s", gp_fname, png_fname);
    system(cmd);
}
// Generate a PNG graph (via make_graph()) for each debt quantity.
// Per-project field layout in debt.dat, matching write_debts():
//   0 = overall LTD, 1 = CPU LTD, 2 = CPU STD,
//   then an LTD/STD pair per GPU type present (NVIDIA, then ATI).
void debt_graphs() {
int nfields = 3 + (gstate.host_info.have_cuda()?2:0) + (gstate.host_info.have_ati()?2:0);
make_graph("Overall debt", "debt_overall", 0, nfields);
make_graph("CPU LTD", "debt_cpu_ltd", 1, nfields);
make_graph("CPU STD", "debt_cpu_std", 2, nfields);
if (gstate.host_info.have_cuda()) {
make_graph("NVIDIA LTD", "debt_nvidia_ltd", 3, nfields);
make_graph("NVIDIA STD", "debt_nvidia_std", 4, nfields);
}
if (gstate.host_info.have_ati()) {
// ATI columns shift by 2 when NVIDIA columns are also present
int off = gstate.host_info.have_cuda()?2:0;
make_graph("ATI LTD", "debt_ati_ltd", 3+off, nfields);
make_graph("ATI STD", "debt_ati_std", 4+off, nfields);
}
}
void simulate() {
bool action;
double start = START_TIME;
@ -951,6 +994,8 @@ void do_client_simulation() {
// then other
print_project_results(stdout);
debt_graphs();
}
int main(int argc, char** argv) {

View File

@ -101,6 +101,7 @@ extern bool gpus_usable;
//#define START_TIME 946684800
// Jan 1 2000
#define START_TIME 0
#define START_TIME 3600
// should be at least an hour or so
#endif

View File

@ -56,7 +56,6 @@ void HOST_INFO::generate_host_cpid() {}
void show_message(PROJECT *p, char* msg, int priority, const char*) {
const char* x;
char message[1024];
char* time_string = time_to_string(gstate.now);
if (priority == MSG_INTERNAL_ERROR) {
strcpy(message, "[error] ");
@ -74,7 +73,7 @@ void show_message(PROJECT *p, char* msg, int priority, const char*) {
x = "---";
}
fprintf(logfile, "%s [%s] %s\n", time_string, x, message);
fprintf(logfile, "%.0f [%s] %s\n", gstate.now, x, message);
}
APP_CLIENT_SHM::APP_CLIENT_SHM() {}

View File

@ -757,7 +757,9 @@ void WORK_FETCH::print_state() {
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
if (p->non_cpu_intensive) continue;
msg_printf(p, MSG_INFO, "[work_fetch] overall LTD %.2f", p->pwf.overall_debt);
msg_printf(p, MSG_INFO, "[work_fetch] overall LTD %.2f",
p->pwf.overall_debt
);
}
msg_printf(0, MSG_INFO, "[work_fetch] ------- end work fetch state -------");
}

View File

@ -272,11 +272,6 @@ struct COPROCS {
);
int parse(MIOFILE&);
void summary_string(char*, int);
bool fully_used() {
if (cuda.used < cuda.count) return false;
if (ati.used < ati.count) return false;
return true;
}
// Copy a coproc set, possibly setting usage to zero.
// used in round-robin simulator and CPU scheduler,