diff --git a/checkin_notes b/checkin_notes index b7a13594b3..0eea2719ca 100644 --- a/checkin_notes +++ b/checkin_notes @@ -7365,3 +7365,23 @@ Rom 15 Oct 2010 clientgui/ AdvancedFrame.cpp + +David 15 Oct 2010 + - client: use project STD, rather than arrival time, + as the major criterion in choosing non-EDF GPU jobs. + GPU scheduling now respects resource share, + and as a result STD should no longer diverge. + - client simulator: various improvements, most notably + that we now generate gnuplot graphs of all debt types + + NOTE: the client problem was found and fixed using the simulator! + + client/ + client_state.h + client_types.h + cpu_sched.cpp + sim.cpp,h + sim_util.cpp + work_fetch.cpp + lib/ + coproc.h diff --git a/client/client_state.h b/client/client_state.h index 23cf6510d9..b4330b0018 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -261,7 +261,6 @@ struct CLIENT_STATE { bool must_check_work_fetch; void assign_results_to_projects(); RESULT* largest_debt_project_best_result(); - RESULT* earliest_deadline_result(bool coproc_only); void reset_debt_accounting(); bool possibly_schedule_cpus(); void schedule_cpus(); diff --git a/client/client_types.h b/client/client_types.h index b42dd44487..d35216b5ee 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -403,6 +403,13 @@ struct PROJECT : PROJ_AM { } return cpu_pwf.deadlines_missed; } + inline double short_term_debt(int rsc_type) { + switch(rsc_type) { + case RSC_TYPE_CUDA: return cuda_pwf.short_term_debt; + case RSC_TYPE_ATI: return ati_pwf.short_term_debt; + } + return cpu_pwf.short_term_debt; + } int nresults_returned; // # of results being returned in current scheduler op @@ -661,6 +668,11 @@ struct RESULT { if (avp->natis > 0) return true; return false; } + inline int resource_type() { + if (uses_cuda()) return RSC_TYPE_CUDA; + if (uses_ati()) return RSC_TYPE_ATI; + return RSC_TYPE_CPU; + } // temporaries used in CLIENT_STATE::rr_simulation(): double rrsim_flops_left; diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index 35bc280490..17540ace81 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -81,8 +81,11 @@ struct PROC_RESOURCES { return ncpus_used >= ncpus; } - inline bool stop_scan_coproc() { - return coprocs.fully_used(); + inline bool stop_scan_coproc(int rsc_type) { + if (rsc_type == RSC_TYPE_CUDA) { + return coprocs.cuda.used >= coprocs.cuda.count; + } + return coprocs.ati.used >= coprocs.ati.count; } // should we consider scheduling this job? @@ -343,29 +346,49 @@ RESULT* CLIENT_STATE::largest_debt_project_best_result() { return rp; } -// Return coproc jobs in FIFO order -// Give priority to already-started jobs because of the following scenario: -// - client gets several jobs in a sched reply and starts download files -// - a job with a later name happens to finish downloading first, and starts -// - a job with an earlier name finishes downloading and preempts +// Return a job of the given type according to the following criteria +// (desc priority): +// - from project with higher STD for that resource +// - already-started job +// - earlier received_time +// - lexicographically earlier name // -RESULT* first_coproc_result() { +// Give priority to already-started jobs because of the following scenario: +// - client gets several jobs in a sched reply and starts downloading files +// - a later job finishes downloading and starts +// - an earlier finishes downloading and preempts +// +RESULT* first_coproc_result(int rsc_type) { unsigned int i; RESULT* best = NULL; + double best_std=0; for (i=0; iresource_type() != rsc_type) continue; if (!rp->runnable()) continue; if (rp->project->non_cpu_intensive) continue; if (rp->already_selected) continue; - if (!rp->uses_coprocs()) continue; + double std = rp->project->short_term_debt(rsc_type); if (!best) { best = rp; + best_std = std; continue; } + + if (std < best_std) { + continue; + } + if (std > best_std) { + best = rp; + best_std = std; + continue; + } + bool bs = !best->not_started(); bool rs = !rp->not_started(); if (rs && !bs) { best = rp; + best_std = std; continue; } if (!rs && bs) { @@ -373,32 +396,31 @@ RESULT* first_coproc_result() { } if (rp->received_time < best->received_time) { best = rp; + best_std = std; } else if (rp->received_time == best->received_time) { // make it deterministic by looking at name // if (strcmp(rp->name, best->name) > 0) { best = rp; + best_std = std; } } } return best; } -// Return earliest-deadline result. -// if coproc_only: -// return only coproc jobs, and only if project misses deadlines for that coproc -// otherwise: -// return only CPU jobs, and only from a project with deadlines_missed>0 +// Return earliest-deadline result for given resource type. // -RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) { +static RESULT* earliest_deadline_result(int rsc_type) { RESULT *best_result = NULL; ACTIVE_TASK* best_atp = NULL; unsigned int i; - for (i=0; irunnable()) continue; + for (i=0; iresource_type() != rsc_type) continue; if (rp->already_selected) continue; + if (!rp->runnable()) continue; PROJECT* p = rp->project; if (p->non_cpu_intensive) continue; @@ -406,34 +428,23 @@ RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) { // treat projects with DCF>90 as if they had deadline misses // - if (coproc_only) { - if (!rp->uses_coprocs()) continue; - if (rp->avp->ncudas) { - if (p->duration_correction_factor < 90.0) { - if (!p->cuda_pwf.deadlines_missed_copy) { - continue; - } - } else { - only_deadline_misses = false; - } - } else if (rp->avp->natis) { - if (p->duration_correction_factor < 90.0) { - if (!p->ati_pwf.deadlines_missed_copy) { - continue; - } - } else { - only_deadline_misses = false; - } + if (p->duration_correction_factor < 90.0) { + int d; + switch (rsc_type) { + case RSC_TYPE_CUDA: + d = p->cuda_pwf.deadlines_missed_copy; + break; + case RSC_TYPE_ATI: + d = p->ati_pwf.deadlines_missed_copy; + break; + default: + d = p->cpu_pwf.deadlines_missed_copy; + } + if (!d) { + continue; } } else { - if (rp->uses_coprocs()) continue; - if (p->duration_correction_factor < 90.0) { - if (!p->cpu_pwf.deadlines_missed_copy) { - continue; - } - } else { - only_deadline_misses = false; - } + only_deadline_misses = false; } if (only_deadline_misses && !rp->rr_sim_misses_deadline) { @@ -449,7 +460,7 @@ RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) { } if (new_best) { best_result = rp; - best_atp = lookup_active_task_by_result(rp); + best_atp = gstate.lookup_active_task_by_result(rp); continue; } if (rp->report_deadline > best_result->report_deadline) { @@ -459,7 +470,7 @@ RESULT* CLIENT_STATE::earliest_deadline_result(bool coproc_only) { // If there's a tie, pick the job with the least remaining time // (but don't pick an unstarted job over one that's started) // - ACTIVE_TASK* atp = lookup_active_task_by_result(rp); + ACTIVE_TASK* atp = gstate.lookup_active_task_by_result(rp); if (best_atp && !atp) continue; if (rp->estimated_time_remaining(false) < best_result->estimated_time_remaining(false) @@ -667,6 +678,48 @@ static void promote_once_ran_edf() { } } +void add_coproc_jobs(int rsc_type, PROC_RESOURCES& proc_rsc) { + ACTIVE_TASK* atp; + RESULT* rp; + bool can_run; + + // choose coproc jobs from projects with coproc deadline misses + // + while (!proc_rsc.stop_scan_coproc(rsc_type)) { + rp = earliest_deadline_result(rsc_type); + if (!rp) break; + rp->already_selected = true; + if (!proc_rsc.can_schedule(rp)) continue; + atp = gstate.lookup_active_task_by_result(rp); + can_run = schedule_if_possible( + rp, atp, proc_rsc, "coprocessor job, EDF" + ); + if (!can_run) continue; + if (rsc_type == RSC_TYPE_CUDA) { + rp->project->cuda_pwf.deadlines_missed_copy--; + } else { + rp->project->ati_pwf.deadlines_missed_copy--; + } + rp->edf_scheduled = true; + gstate.ordered_scheduled_results.push_back(rp); + } + + // then coproc jobs in FIFO order + // + while (!proc_rsc.stop_scan_coproc(rsc_type)) { + rp = first_coproc_result(rsc_type); + if (!rp) break; + rp->already_selected = true; + if (!proc_rsc.can_schedule(rp)) continue; + atp = gstate.lookup_active_task_by_result(rp); + can_run = schedule_if_possible( + rp, atp, proc_rsc, "coprocessor job, FIFO" + ); + if (!can_run) continue; + gstate.ordered_scheduled_results.push_back(rp); + } +} + // CPU scheduler - decide which results to run. // output: sets ordered_scheduled_result. // @@ -730,41 +783,8 @@ void CLIENT_STATE::schedule_cpus() { ordered_scheduled_results.clear(); - // choose coproc jobs from projects with coproc deadline misses - // - while (!proc_rsc.stop_scan_coproc()) { - rp = earliest_deadline_result(true); - if (!rp) break; - rp->already_selected = true; - if (!proc_rsc.can_schedule(rp)) continue; - atp = lookup_active_task_by_result(rp); - can_run = schedule_if_possible( - rp, atp, proc_rsc, "coprocessor job, EDF" - ); - if (!can_run) continue; - if (rp->avp->ncudas) { - rp->project->cuda_pwf.deadlines_missed_copy--; - } else if (rp->avp->natis) { - rp->project->ati_pwf.deadlines_missed_copy--; - } - rp->edf_scheduled = true; - ordered_scheduled_results.push_back(rp); - } - - // then coproc jobs in FIFO order - // - while (!proc_rsc.stop_scan_coproc()) { - rp = first_coproc_result(); - if (!rp) break; - rp->already_selected = true; - if (!proc_rsc.can_schedule(rp)) continue; - atp = lookup_active_task_by_result(rp); - can_run = schedule_if_possible( - rp, atp, proc_rsc, "coprocessor job, FIFO" - ); - if (!can_run) continue; - ordered_scheduled_results.push_back(rp); - } + add_coproc_jobs(RSC_TYPE_CUDA, proc_rsc); + add_coproc_jobs(RSC_TYPE_ATI, proc_rsc); // choose CPU jobs from projects with CPU deadline misses // @@ -772,7 +792,7 @@ void CLIENT_STATE::schedule_cpus() { if (!cpu_sched_rr_only) { #endif while (!proc_rsc.stop_scan_cpu()) { - rp = earliest_deadline_result(false); + rp = earliest_deadline_result(RSC_TYPE_CPU); if (!rp) break; rp->already_selected = true; if (!proc_rsc.can_schedule(rp)) continue; diff --git a/client/sim.cpp b/client/sim.cpp index f295e322e7..b4061f7ad8 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -32,6 +32,12 @@ // log.txt // summary.xml // debt.dat +// debt_overall.png +// debt_cpu_std.png +// debt_cpu_ltd.png +// debt_nvidia_std.png +// debt_nvidia_ltd.png +// ... // // Simulation params: // [--duration x] @@ -254,6 +260,7 @@ void decrement_request_rsc( void decrement_request(RESULT* rp) { APP_VERSION* avp = rp->avp; double est_runtime = rp->wup->rsc_fpops_est/avp->flops; + est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac); decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime); decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime); decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime); @@ -264,7 +271,7 @@ void decrement_request(RESULT* rp) { // bool CLIENT_STATE::simulate_rpc(PROJECT* p) { char buf[256], buf2[256]; - static double last_time=-1e9; + static double last_time=0; vector ip_results; int infeasible_count = 0; vector new_results; @@ -334,6 +341,8 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { } msg_printf(0, MSG_INFO, "Got %d tasks", new_results.size()); + sprintf(buf, "got %d tasks
", new_results.size()); + html_msg += buf; SCHEDULER_REPLY sr; cpu_work_fetch.req_secs = save_cpu_req_secs; @@ -611,39 +620,41 @@ const char* colors[] = { "#000088", }; -void show_cuda() { - fprintf(html_out, ""); - bool found = false; - for (unsigned int k=0; kresult; - if (!rp->avp->ncudas) continue; - if (atp->task_state() != PROCESS_EXECUTING) continue; - PROJECT* p = rp->project; - fprintf(html_out, "%.2f: %s%s: %.2f
", - rp->avp->ncudas, - colors[p->index], - atp->result->rr_sim_misses_deadline?"*":"", - atp->result->name, - atp->cpu_time_left - ); - found = true; +int njobs_in_progress(PROJECT* p, int rsc_type) { + int n = 0; + unsigned int i; + for (i=0; iproject != p) continue; + if (rp->resource_type() != rsc_type) continue; + if (rp->state() > RESULT_FILES_DOWNLOADED) continue; + n++; } - if (!found) fprintf(html_out, "IDLE"); - fprintf(html_out, ""); + return n; } -void show_ati() { +void show_resource(int rsc_type) { + unsigned int i; + fprintf(html_out, ""); bool found = false; - for (unsigned int k=0; kresult; - if (!rp->avp->natis) continue; + if (rp->resource_type() != rsc_type) continue; if (atp->task_state() != PROCESS_EXECUTING) continue; PROJECT* p = rp->project; + double ninst; + if (rsc_type == RSC_TYPE_CUDA) { + ninst = rp->avp->ncudas; + } else if (rsc_type == RSC_TYPE_ATI) { + ninst = rp->avp->natis; + } else { + ninst = rp->avp->avg_ncpus; + } + fprintf(html_out, "%.2f: %s%s: %.2f
", - rp->avp->natis, + ninst, colors[p->index], atp->result->rr_sim_misses_deadline?"*":"", atp->result->name, @@ -652,6 +663,11 @@ void show_ati() { found = true; } if (!found) fprintf(html_out, "IDLE"); + for (i=0; i%s: %d jobs in progress\n", p->project_name, n); + } fprintf(html_out, ""); } @@ -674,11 +690,11 @@ void html_start() { fprintf(html_out, "CPU
Job name and estimated time left
color denotes project
* means EDF mode
" ); - if (gstate.host_info.coprocs.cuda.count) { + if (gstate.host_info.have_cuda()) { fprintf(html_out, "NVIDIA GPU"); nproc_types++; } - if (gstate.host_info.coprocs.ati.count) { + if (gstate.host_info.have_ati()) { fprintf(html_out, "ATI GPU"); nproc_types++; } @@ -686,45 +702,9 @@ void html_start() { } void html_rec() { - fprintf(html_out, "%s", time_to_string(gstate.now)); - - if (!running) { - fprintf(html_out, "OFF"); - if (gstate.host_info.coprocs.cuda.count) { - fprintf(html_out, "OFF"); - } - if (gstate.host_info.coprocs.ati.count) { - fprintf(html_out, "OFF"); - } - } else { - fprintf(html_out, ""); - double x=0; - for (unsigned int i=0; itask_state() == PROCESS_EXECUTING) { - PROJECT* p = atp->result->project; - fprintf(html_out, "(%.2f) %s%s: %.2f
", - atp->result->avp->avg_ncpus, - colors[p->index], - atp->result->rr_sim_misses_deadline?"*":"", - atp->result->name, - atp->cpu_time_left - ); - x += atp->result->avp->avg_ncpus; - } - } - if (x%s", time_to_string(gstate.now)); + //fprintf(html_out, "%s", time_to_string(gstate.now)); + fprintf(html_out, "%f", gstate.now); fprintf(html_out, "%s\n", nproc_types, @@ -732,6 +712,26 @@ void html_rec() { ); html_msg = ""; } + //fprintf(html_out, "%s", time_to_string(gstate.now)); + fprintf(html_out, "%f", gstate.now); + + if (!running) { + fprintf(html_out, "OFF"); + if (gstate.host_info.have_cuda()) { + fprintf(html_out, "OFF"); + } + if (gstate.host_info.have_ati()) { + fprintf(html_out, "OFF"); + } + } else { + show_resource(RSC_TYPE_CPU); + if (gstate.host_info.have_cuda()) { + show_resource(RSC_TYPE_CUDA); + } + if (gstate.host_info.have_ati()) { + show_resource(RSC_TYPE_ATI); + } + } } void html_end() { @@ -749,10 +749,10 @@ void html_end() { // per project: // overall LTD // CPU LTD -// [NVIDIA LTD] -// [ATI LTD] // CPU STD +// [NVIDIA LTD] // [NVIDIA STD] +// [ATI LTD] // [ATI STD] // void write_debts() { @@ -764,13 +764,13 @@ void write_debts() { p->cpu_pwf.long_term_debt, p->cpu_pwf.short_term_debt ); - if (gstate.host_info.coprocs.cuda.count) { + if (gstate.host_info.have_cuda()) { fprintf(debt_file, "%f %f ", p->cuda_pwf.long_term_debt, p->cuda_pwf.short_term_debt ); } - if (gstate.host_info.coprocs.ati.count) { + if (gstate.host_info.have_ati()) { fprintf(debt_file, "%f %f", p->ati_pwf.long_term_debt, p->ati_pwf.short_term_debt @@ -780,6 +780,49 @@ void write_debts() { fprintf(debt_file, "\n"); } +// generate a bunch of debt graphs +// + +void make_graph(const char* title, const char* fname, int field, int nfields) { + char gp_fname[256], cmd[256], png_fname[256]; + + sprintf(gp_fname, "%s%s.gp", output_file_prefix, fname); + FILE* f = fopen(gp_fname, "w"); + fprintf(f, + "set terminal png small size 1024, 768\n" + "set title \"%s\"\n" + "plot ", + title + ); + for (unsigned int i=0; iproject_name, + (i==gstate.projects.size()-1)?"\n":", \\\n" + ); + } + fclose(f); + sprintf(png_fname, "%s%s.png", output_file_prefix, fname); + sprintf(cmd, "gnuplot < %s > %s", gp_fname, png_fname); + system(cmd); +} + +void debt_graphs() { + int nfields = 3 + (gstate.host_info.have_cuda()?2:0) + (gstate.host_info.have_ati()?2:0); + make_graph("Overall debt", "debt_overall", 0, nfields); + make_graph("CPU LTD", "debt_cpu_ltd", 1, nfields); + make_graph("CPU STD", "debt_cpu_std", 2, nfields); + if (gstate.host_info.have_cuda()) { + make_graph("NVIDIA LTD", "debt_nvidia_ltd", 3, nfields); + make_graph("NVIDIA STD", "debt_nvidia_std", 4, nfields); + } + if (gstate.host_info.have_ati()) { + int off = gstate.host_info.have_cuda()?2:0; + make_graph("ATI LTD", "debt_ati_ltd", 3+off, nfields); + make_graph("ATI STD", "debt_ati_std", 4+off, nfields); + } +} + void simulate() { bool action; double start = START_TIME; @@ -951,6 +994,8 @@ void do_client_simulation() { // then other print_project_results(stdout); + + debt_graphs(); } int main(int argc, char** argv) { diff --git a/client/sim.h b/client/sim.h index 8b80303230..99f184a8eb 100644 --- a/client/sim.h +++ b/client/sim.h @@ -101,6 +101,7 @@ extern bool gpus_usable; //#define START_TIME 946684800 // Jan 1 2000 -#define START_TIME 0 +#define START_TIME 3600 + // should be at least an hour or so #endif diff --git a/client/sim_util.cpp b/client/sim_util.cpp index 01ae480c7f..c1710e4a1f 100644 --- a/client/sim_util.cpp +++ b/client/sim_util.cpp @@ -56,7 +56,6 @@ void HOST_INFO::generate_host_cpid() {} void show_message(PROJECT *p, char* msg, int priority, const char*) { const char* x; char message[1024]; - char* time_string = time_to_string(gstate.now); if (priority == MSG_INTERNAL_ERROR) { strcpy(message, "[error] "); @@ -74,7 +73,7 @@ void show_message(PROJECT *p, char* msg, int priority, const char*) { x = "---"; } - fprintf(logfile, "%s [%s] %s\n", time_string, x, message); + fprintf(logfile, "%.0f [%s] %s\n", gstate.now, x, message); } APP_CLIENT_SHM::APP_CLIENT_SHM() {} diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index 52f17372d9..53579e326a 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -757,7 +757,9 @@ void WORK_FETCH::print_state() { for (unsigned int i=0; inon_cpu_intensive) continue; - msg_printf(p, MSG_INFO, "[work_fetch] overall LTD %.2f", p->pwf.overall_debt); + msg_printf(p, MSG_INFO, "[work_fetch] overall LTD %.2f", + p->pwf.overall_debt + ); } msg_printf(0, MSG_INFO, "[work_fetch] ------- end work fetch state -------"); } diff --git a/lib/coproc.h b/lib/coproc.h index 042c7a9cdf..2e08430353 100644 --- a/lib/coproc.h +++ b/lib/coproc.h @@ -272,11 +272,6 @@ struct COPROCS { ); int parse(MIOFILE&); void summary_string(char*, int); - bool fully_used() { - if (cuda.used < cuda.count) return false; - if (ati.used < ati.count) return false; - return true; - } // Copy a coproc set, possibly setting usage to zero. // used in round-robin simulator and CPU scheduler,