diff --git a/checkin_notes b/checkin_notes index c33dc35789..59d2a48dd2 100644 --- a/checkin_notes +++ b/checkin_notes @@ -2371,3 +2371,15 @@ David 20 Apr 2011 util.inc user/ login_form.php + +David 20 Apr 2011 + - client: if an app version has a missing GPU, + we still need to keep track of the usage + and write it to the state file. + - get client simulator working again + + client/ + client_types.cpp,h + sim.cpp + tools/ + backend_lib.cpp diff --git a/client/client_types.cpp b/client/client_types.cpp index dba6b9794c..fb32058fd8 100644 --- a/client/client_types.cpp +++ b/client/client_types.cpp @@ -1334,6 +1334,7 @@ int APP_VERSION::parse(MIOFILE& in) { gpu_usage.usage = cp.count; } else { missing_coproc = true; + missing_coproc_usage = cp.count; strcpy(missing_coproc_name, cp.type); } } else { @@ -1394,6 +1395,16 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) { gpu_usage.usage ); } + if (missing_coproc) { + out.printf( + " \n" + " %s\n" + " %f\n" + " \n", + missing_coproc_name, + missing_coproc_usage + ); + } if (gpu_ram) { out.printf( " %f\n", diff --git a/client/client_types.h b/client/client_types.h index 725ebb15bb..b7ee38fbfe 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -542,6 +542,7 @@ struct APP_VERSION { // so that we don't run a long sequence of jobs, // each of which turns out not to fit in available RAM bool missing_coproc; + double missing_coproc_usage; char missing_coproc_name[256]; int index; // temp var for make_scheduler_request() diff --git a/client/sim.cpp b/client/sim.cpp index f82a225b22..7701409dd7 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -78,8 +78,8 @@ #define SCHED_RETRY_DELAY_MIN 60 // 1 minute #define SCHED_RETRY_DELAY_MAX (60*60*4) // 4 hours -const char* infile_prefix = ""; -const char* outfile_prefix = ""; +const char* infile_prefix = "."; +const char* outfile_prefix = "."; #define TIMELINE_FNAME "timeline.html" #define LOG_FNAME "log.txt" @@ -133,20 +133,20 @@ void usage(char* prog) { // double app_peak_flops(APP_VERSION* avp, double cpu_scale) { double x = avp->avg_ncpus*cpu_scale; - if (avp->ncudas) { - x += avp->ncudas * cuda_work_fetch.relative_speed; - } - if (avp->natis) { - x += avp->natis * ati_work_fetch.relative_speed; + int rt = avp->gpu_usage.rsc_type; + if (rt) { + x += avp->gpu_usage.usage * rsc_work_fetch[rt].relative_speed; } x *= gstate.host_info.p_fpops; return x; } double gpu_peak_flops() { - double cuda = gstate.host_info.coprocs.cuda.count * cuda_work_fetch.relative_speed * gstate.host_info.p_fpops; - double ati = gstate.host_info.coprocs.ati.count * ati_work_fetch.relative_speed * gstate.host_info.p_fpops; - return cuda + ati; + double x = 0; + for (int i=1; i& apps) { bool app_version_needs_work(APP_VERSION* avp) { if (avp->dont_use) return false; - if (avp->ncudas) { - return (cuda_work_fetch.req_secs>0 || cuda_work_fetch.req_instances>0); + int rt = avp->gpu_usage.rsc_type; + if (rt) { + return (rsc_work_fetch[rt].req_secs>0 || rsc_work_fetch[rt].req_instances>0); } - if (avp->natis) { - return (ati_work_fetch.req_secs>0 || ati_work_fetch.req_instances>0); - } - return (cpu_work_fetch.req_secs>0 || cpu_work_fetch.req_instances>0); + return (rsc_work_fetch[0].req_secs>0 || rsc_work_fetch[0].req_instances>0); } bool has_app_version_needing_work(APP* app) { @@ -326,19 +324,16 @@ void decrement_request(RESULT* rp) { APP_VERSION* avp = rp->avp; double est_runtime = rp->wup->rsc_fpops_est/avp->flops; est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac); - decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime); - decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime); - decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime); + decrement_request_rsc(rsc_work_fetch[0], avp->avg_ncpus, est_runtime); + int rt = avp->gpu_usage.rsc_type; + if (rt) { + decrement_request_rsc(rsc_work_fetch[rt], avp->gpu_usage.usage, est_runtime); + } } double get_estimated_delay(RESULT* rp) { - if (rp->avp->ncudas) { - return cuda_work_fetch.estimated_delay; - } else if (rp->avp->natis) { - return ati_work_fetch.estimated_delay; - } else { - return cpu_work_fetch.estimated_delay; - } + int rt = rp->avp->gpu_usage.rsc_type; + return rsc_work_fetch[rt].estimated_delay; } // simulate trying to do an RPC; @@ -352,14 +347,21 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { // save request params for WORK_FETCH::handle_reply // - double save_cpu_req_secs = cpu_work_fetch.req_secs; - host_info.coprocs.cuda.req_secs = cuda_work_fetch.req_secs; - host_info.coprocs.ati.req_secs = ati_work_fetch.req_secs; + double save_cpu_req_secs = rsc_work_fetch[0].req_secs; + for (int i=1; inrpc_failures = 0; p->sched_rpc_pending = false; @@ -800,13 +802,9 @@ void show_resource(int rsc_type) { if (rsc_type!=RSC_TYPE_CPU && rp->resource_type() != rsc_type) continue; if (atp->task_state() != PROCESS_EXECUTING) continue; PROJECT* p = rp->project; - double ninst; - if (rsc_type == RSC_TYPE_CUDA) { - ninst = rp->avp->ncudas; - } else if (rsc_type == RSC_TYPE_ATI) { - ninst = rp->avp->natis; - } else { - ninst = rp->avp->avg_ncpus; + double ninst=0; + if (rsc_type == rp->avp->gpu_usage.rsc_type) { + ninst = rp->avp->gpu_usage.usage; } fprintf(html_out, "%.2f: %s%s: %.2fG
", @@ -856,11 +854,11 @@ void html_start() { fprintf(html_out, "CPU
Job name and estimated time left
color denotes project
* means EDF mode
", WIDTH2 ); - if (gstate.host_info.have_cuda()) { + if (coprocs.have_nvidia()) { fprintf(html_out, "NVIDIA GPU", WIDTH2); nproc_types++; } - if (gstate.host_info.have_ati()) { + if (coprocs.have_ati()) { fprintf(html_out, "ATI GPU", WIDTH2); nproc_types++; } @@ -885,26 +883,17 @@ void html_rec() { if (active) { show_resource(RSC_TYPE_CPU); if (gpu_active) { - if (gstate.host_info.have_cuda()) { - show_resource(RSC_TYPE_CUDA); - } - if (gstate.host_info.have_ati()) { - show_resource(RSC_TYPE_ATI); + for (int i=1; iOFF", WIDTH2); - } - if (gstate.host_info.have_ati()) { + for (int i=1; iOFF", WIDTH2); } } } else { fprintf(html_out, "OFF", WIDTH2); - if (gstate.host_info.have_cuda()) { - fprintf(html_out, "OFF", WIDTH2); - } - if (gstate.host_info.have_ati()) { + for (int i=1; iOFF", WIDTH2); } } @@ -990,19 +979,13 @@ void write_debts() { PROJECT* p = gstate.projects[i]; fprintf(debt_file, "%f %f %f ", p->pwf.overall_debt, - p->cpu_pwf.long_term_debt, - p->cpu_pwf.short_term_debt + p->rsc_pwf[0].long_term_debt, + p->rsc_pwf[0].short_term_debt ); - if (gstate.host_info.have_cuda()) { + for (int j=1; jcuda_pwf.long_term_debt, - p->cuda_pwf.short_term_debt - ); - } - if (gstate.host_info.have_ati()) { - fprintf(debt_file, "%f %f", - p->ati_pwf.long_term_debt, - p->ati_pwf.short_term_debt + p->rsc_pwf[j].long_term_debt, + p->rsc_pwf[j].short_term_debt ); } } @@ -1038,16 +1021,16 @@ void make_graph(const char* title, const char* fname, int field, int nfields) { } void debt_graphs() { - int nfields = 3 + (gstate.host_info.have_cuda()?2:0) + (gstate.host_info.have_ati()?2:0); + int nfields = 3 + (coprocs.have_nvidia()?2:0) + (coprocs.have_ati()?2:0); make_graph("Overall debt", "debt_overall", 0, nfields); make_graph("CPU LTD", "debt_cpu_ltd", 1, nfields); make_graph("CPU STD", "debt_cpu_std", 2, nfields); - if (gstate.host_info.have_cuda()) { + if (coprocs.have_nvidia()) { make_graph("NVIDIA LTD", "debt_nvidia_ltd", 3, nfields); make_graph("NVIDIA STD", "debt_nvidia_std", 4, nfields); } - if (gstate.host_info.have_ati()) { - int off = gstate.host_info.have_cuda()?2:0; + if (coprocs.have_ati()) { + int off = coprocs.have_nvidia()?2:0; make_graph("ATI LTD", "debt_ati_ltd", 3+off, nfields); make_graph("ATI STD", "debt_ati_std", 4+off, nfields); } @@ -1091,11 +1074,12 @@ void simulate() { "hardware\n %d CPUs, %fG\n", gstate.host_info.p_ncpus, gstate.host_info.p_fpops/1e9 ); - if (gstate.host_info.have_cuda()) { + for (int i=1; iapp != app) continue; fprintf(summary_file, - " app version %s %d (%s): ncpus %.2f ncuda %.2f nati %.2f flops %.0fG\n", + " app version %s %d (%s): ncpus %.2f rsc %d usage %.2f flops %.0fG\n", avp->app_name, avp->version_num, avp->plan_class, - avp->avg_ncpus, avp->ncudas, avp->natis, + avp->avg_ncpus, avp->gpu_usage.rsc_type, avp->gpu_usage.usage, avp->flops/1e9 ); } @@ -1263,9 +1247,9 @@ void clear_backoff() { unsigned int i; for (i=0; icpu_pwf.reset(); - p->cuda_pwf.reset(); - p->ati_pwf.reset(); + for (int j=0; jrsc_pwf[j].reset(); + } p->min_rpc_time = 0; } } @@ -1280,20 +1264,15 @@ void cull_projects() { for (i=0; idont_request_more_work = true; - p->no_cpu_apps = true; - p->no_cuda_apps = true; - p->no_ati_apps = true; + for (int j=0; jno_rsc_apps[j] = true; + } } for (i=0; iapp->ignore) continue; - if (avp->ncudas) { - avp->project->no_cuda_apps = false; - } else if (avp->natis) { - avp->project->no_ati_apps = false; - } else { - avp->project->no_cpu_apps = false; - } + int rt = avp->gpu_usage.rsc_type; + avp->project->no_rsc_apps[rt] = false; } for (i=0; iproject_name, - p->no_cpu_apps?" no CPU apps":"", - p->no_cuda_apps?" no nvidia apps":"", - p->no_ati_apps?" no ATI apps":"" - ); + fprintf(summary_file, "%s: ", p->project_name); + for (int j=0; jno_rsc_apps[j]) { + fprintf(summary_file, " no %s apps", coprocs.coprocs[j].type); + } + } } } diff --git a/tools/backend_lib.cpp b/tools/backend_lib.cpp index 98bcb4f232..4b7225770b 100644 --- a/tools/backend_lib.cpp +++ b/tools/backend_lib.cpp @@ -583,7 +583,7 @@ int create_work( wu, wu_template, infiles, ninfiles, config_loc, command_line, additional_xml ); if (retval) { - fprintf(stderr, "process_wu_template: %d\n", retval); + fprintf(stderr, "process_wu_template(): %d\n", retval); return retval; } @@ -591,7 +591,10 @@ int create_work( result_template_filepath, _result_template, sizeof(_result_template) ); if (retval) { - fprintf(stderr, "create_work: can't read result template file %s\n", result_template_filepath); + fprintf(stderr, + "create_work: can't read result template file %s\n", + result_template_filepath + ); return retval; }