- client: if an app version has a missing GPU,

we still need to keep track of the usage and write it to the state file.
- get client simulator working again

svn path=/trunk/boinc/; revision=23413
2011-04-20 23:32:14 +00:00 · 2011-04-20 23:32:14 +00:00 · 98b4c79e36
parent fa2c939df6
commit 98b4c79e36
5 changed files with 103 additions and 97 deletions
--- a/12
+++ b/12
@ -2371,3 +2371,15 @@ David  20 Apr 2011
            util.inc
        user/
            login_form.php
+
+David  20 Apr 2011
+    - client: if an app version has a missing GPU,
+        we still need to keep track of the usage
+        and write it to the state file.
+    - get client simulator working again
+
+    client/
+        client_types.cpp,h
+        sim.cpp
+    tools/
+        backend_lib.cpp
--- a/client/client_types.cpp
+++ b/client/client_types.cpp
@ -1334,6 +1334,7 @@ int APP_VERSION::parse(MIOFILE& in) {
                    gpu_usage.usage = cp.count;
                } else {
                    missing_coproc = true;
+                    missing_coproc_usage = cp.count;
                    strcpy(missing_coproc_name, cp.type);
                }
            } else {
@ -1394,6 +1395,16 @@ int APP_VERSION::write(MIOFILE& out, bool write_file_info) {
            gpu_usage.usage
        );
    }
+    if (missing_coproc) {
+        out.printf(
+            "    <coproc>\n"
+            "        <type>%s</type>\n"
+            "        <count>%f</count>\n"
+            "    </coproc>\n",
+            missing_coproc_name,
+            missing_coproc_usage
+        );
+    }
    if (gpu_ram) {
        out.printf(
            "    <gpu_ram>%f</gpu_ram>\n",
--- a/client/client_types.h
+++ b/client/client_types.h
@ -542,6 +542,7 @@ struct APP_VERSION {
        // so that we don't run a long sequence of jobs,
        // each of which turns out not to fit in available RAM
    bool missing_coproc;
+    double missing_coproc_usage;
    char missing_coproc_name[256];

    int index;  // temp var for make_scheduler_request()
--- a/client/sim.cpp
+++ b/client/sim.cpp
@ -78,8 +78,8 @@
 #define SCHED_RETRY_DELAY_MIN    60                // 1 minute
 #define SCHED_RETRY_DELAY_MAX    (60*60*4)         // 4 hours

-const char* infile_prefix = "";
-const char* outfile_prefix = "";
+const char* infile_prefix = ".";
+const char* outfile_prefix = ".";

 #define TIMELINE_FNAME "timeline.html"
 #define LOG_FNAME "log.txt"
@ -133,20 +133,20 @@ void usage(char* prog) {
 //
 double app_peak_flops(APP_VERSION* avp, double cpu_scale) {
    double x = avp->avg_ncpus*cpu_scale;
-    if (avp->ncudas) {
-        x += avp->ncudas * cuda_work_fetch.relative_speed;
-    }
-    if (avp->natis) {
-        x += avp->natis * ati_work_fetch.relative_speed;
+    int rt = avp->gpu_usage.rsc_type;
+    if (rt) {
+        x += avp->gpu_usage.usage * rsc_work_fetch[rt].relative_speed;
    }
    x *= gstate.host_info.p_fpops;
    return x;
 }

 double gpu_peak_flops() {
-    double cuda = gstate.host_info.coprocs.cuda.count * cuda_work_fetch.relative_speed * gstate.host_info.p_fpops;
-    double ati = gstate.host_info.coprocs.ati.count * ati_work_fetch.relative_speed * gstate.host_info.p_fpops;
-    return cuda + ati;
+    double x = 0;
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        x += coprocs.coprocs[i].count * rsc_work_fetch[i].relative_speed * gstate.host_info.p_fpops;
+    }
+    return x;
 }

 double cpu_peak_flops() {
@ -184,13 +184,11 @@ APP* choose_app(vector<APP*>& apps) {

 bool app_version_needs_work(APP_VERSION* avp) {
    if (avp->dont_use) return false;
-    if (avp->ncudas) {
-        return (cuda_work_fetch.req_secs>0 || cuda_work_fetch.req_instances>0);
+    int rt = avp->gpu_usage.rsc_type;
+    if (rt) {
+        return (rsc_work_fetch[rt].req_secs>0 || rsc_work_fetch[rt].req_instances>0);
    }
-    if (avp->natis) {
-        return (ati_work_fetch.req_secs>0 || ati_work_fetch.req_instances>0);
-    }
-    return (cpu_work_fetch.req_secs>0 || cpu_work_fetch.req_instances>0);
+    return (rsc_work_fetch[0].req_secs>0 || rsc_work_fetch[0].req_instances>0);
 }

 bool has_app_version_needing_work(APP* app) {
@ -326,19 +324,16 @@ void decrement_request(RESULT* rp) {
    APP_VERSION* avp = rp->avp;
    double est_runtime = rp->wup->rsc_fpops_est/avp->flops;
    est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac);
-    decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime);
-    decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime);
-    decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime);
+    decrement_request_rsc(rsc_work_fetch[0], avp->avg_ncpus, est_runtime);
+    int rt = avp->gpu_usage.rsc_type;
+    if (rt) {
+        decrement_request_rsc(rsc_work_fetch[rt], avp->gpu_usage.usage, est_runtime);
+    }
 }

 double get_estimated_delay(RESULT* rp) {
-    if (rp->avp->ncudas) {
-        return cuda_work_fetch.estimated_delay;
-    } else if (rp->avp->natis) {
-        return ati_work_fetch.estimated_delay;
-    } else {
-        return cpu_work_fetch.estimated_delay;
-    }
+    int rt = rp->avp->gpu_usage.rsc_type;
+    return rsc_work_fetch[rt].estimated_delay;
 }

 // simulate trying to do an RPC;
@ -352,14 +347,21 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {

    // save request params for WORK_FETCH::handle_reply
    //
-    double save_cpu_req_secs = cpu_work_fetch.req_secs;
-    host_info.coprocs.cuda.req_secs = cuda_work_fetch.req_secs;
-    host_info.coprocs.ati.req_secs = ati_work_fetch.req_secs;
+    double save_cpu_req_secs = rsc_work_fetch[0].req_secs;
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        COPROC& cp = coprocs.coprocs[i];
+        if (!strcmp(cp.type, "NVIDIA")) {
+            coprocs.nvidia.req_secs = rsc_work_fetch[i].req_secs;
+        }
+        if (!strcmp(cp.type, "ATI")) {
+            coprocs.ati.req_secs = rsc_work_fetch[i].req_secs;
+        }
+    }

    if (!server_uses_workload) {
-        cpu_work_fetch.estimated_delay = cpu_work_fetch.busy_time_estimator.get_busy_time();
-        cuda_work_fetch.estimated_delay = cuda_work_fetch.busy_time_estimator.get_busy_time();
-        ati_work_fetch.estimated_delay = ati_work_fetch.busy_time_estimator.get_busy_time();
+        for (int i=0; i<coprocs.n_rsc; i++) {   // start at 0: the CPU entry's delay must be refreshed too
+            rsc_work_fetch[i].estimated_delay = rsc_work_fetch[i].busy_time_estimator.get_busy_time();
+        }
    }

    for (unsigned int i=0; i<app_versions.size(); i++) {
@ -430,7 +432,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
    html_msg += buf;

    SCHEDULER_REPLY sr;
-    cpu_work_fetch.req_secs = save_cpu_req_secs;
+    rsc_work_fetch[0].req_secs = save_cpu_req_secs;
    work_fetch.handle_reply(p, &sr, new_results);
    p->nrpc_failures = 0;
    p->sched_rpc_pending = false;
@ -800,13 +802,9 @@ void show_resource(int rsc_type) {
        if (rsc_type!=RSC_TYPE_CPU && rp->resource_type() != rsc_type) continue;
        if (atp->task_state() != PROCESS_EXECUTING) continue;
        PROJECT* p = rp->project;
-        double ninst;
-        if (rsc_type == RSC_TYPE_CUDA) {
-            ninst = rp->avp->ncudas;
-        } else if (rsc_type == RSC_TYPE_ATI) {
-            ninst = rp->avp->natis;
-        } else {
-            ninst = rp->avp->avg_ncpus;
+        double ninst = rp->avp->avg_ncpus;   // CPU column: report the job's CPU usage, as before this change
+        if (rsc_type != RSC_TYPE_CPU) {      // GPU column: report usage only if the job uses this GPU type
+            ninst = (rsc_type == rp->avp->gpu_usage.rsc_type) ? rp->avp->gpu_usage.usage : 0;
        }

        fprintf(html_out, "%.2f: <font color=%s>%s%s: %.2fG</font><br>",
@ -856,11 +854,11 @@ void html_start() {
    fprintf(html_out,
        "<th width=%d>CPU<br><font size=-2>Job name and estimated time left<br>color denotes project<br>* means EDF mode</font></th>", WIDTH2
    );
-    if (gstate.host_info.have_cuda()) {
+    if (coprocs.have_nvidia()) {
        fprintf(html_out, "<th width=%d>NVIDIA GPU</th>", WIDTH2);
        nproc_types++;
    }
-    if (gstate.host_info.have_ati()) {
+    if (coprocs.have_ati()) {
        fprintf(html_out, "<th width=%d>ATI GPU</th>", WIDTH2);
        nproc_types++;
    }
@ -885,26 +883,17 @@ void html_rec() {
    if (active) {
        show_resource(RSC_TYPE_CPU);
        if (gpu_active) {
-            if (gstate.host_info.have_cuda()) {
-                show_resource(RSC_TYPE_CUDA);
-            }
-            if (gstate.host_info.have_ati()) {
-                show_resource(RSC_TYPE_ATI);
+            for (int i=1; i<coprocs.n_rsc; i++) {
+                show_resource(i);
            }
        } else {
-            if (gstate.host_info.have_cuda()) {
-                fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
-            }
-            if (gstate.host_info.have_ati()) {
+            for (int i=1; i<coprocs.n_rsc; i++) {
                fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
            }
        }
    } else {
        fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
-        if (gstate.host_info.have_cuda()) {
-            fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
-        }
-        if (gstate.host_info.have_ati()) {
+        for (int i=1; i<coprocs.n_rsc; i++) {
            fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
        }
    }
@ -990,19 +979,13 @@ void write_debts() {
        PROJECT* p = gstate.projects[i];
        fprintf(debt_file, "%f %f %f ",
            p->pwf.overall_debt,
-            p->cpu_pwf.long_term_debt,
-            p->cpu_pwf.short_term_debt
+            p->rsc_pwf[0].long_term_debt,
+            p->rsc_pwf[0].short_term_debt
        );
-        if (gstate.host_info.have_cuda()) {
+        for (int j=1; j<coprocs.n_rsc; j++) {
            fprintf(debt_file, "%f %f ",
-                p->cuda_pwf.long_term_debt,
-                p->cuda_pwf.short_term_debt
-            );
-        }
-        if (gstate.host_info.have_ati()) {
-            fprintf(debt_file, "%f %f",
-                p->ati_pwf.long_term_debt,
-                p->ati_pwf.short_term_debt
+                p->rsc_pwf[j].long_term_debt,
+                p->rsc_pwf[j].short_term_debt
            );
        }
    }
@ -1038,16 +1021,16 @@ void make_graph(const char* title, const char* fname, int field, int nfields) {
 }

 void debt_graphs() {
-    int nfields = 3 + (gstate.host_info.have_cuda()?2:0) + (gstate.host_info.have_ati()?2:0);
+    int nfields = 3 + (coprocs.have_nvidia()?2:0) + (coprocs.have_ati()?2:0);
    make_graph("Overall debt", "debt_overall", 0, nfields);
    make_graph("CPU LTD", "debt_cpu_ltd", 1, nfields);
    make_graph("CPU STD", "debt_cpu_std", 2, nfields);
-    if (gstate.host_info.have_cuda()) {
+    if (coprocs.have_nvidia()) {
        make_graph("NVIDIA LTD", "debt_nvidia_ltd", 3, nfields);
        make_graph("NVIDIA STD", "debt_nvidia_std", 4, nfields);
    }
-    if (gstate.host_info.have_ati()) {
-        int off = gstate.host_info.have_cuda()?2:0;
+    if (coprocs.have_ati()) {
+        int off = coprocs.have_nvidia()?2:0;
        make_graph("ATI LTD", "debt_ati_ltd", 3+off, nfields);
        make_graph("ATI STD", "debt_ati_std", 4+off, nfields);
    }
@ -1091,11 +1074,12 @@ void simulate() {
        "hardware\n   %d CPUs, %fG\n",
        gstate.host_info.p_ncpus, gstate.host_info.p_fpops/1e9
    );
-    if (gstate.host_info.have_cuda()) {
+    for (int i=1; i<coprocs.n_rsc; i++) {
        fprintf(summary_file,
-            "   %d GPUs, %fG\n",
-            gstate.host_info.coprocs.cuda.count,
-            gstate.host_info.coprocs.cuda.peak_flops/1e9
+            "   %d %s GPUs, %fG\n",
+            coprocs.coprocs[i].count,
+            coprocs.coprocs[i].type,
+            coprocs.coprocs[i].peak_flops/1e9
        );
    }
    fprintf(summary_file,
@ -1189,9 +1173,9 @@ void show_app(APP* app) {
        APP_VERSION* avp = gstate.app_versions[i];
        if (avp->app != app) continue;
        fprintf(summary_file,
-            "      app version %s %d (%s): ncpus %.2f ncuda %.2f nati %.2f flops %.0fG\n",
+            "      app version %s %d (%s): ncpus %.2f rsc %d usage %.2f flops %.0fG\n",
            avp->app_name, avp->version_num, avp->plan_class,
-            avp->avg_ncpus, avp->ncudas, avp->natis,
+            avp->avg_ncpus, avp->gpu_usage.rsc_type, avp->gpu_usage.usage,
            avp->flops/1e9
        );
    }
@ -1263,9 +1247,9 @@ void clear_backoff() {
    unsigned int i;
    for (i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
-        p->cpu_pwf.reset();
-        p->cuda_pwf.reset();
-        p->ati_pwf.reset();
+        for (int j=0; j<coprocs.n_rsc; j++) {
+            p->rsc_pwf[j].reset();
+        }
        p->min_rpc_time = 0;
    }
 }
@ -1280,20 +1264,15 @@ void cull_projects() {
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        p->dont_request_more_work = true;
-        p->no_cpu_apps = true;
-        p->no_cuda_apps = true;
-        p->no_ati_apps = true;
+        for (int j=0; j<coprocs.n_rsc; j++) {
+            p->no_rsc_apps[j] = true;
+        }
    }
    for (i=0; i<gstate.app_versions.size(); i++) {
        APP_VERSION* avp = gstate.app_versions[i];
        if (avp->app->ignore) continue;
-        if (avp->ncudas) {
-            avp->project->no_cuda_apps = false;
-        } else if (avp->natis) {
-            avp->project->no_ati_apps = false;
-        } else {
-            avp->project->no_cpu_apps = false;
-        }
+        int rt = avp->gpu_usage.rsc_type;
+        avp->project->no_rsc_apps[rt] = false;
    }
    for (i=0; i<gstate.apps.size(); i++) {
        APP* app = gstate.apps[i];
@ -1317,12 +1296,12 @@ void cull_projects() {
    }
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
-        fprintf(summary_file, "%s: %s%s%s\n",
-            p->project_name,
-            p->no_cpu_apps?" no CPU apps":"",
-            p->no_cuda_apps?" no nvidia apps":"",
-            p->no_ati_apps?" no ATI apps":""
-        );
+        fprintf(summary_file, "%s: ", p->project_name);
+        for (int j=0; j<coprocs.n_rsc; j++) {
+            if (!p->no_rsc_apps[j]) continue;   // only list resources with no apps
+            fprintf(summary_file, " no %s apps", coprocs.coprocs[j].type);
+        }
+        fprintf(summary_file, "\n");   // end the per-project line, as the old single fprintf did
    }
 }

--- a/tools/backend_lib.cpp
+++ b/tools/backend_lib.cpp
@ -583,7 +583,7 @@ int create_work(
        wu, wu_template, infiles, ninfiles, config_loc, command_line, additional_xml
    );
    if (retval) {
-        fprintf(stderr, "process_wu_template: %d\n", retval);
+        fprintf(stderr, "process_wu_template(): %d\n", retval);
        return retval;
    }

@ -591,7 +591,10 @@ int create_work(
        result_template_filepath, _result_template, sizeof(_result_template)
    );
    if (retval) {
-        fprintf(stderr, "create_work: can't read result template file %s\n", result_template_filepath);
+        fprintf(stderr,
+            "create_work: can't read result template file %s\n",
+            result_template_filepath
+        );
        return retval;
    }