mirror of https://github.com/BOINC/boinc.git
Merge pull request #3001 from BOINC/dpa_opencl
client: fix possible overflow in peak FLOPS calculation for OpenCL GPUs
commit dc70ff9100
@@ -377,6 +377,9 @@ void COPROC_ATI::get(
         cc.atirt_detected = atirt_detected;
         cc.device_num = i;
         cc.set_peak_flops();
+        if (cc.bad_gpu_peak_flops("CAL", s)) {
+            warnings.push_back(s);
+        }
         get_available_ati_ram(cc, warnings);
         ati_gpus.push_back(cc);
     }
@@ -240,6 +240,7 @@ void COPROC_NVIDIA::get(
     char buf[256];
     int j, itemp;
     size_t global_mem = 0;
+    string s;
     COPROC_NVIDIA cc;
 
 #ifdef _WIN32
@@ -444,6 +445,9 @@ void* cudalib = NULL;
         cc.cuda_version = cuda_version;
         cc.device_num = j;
         cc.set_peak_flops();
+        if (cc.bad_gpu_peak_flops("CUDA", s)) {
+            warnings.push_back(s);
+        }
         get_available_nvidia_ram(cc, warnings);
         nvidia_gpus.push_back(cc);
     }
@@ -202,6 +202,7 @@ void COPROCS::get_opencl(
     vector<int>devnums_pci_slot_sort;
     vector<OPENCL_DEVICE_PROP>::iterator it;
     int max_other_coprocs = MAX_RSC-1;  // coprocs[0] is reserved for CPU
+    string s;
 
     if (cc_config.no_opencl) {
         return;
@@ -486,6 +487,9 @@ void COPROCS::get_opencl(
             COPROC_NVIDIA c;
             c.opencl_prop = prop;
             c.set_peak_flops();
+            if (c.bad_gpu_peak_flops("NVIDIA OpenCL", s)) {
+                warnings.push_back(s);
+            }
             prop.peak_flops = c.peak_flops;
         }
         if (cuda_match_found) {
@@ -552,6 +556,9 @@ void COPROCS::get_opencl(
             COPROC_ATI c;
             c.opencl_prop = prop;
             c.set_peak_flops();
+            if (c.bad_gpu_peak_flops("AMD OpenCL", s)) {
+                warnings.push_back(s);
+            }
             prop.peak_flops = c.peak_flops;
         }
 
@@ -576,6 +583,9 @@ void COPROCS::get_opencl(
             safe_strcpy(c.version, prop.opencl_driver_version);
 
             c.set_peak_flops();
+            if (c.bad_gpu_peak_flops("Intel OpenCL", s)) {
+                warnings.push_back(s);
+            }
             prop.peak_flops = c.peak_flops;
             prop.opencl_available_ram = prop.global_mem_size;
 
@@ -606,12 +616,22 @@ void COPROCS::get_opencl(
         prop.opencl_available_ram = prop.global_mem_size;
         prop.is_used = COPROC_USED;
 
-        // TODO: Find a better way to calculate / estimate peak_flops for future coprocessors?
+        // TODO: is there a better way to estimate peak_flops?
+        //
         prop.peak_flops = 0;
         if (prop.max_compute_units) {
-            prop.peak_flops = prop.max_compute_units * prop.max_clock_frequency * MEGA;
+            double freq = ((double)prop.max_clock_frequency) * MEGA;
+            prop.peak_flops = ((double)prop.max_compute_units) * freq;
         }
-        if (prop.peak_flops <= 0) prop.peak_flops = 45e9;
+        if (prop.peak_flops <= 0 || prop.peak_flops > GPU_MAX_PEAK_FLOPS) {
+            char buf2[256];
+            sprintf(buf2,
+                "OpenCL generic: bad peak FLOPS; Max units %d, max freq %d MHz",
+                prop.max_compute_units, prop.max_clock_frequency
+            );
+            warnings.push_back(buf2);
+            prop.peak_flops = GPU_DEFAULT_PEAK_FLOPS;
+        }
 
         other_opencls.push_back(prop);
     }
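The hunk above is the heart of the fix named in the commit title: max_compute_units and max_clock_frequency are reported as 32-bit integer device properties, so the old expression formed their product in integer arithmetic before MEGA widened it to double, and implausible values from a misbehaving OpenCL driver could wrap. A minimal stand-alone sketch of the failure mode and the promote-to-double-first remedy (the magnitudes are hypothetical, and this is illustration code, not the BOINC sources):

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t max_compute_units   = 4000000;   // hypothetical garbage from a buggy driver
    uint32_t max_clock_frequency = 2000000;   // MHz, also hypothetical garbage
    const double MEGA = 1e6;                  // stands in for BOINC's MEGA constant

    // Old pattern: the 32-bit product wraps before it is widened to double.
    double old_style = max_compute_units * max_clock_frequency * MEGA;

    // New pattern (as in the hunk above): promote to double before multiplying.
    double freq = ((double)max_clock_frequency) * MEGA;
    double new_style = ((double)max_compute_units) * freq;

    printf("wrapped:  %g\n", old_style);   // wrong: the product wrapped modulo 2^32
    printf("intended: %g\n", new_style);   // 8e+18
    return 0;
}

Either result still fails the new GPU_MAX_PEAK_FLOPS sanity check, so the generic-OpenCL path now also emits a warning and falls back to GPU_DEFAULT_PEAK_FLOPS instead of keeping the wrapped number.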
@@ -600,7 +600,7 @@ void COPROC_NVIDIA::set_peak_flops() {
         //
         x = opencl_prop.max_compute_units * 48 * 2 * opencl_prop.max_clock_frequency * 1e6;
     }
-    peak_flops = (x>0)?x:5e10;
+    peak_flops = x;
 }
 
 // fake a NVIDIA GPU (for debugging)
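For orientation, the OpenCL fallback in COPROC_NVIDIA::set_peak_flops() shown above estimates peak FLOPS as max_compute_units * 48 * 2 * max_clock_frequency * 1e6; the 48 * 2 factor appears to assume roughly 48 cores per compute unit, each doing one FMA (2 FLOPs) per clock. A worked example with hypothetical device numbers, not taken from any real GPU:

#include <cstdio>

int main() {
    int max_compute_units   = 16;     // hypothetical
    int max_clock_frequency = 1500;   // MHz, hypothetical
    // Evaluates the heuristic from the hunk above.
    double x = max_compute_units * 48 * 2 * max_clock_frequency * 1e6;
    printf("estimated NVIDIA OpenCL peak: %.4g FLOPS\n", x);   // 2.304e+12
    return 0;
}

The change in this commit is only the last line of each set_peak_flops(): the per-class fallback (x>0)?x:5e10 / 45e9 is dropped, and callers now run the shared bad_gpu_peak_flops() check instead.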
@@ -868,7 +868,7 @@ void COPROC_ATI::set_peak_flops() {
         //
         x = opencl_prop.max_compute_units * 16 * 5 * opencl_prop.max_clock_frequency * 1e6;
     }
-    peak_flops = (x>0)?x:5e10;
+    peak_flops = x;
 }
 
 void COPROC_ATI::fake(double ram, double avail_ram, int n) {
@@ -980,7 +980,7 @@ void COPROC_INTEL::set_peak_flops() {
     if (opencl_prop.max_compute_units) {
         x = opencl_prop.max_compute_units * 8 * opencl_prop.max_clock_frequency * 1e6;
     }
-    peak_flops = (x>0)?x:45e9;
+    peak_flops = x;
 }
 
 void COPROC_INTEL::fake(double ram, double avail_ram, int n) {
lib/coproc.h (23 changed lines)
@@ -91,6 +91,14 @@
 #define MAX_COPROC_INSTANCES 64
 #define MAX_RSC 8
     // max # of processing resources types
+#define GPU_MAX_PEAK_FLOPS 1.e15
+    // sanity-check bound for peak FLOPS
+    // for now (Feb 2019) 1000 TeraFLOPS.
+    // As of now, the fastest GPU is 20 TeraFLOPS (NVIDIA).
+    // May need to increase this at some point
+#define GPU_DEFAULT_PEAK_FLOPS 100.e9
+    // value to use if sanity check fails
+    // as of now (Feb 2019) 100 GigaFLOPS is a typical low-end GPU
 
 // arguments to proc_type_name() and proc_type_name_xml().
 //
@@ -246,6 +254,21 @@ struct COPROC {
         std::vector<OPENCL_DEVICE_PROP> &opencls,
         std::vector<int>& ignore_dev
     );
+
+    // sanity check GPU peak FLOPS
+    //
+    inline bool bad_gpu_peak_flops(const char* source, std::string& msg) {
+        if (peak_flops <= 0 || peak_flops > GPU_MAX_PEAK_FLOPS) {
+            char buf[256];
+            sprintf(buf, "%s reported bad GPU peak FLOPS %f; using %f",
+                source, peak_flops, GPU_DEFAULT_PEAK_FLOPS
+            );
+            msg = buf;
+            peak_flops = GPU_DEFAULT_PEAK_FLOPS;
+            return true;
+        }
+        return false;
+    }
 };
 
 // Based on cudaDeviceProp from /usr/local/cuda/include/driver_types.h
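The new inline helper can be exercised on its own. Below is a minimal sketch with a stand-in struct (not the BOINC sources) that reuses the same bounds and message format, showing how a detection routine clamps an implausible value and collects the warning, mirroring the set_peak_flops() / bad_gpu_peak_flops() / warnings.push_back() pattern in the client hunks:

#include <cstdio>
#include <string>
#include <vector>

// Same bounds as the new defines in lib/coproc.h.
#define GPU_MAX_PEAK_FLOPS 1.e15
#define GPU_DEFAULT_PEAK_FLOPS 100.e9

// Minimal stand-in for COPROC, just enough to run the sanity check.
struct FAKE_COPROC {
    double peak_flops = 0;
    bool bad_gpu_peak_flops(const char* source, std::string& msg) {
        if (peak_flops <= 0 || peak_flops > GPU_MAX_PEAK_FLOPS) {
            char buf[256];
            snprintf(buf, sizeof(buf),
                "%s reported bad GPU peak FLOPS %f; using %f",
                source, peak_flops, GPU_DEFAULT_PEAK_FLOPS
            );
            msg = buf;
            peak_flops = GPU_DEFAULT_PEAK_FLOPS;
            return true;
        }
        return false;
    }
};

int main() {
    std::vector<std::string> warnings;
    std::string s;

    FAKE_COPROC cc;
    cc.peak_flops = 2.77e15;   // implausible, e.g. the result of a wrapped product
    if (cc.bad_gpu_peak_flops("Example OpenCL", s)) {
        warnings.push_back(s);                 // same pattern as the client hunks
    }
    printf("%s\n", warnings.back().c_str());
    printf("clamped peak_flops = %g\n", cc.peak_flops);   // 1e+11
    return 0;
}

Routing the message through warnings (client) or log_messages (scheduler) keeps the bogus driver value visible in the logs rather than silently replacing it.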
@@ -181,6 +181,7 @@ bool PLAN_CLASS_SPEC::opencl_check(OPENCL_DEVICE_PROP& opencl_prop) {
 bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu, const WORKUNIT* wu) {
     COPROC* cpp = NULL;
     bool can_use_multicore = true;
+    string msg;
 
     if (infeasible_random && drand()<infeasible_random) {
         return false;
@@ -594,7 +595,9 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu, const WORKU
             return false;
         }
 
         cp.set_peak_flops();
+        if (cp.bad_gpu_peak_flops("AMD", msg)) {
+            log_messages.printf(MSG_NORMAL, "%s\n", msg.c_str());
+        }
         gpu_ram = cp.opencl_prop.global_mem_size;
 
         driver_version = 0;
@@ -686,7 +689,9 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu, const WORKU
             }
         }
         gpu_ram = cp.prop.totalGlobalMem;
         cp.set_peak_flops();
+        if (cp.bad_gpu_peak_flops("NVIDIA", msg)) {
+            log_messages.printf(MSG_NORMAL, "%s\n", msg.c_str());
+        }
 
     // Intel GPU
     //
@@ -705,6 +710,9 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu, const WORKU
         if (min_gpu_ram_mb) {
             gpu_requirements[PROC_TYPE_INTEL_GPU].update(0, min_gpu_ram_mb * MEGA);
         }
+        if (cp.bad_gpu_peak_flops("Intel GPU", msg)) {
+            log_messages.printf(MSG_NORMAL, "%s\n", msg.c_str());
+        }
 
     // custom GPU type
     //
@@ -723,6 +731,9 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu, const WORKU
                 "[version] plan_class_spec: Custom coproc %s found\n", gpu_type
             );
         }
+        if (cpp->bad_gpu_peak_flops("Custom GPU", msg)) {
+            log_messages.printf(MSG_NORMAL, "%s\n", msg.c_str());
+        }
     }
 
     if (opencl) {