- client: change in the use of GPU available RAM:

    - measure the available RAM of each GPU when BOINC starts up.
      If this fails, set available = physical.
      Show available RAM in startup messages.
    - use available RAM rather than physical RAM in selecting the "best" GPU instance
    - report available RAM to the scheduler

TODO: change the scheduler to use available rather than physical if it's reported

svn path=/trunk/boinc/; revision=24210
2011-09-14 22:45:26 +00:00 · 2011-09-14 22:45:26 +00:00 · 7411dd60aa
parent 9e1cafbf4d
commit 7411dd60aa
8 changed files with 86 additions and 128 deletions
--- a/20
+++ b/20
@ -6032,3 +6032,23 @@ Rom    14 Sept 2011
        
    client/
        hostinfo_win.cpp
+
+David  14 Sept 2011
+    - client: change in the use of GPU available RAM:
+        - measure the available RAM of each GPU when BOINC starts up.
+            If this fails, set available = physical.
+            Show available RAM in startup messages.
+        - use available RAM rather than physical RAM in selecting
+            the "best" GPU instance
+        - report available RAM to the scheduler
+    TODO: change the scheduler to use available rather than physical
+        if it's reported
+
+    sched/
+        sample_trivial_validator.cpp
+    lib/
+        coproc.cpp,h
+    client/
+        client_state.cpp
+        coproc_detect.cpp
+        cpu_sched.cpp
--- a/client/client_state.cpp
+++ b/client/client_state.cpp
@ -369,15 +369,11 @@ int CLIENT_STATE::init() {
        }
 #if 0
        msg_printf(NULL, MSG_INFO, "Faking an NVIDIA GPU");
-        coprocs.nvidia.fake(18000, 256*MEGA, 2);
-        coprocs.nvidia.available_ram_fake[0] = 256*MEGA;
-        coprocs.nvidia.available_ram_fake[1] = 192*MEGA;
+        coprocs.nvidia.fake(18000, 256*MEGA, 192*MEGA, 2);
 #endif
 #if 0
        msg_printf(NULL, MSG_INFO, "Faking an ATI GPU");
-        coprocs.ati.fake(512*MEGA, 2);
-        coprocs.ati.available_ram_fake[0] = 256*MEGA;
-        coprocs.ati.available_ram_fake[1] = 192*MEGA;
+        coprocs.ati.fake(512*MEGA, 256*MEGA, 2);
 #endif
    }

@ -386,7 +382,6 @@ int CLIENT_STATE::init() {
            msg_printf(NULL, MSG_INFO, "NVIDIA GPU info taken from cc_config.xml");
        } else {
            coprocs.add(coprocs.nvidia);
-            coprocs.nvidia.print_available_ram();
        }
    }
    if (coprocs.have_ati()) {
@ -394,7 +389,6 @@ int CLIENT_STATE::init() {
            msg_printf(NULL, MSG_INFO, "ATI GPU info taken from cc_config.xml");
        } else {
            coprocs.add(coprocs.ati);
-            coprocs.ati.print_available_ram();
        }
    }
    host_info._coprocs = coprocs;
--- a/client/coproc_detect.cpp
+++ b/client/coproc_detect.cpp
@ -101,24 +101,6 @@ cl_int (*__clGetDeviceInfo)(cl_device_id    /* device */,

 #endif

-void COPROC::print_available_ram() {
-    for (int i=0; i<count; i++) {
-        if (available_ram_unknown[i]) {
-            msg_printf(0, MSG_INFO,
-                "[coproc] %s device %d: available RAM unknown",
-                type, device_nums[i]
-            );
-        } else {
-            msg_printf(0, MSG_INFO,
-                "[coproc] %s device %d: available RAM %d MB",
-                type, device_nums[i],
-                (int)(available_ram[i]/MEGA)
-            );
-        }
-    }
-}
-
-
 //TODO: Determine how we want to compare OpenCL devices - this is only a placeholder
 // return 1/-1/0 if device 1 is more/less/same capable than device 2.
 // factors (decreasing priority):
@ -614,7 +596,7 @@ void COPROCS::get(
 // factors (decreasing priority):
 // - compute capability
 // - software version
-// - memory
+// - available memory
 // - speed
 //
 // If "loose", ignore FLOPS and tolerate small memory diff
@ -627,12 +609,12 @@ int nvidia_compare(COPROC_NVIDIA& c1, COPROC_NVIDIA& c2, bool loose) {
    if (c1.cuda_version > c2.cuda_version) return 1;
    if (c1.cuda_version < c2.cuda_version) return -1;
    if (loose) {
-        if (c1.prop.totalGlobalMem > 1.4*c2.prop.totalGlobalMem) return 1;
-        if (c1.prop.totalGlobalMem < .7* c2.prop.totalGlobalMem) return -1;
+        if (c1.available_ram> 1.4*c2.available_ram) return 1;
+        if (c1.available_ram < .7* c2.available_ram) return -1;
        return 0;
    }
-    if (c1.prop.totalGlobalMem > c2.prop.totalGlobalMem) return 1;
-    if (c1.prop.totalGlobalMem < c2.prop.totalGlobalMem) return -1;
+    if (c1.available_ram > c2.available_ram) return 1;
+    if (c1.available_ram < c2.available_ram) return -1;
    double s1 = c1.peak_flops;
    double s2 = c2.peak_flops;
    if (s1 > s2) return 1;
@ -944,12 +926,15 @@ void COPROC_NVIDIA::get(

 // fake a NVIDIA GPU (for debugging)
 //
-void COPROC_NVIDIA::fake(int driver_version, double ram, int n) {
+void COPROC_NVIDIA::fake(
+    int driver_version, double ram, double avail_ram, int n
+) {
   strcpy(type, GPU_TYPE_NVIDIA);
   count = n;
   for (int i=0; i<count; i++) {
       device_nums[i] = i;
   }
+   available_ram = avail_ram;
   display_driver_version = driver_version;
   cuda_version = 2020;
   strcpy(prop.name, "Fake NVIDIA GPU");
@ -974,8 +959,7 @@ void COPROC_NVIDIA::fake(int driver_version, double ram, int n) {
   set_peak_flops();
 }

-// See how much RAM is available on each GPU.
-// If this fails, set "available_ram_unknown"
+// See how much RAM is available on this GPU.
 //
 void COPROC_NVIDIA::get_available_ram() {
    int retval;
@ -983,8 +967,7 @@ void COPROC_NVIDIA::get_available_ram() {
 	int device;
    void* ctx;
    
-    available_ram[0] = 0;
-    available_ram_unknown[0] = true;
+    available_ram = prop.dtotalGlobalMem;
    retval = (*__cuDeviceGet)(&device, device_num);
    if (retval) {
        if (log_flags.coproc_debug) {
@ -1014,8 +997,7 @@ void COPROC_NVIDIA::get_available_ram() {
        return;
    }
    (*__cuCtxDestroy)(ctx);
-    available_ram[0] = (double) memfree;
-    available_ram_unknown[0] = false;
+    available_ram = (double) memfree;
 }

 // check whether each GPU is running a graphics app (assume yes)
@ -1429,11 +1411,12 @@ void COPROC_ATI::get(
    }
 }

-void COPROC_ATI::fake(double ram, int n) {
+void COPROC_ATI::fake(double ram, double avail_ram, int n) {
    strcpy(type, GPU_TYPE_ATI);
    strcpy(version, "1.4.3");
    strcpy(name, "foobar");
    count = n;
+    available_ram = avail_ram;
    memset(&attribs, 0, sizeof(attribs));
    memset(&info, 0, sizeof(info));
    attribs.localRAM = (int)(ram/MEGA);
@ -1446,52 +1429,39 @@ void COPROC_ATI::fake(double ram, int n) {
    set_peak_flops();
 }

-// get available RAM of ATI GPUs
-// NOTE: last time we checked, repeated calls to this crash the driver
+// get available RAM of ATI GPU
 //
 void COPROC_ATI::get_available_ram() {
    CALdevicestatus st;
    CALdevice dev;
-    int i, retval;
+    int retval;
+
+    available_ram = attribs.localRAM*MEGA;

    st.struct_size = sizeof(CALdevicestatus);

-    // avoid crash if faked GPU
-    if (!__calInit) {
-        for (i=0; i<count; i++) {
-            available_ram[i] = available_ram_fake[i];
-            available_ram_unknown[i] = false;
+    retval = (*__calDeviceOpen)(&dev, device_num);
+    if (retval) {
+        if (log_flags.coproc_debug) {
+            msg_printf(0, MSG_INFO,
+                "[coproc] calDeviceOpen(%d) returned %d", device_num, retval
+            );
        }
        return;
    }
-    for (i=0; i<count; i++) {
-        available_ram[i] = 0;
-        available_ram_unknown[i] = true;
-        int devnum = device_nums[i];
-        retval = (*__calDeviceOpen)(&dev, devnum);
-        if (retval) {
-            if (log_flags.coproc_debug) {
-                msg_printf(0, MSG_INFO,
-                    "[coproc] calDeviceOpen(%d) returned %d", devnum, retval
-                );
-            }
-            continue;
+    retval = (*__calDeviceGetStatus)(&st, dev);
+    if (retval) {
+        if (log_flags.coproc_debug) {
+            msg_printf(0, MSG_INFO,
+                "[coproc] calDeviceGetStatus(%d) returned %d",
+                device_num, retval
+            );
        }
-        retval = (*__calDeviceGetStatus)(&st, dev);
-        if (retval) {
-            if (log_flags.coproc_debug) {
-                msg_printf(0, MSG_INFO,
-                    "[coproc] calDeviceGetStatus(%d) returned %d",
-                    devnum, retval
-                );
-            }
-            (*__calDeviceClose)(dev);
-            continue;
-        }
-        available_ram[i] = st.availLocalRAM*MEGA;
-        available_ram_unknown[i] = false;
        (*__calDeviceClose)(dev);
+        return;
    }
+    available_ram = st.availLocalRAM*MEGA;
+    (*__calDeviceClose)(dev);
 }

 bool COPROC_ATI::matches(OPENCL_DEVICE_PROP& OpenCLprop) {
--- a/client/cpu_sched.cpp
+++ b/client/cpu_sched.cpp
@ -1156,7 +1156,7 @@ static inline void confirm_current_assignment(
                cp->type, j, rp->name
            );
        }
-        cp->available_ram[j] -= rp->avp->gpu_ram;
+        cp->available_ram_temp[j] -= rp->avp->gpu_ram;
    }
 }

@ -1169,22 +1169,19 @@ static inline bool get_fractional_assignment(
    // try to assign an instance that's already fractionally assigned
    //
    for (i=0; i<cp->count; i++) {
-        if (cp->available_ram_unknown[i]) {
-            continue;
-        }
        if (excluded(rp, cp, i)) {
            continue;
        }
        if ((cp->usage[i] || cp->pending_usage[i])
            && (cp->usage[i] + cp->pending_usage[i] + usage <= 1)
        ) {
-            if (rp->avp->gpu_ram > cp->available_ram[i]) {
+            if (rp->avp->gpu_ram > cp->available_ram_temp[i]) {
                defer_sched = true;
                continue;
            }
            rp->coproc_indices[0] = i;
            cp->usage[i] += usage;
-            cp->available_ram[i] -= rp->avp->gpu_ram;
+            cp->available_ram_temp[i] -= rp->avp->gpu_ram;
            if (log_flags.coproc_debug) {
                msg_printf(rp->project, MSG_INFO,
                    "[coproc] Assigning %f of %s instance %d to %s",
@ -1198,20 +1195,17 @@ static inline bool get_fractional_assignment(
    // failing that, assign an unreserved instance
    //
    for (i=0; i<cp->count; i++) {
-        if (cp->available_ram_unknown[i]) {
-            continue;
-        }
        if (excluded(rp, cp, i)) {
            continue;
        }
        if (!cp->usage[i]) {
-            if (rp->avp->gpu_ram > cp->available_ram[i]) {
+            if (rp->avp->gpu_ram > cp->available_ram_temp[i]) {
                defer_sched = true;
                continue;
            }
            rp->coproc_indices[0] = i;
            cp->usage[i] += usage;
-            cp->available_ram[i] -= rp->avp->gpu_ram;
+            cp->available_ram_temp[i] -= rp->avp->gpu_ram;
            if (log_flags.coproc_debug) {
                msg_printf(rp->project, MSG_INFO,
                    "[coproc] Assigning %f of %s free instance %d to %s",
@ -1241,14 +1235,11 @@ static inline bool get_integer_assignment(
    //
    int nfree = 0;
    for (i=0; i<cp->count; i++) {
-        if (cp->available_ram_unknown[i]) {
-            continue;
-        }
        if (excluded(rp, cp, i)) {
            continue;
        }
        if (!cp->usage[i]) {
-            if (rp->avp->gpu_ram > cp->available_ram[i]) {
+            if (rp->avp->gpu_ram > cp->available_ram_temp[i]) {
                defer_sched = true;
                continue;
            };
@ -1275,18 +1266,15 @@ static inline bool get_integer_assignment(
    // assign non-pending instances first

    for (i=0; i<cp->count; i++) {
-        if (cp->available_ram_unknown[i]) {
-            continue;
-        }
        if (excluded(rp, cp, i)) {
            continue;
        }
        if (!cp->usage[i]
            && !cp->pending_usage[i]
-            && (rp->avp->gpu_ram <= cp->available_ram[i])
+            && (rp->avp->gpu_ram <= cp->available_ram_temp[i])
        ) {
            cp->usage[i] = 1;
-            cp->available_ram[i] -= rp->avp->gpu_ram;
+            cp->available_ram_temp[i] -= rp->avp->gpu_ram;
            rp->coproc_indices[n++] = i;
            if (log_flags.coproc_debug) {
                msg_printf(rp->project, MSG_INFO,
@ -1301,17 +1289,14 @@ static inline bool get_integer_assignment(
    // if needed, assign pending instances

    for (i=0; i<cp->count; i++) {
-        if (cp->available_ram_unknown[i]) {
-            continue;
-        }
        if (excluded(rp, cp, i)) {
            continue;
        }
        if (!cp->usage[i]
-            && (rp->avp->gpu_ram <= cp->available_ram[i])
+            && (rp->avp->gpu_ram <= cp->available_ram_temp[i])
        ) {
            cp->usage[i] = 1;
-            cp->available_ram[i] -= rp->avp->gpu_ram;
+            cp->available_ram_temp[i] -= rp->avp->gpu_ram;
            rp->coproc_indices[n++] = i;
            if (log_flags.coproc_debug) {
                msg_printf(rp->project, MSG_INFO,
@ -1344,8 +1329,7 @@ static void copy_available_ram(COPROC& cp, const char* name) {
    int rt = rsc_index(name);
    if (rt > 0) {
        for (int i=0; i<MAX_COPROC_INSTANCES; i++) {
-            coprocs.coprocs[rt].available_ram[i] = cp.available_ram[i];
-            coprocs.coprocs[rt].available_ram_unknown[i] = cp.available_ram_unknown[i];
+            coprocs.coprocs[rt].available_ram_temp[i] = cp.available_ram;
        }
    }
 }
--- a/doc/index.php
+++ b/doc/index.php
@ -49,9 +49,9 @@ function show_totals() {
    $users = number_format($users);
    $hosts = number_format($hosts);

-    $teraflops = number_format($credit_day/200000, 2);
+    $petaflops = number_format($credit_day/200000000, 3);
    echo tra("Active:")." $users ".tra("volunteers,")." $hosts ".tra("computers.
-")."        <br>".tra("24-hour average:")." $teraflops ".tra("TeraFLOPS.")."
+")."        <br>".tra("24-hour average:")." $petaflops ".tra("PetaFLOPS.")."
        <hr size=1 width=\"80%\">
    ";
 }
--- a/lib/coproc.cpp
+++ b/lib/coproc.cpp
@ -300,9 +300,9 @@ void COPROC_NVIDIA::description(char* buf) {
    } else {
        strcpy(vers, "unknown");
    }
-    sprintf(buf, "%s (driver version %s, CUDA version %d, compute capability %d.%d, %.0fMB, %.0f GFLOPS peak)",
+    sprintf(buf, "%s (driver version %s, CUDA version %d, compute capability %d.%d, %.0fMB, %.0fMB available, %.0f GFLOPS peak)",
        prop.name, vers, cuda_version, prop.major, prop.minor,
-        prop.totalGlobalMem/(1024.*1024.), peak_flops/1e9
+        prop.totalGlobalMem/MEGA, available_ram/MEGA, peak_flops/1e9
    );
 }

@ -663,7 +663,7 @@ int COPROC_ATI::parse(XML_PARSER& xp) {
 }

 void COPROC_ATI::description(char* buf) {
-    sprintf(buf, "%s (CAL version %s, %.0fMB, %.0f GFLOPS peak)",
-        name, version, attribs.localRAM/1024.*1024., peak_flops/1.e9
+    sprintf(buf, "%s (CAL version %s, %.0fMB, %.0fMB available, %.0f GFLOPS peak)",
+        name, version, attribs.localRAM/MEGA, available_ram/MEGA, peak_flops/1.e9
    );
 }
--- a/lib/coproc.h
+++ b/lib/coproc.h
@ -95,8 +95,9 @@ struct COPROC_REQ {
    int parse(XML_PARSER&);
 };

-// For now, there will be some duplication between the values present in 
-// the OPENCL_DEVICE_PROP struct and the NVIDA and / or ATI structs
+// For now, there will be some duplication between the values in 
+// the OPENCL_DEVICE_PROP struct and the NVIDIA/ATI structs
+//
 struct OPENCL_DEVICE_PROP {
    cl_device_id device_id;
    char name[256];                     // Device name
@ -159,10 +160,9 @@ struct COPROC {
    int opencl_device_count;
    bool running_graphics_app[MAX_COPROC_INSTANCES];
        // is this GPU running a graphics app (NVIDIA only)
-    double available_ram[MAX_COPROC_INSTANCES];
-    bool available_ram_unknown[MAX_COPROC_INSTANCES];
-        // couldn't get available RAM; don't start new apps on this instance
-    double available_ram_fake[MAX_COPROC_INSTANCES];
+    double available_ram;
+    double available_ram_temp[MAX_COPROC_INSTANCES];
+        // used during job scheduling

    double last_print_time;
    
@ -189,13 +189,11 @@ struct COPROC {
        req_instances = 0;
        opencl_device_count = 0;
        estimated_delay = 0;
+        available_ram = 0;
        for (int i=0; i<MAX_COPROC_INSTANCES; i++) {
            device_nums[i] = 0;
            opencl_device_ids[i] = 0;
            running_graphics_app[i] = true;
-            available_ram[i] = 0;
-            available_ram_fake[i] = 0;
-            available_ram_unknown[i] = true;
        }
        memset(&opencl_prop, 0, sizeof(opencl_prop));
    }
@ -212,7 +210,6 @@ struct COPROC {
    COPROC() {
        clear();
    }
-    void print_available_ram();
 };

 // based on cudaDeviceProp from /usr/local/cuda/include/driver_types.h
@ -285,7 +282,7 @@ struct COPROC_NVIDIA : public COPROC {

    bool check_running_graphics_app();
    bool matches(OPENCL_DEVICE_PROP& OpenCLprop);
-    void fake(int driver_version, double ram, int count);
+    void fake(int driver_version, double ram, double avail_ram, int count);

 };

@ -318,7 +315,7 @@ struct COPROC_ATI : public COPROC {
        // clock is in MHz
        peak_flops = (x>0)?x:5e10;
 	}
-    void fake(double, int);
+    void fake(double ram, double avail_ram, int);
 };

 struct COPROCS {
--- a/sched/sample_trivial_validator.cpp
+++ b/sched/sample_trivial_validator.cpp
@ -15,8 +15,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

-// A sample validator that grants credit to any result whose CPU time is above
-// a certain minimum
+// A sample validator that accepts all results

 #include <cstdlib>
 #include "config.h"
@ -24,18 +23,12 @@

 using std::vector;

-static const double MIN_CPU_TIME = 0;
-
-int init_result(RESULT& /*result*/, void*& /*data*/) {
+int init_result(RESULT&, void*&) {
    return 0;
 }

-int compare_results(
-    RESULT & r1, void* /*data1*/,
-    RESULT const& r2, void* /*data2*/,
-    bool& match
-) {
-    match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME);
+int compare_results(RESULT&, void*, RESULT const&, void*, bool& match) {
+    match = true;
    return 0;
 }