client: tweaks to code for detecting GPUs via a child process.

Added safety features requested by Rom Walton:
* Change COPROC_ATI::get_available_ram and COPROC_NVIDIA::get_available_ram from member functions to file-local static functions, to prevent calling them without first loading the CAL or CUDA libraries.
* Add checks for NULL library function pointers in these routines.
* Add comments warning about the need to call these routines from a separate child process on dual-GPU laptops, the need for proper library initialization, etc.
This commit is contained in:
Charlie Fenton 2013-06-27 02:36:20 -07:00
parent 31e3bf2783
commit 631e236b08
3 changed files with 100 additions and 43 deletions

View File

@ -49,6 +49,8 @@ using std::string;
#include "client_msgs.h" #include "client_msgs.h"
#include "gpu_detect.h" #include "gpu_detect.h"
static void get_available_ati_ram(COPROC_ATI &cc, vector<string>& warnings);
// criteria: // criteria:
// //
// - double precision support // - double precision support
@ -196,18 +198,6 @@ void COPROC_ATI::get(
warnings.push_back("calDeviceGetInfo() missing from CAL library"); warnings.push_back("calDeviceGetInfo() missing from CAL library");
return; return;
} }
if (!__calDeviceGetStatus) {
warnings.push_back("calDeviceGetStatus() missing from CAL library");
return;
}
if (!__calDeviceOpen) {
warnings.push_back("calDeviceOpen() missing from CAL library");
return;
}
if (!__calDeviceClose) {
warnings.push_back("calDeviceClose() missing from CAL library");
return;
}
retval = (*__calInit)(); retval = (*__calInit)();
if (retval != CAL_RESULT_OK) { if (retval != CAL_RESULT_OK) {
@ -375,7 +365,7 @@ void COPROC_ATI::get(
cc.atirt_detected = atirt_detected; cc.atirt_detected = atirt_detected;
cc.device_num = i; cc.device_num = i;
cc.set_peak_flops(); cc.set_peak_flops();
cc.get_available_ram(warnings); get_available_ati_ram(cc, warnings);
ati_gpus.push_back(cc); ati_gpus.push_back(cc);
} }
@ -431,20 +421,48 @@ void COPROC_ATI::correlate(
// get available RAM of ATI GPU // get available RAM of ATI GPU
// //
void COPROC_ATI::get_available_ram(vector<string>& warnings) { // CAUTION: as currently written, this method should be
// called only from COPROC_ATI::get(). If in the future
// you wish to call it from additional places:
// * It must be called from a separate child process on
// dual-GPU laptops (e.g., Macbook Pros) with the results
// communicated to the main client process via IPC or a
// temp file. See the comments about dual-GPU laptops
// in gpu_detect.cpp and main.cpp for more details.
// * The CAL library must be loaded and calInit() called
// first.
// * See client/coproc_detect.cpp and cpu_sched.cpp in
// BOINC 6.12.36 for an earlier attempt to call this
// from the scheduler. Note that it was abandoned
// due to repeated calls crashing the driver.
//
static void get_available_ati_ram(COPROC_ATI &cc, vector<string>& warnings) {
CALdevicestatus st; CALdevicestatus st;
CALdevice dev; CALdevice dev;
char buf[256]; char buf[256];
int retval; int retval;
available_ram = attribs.localRAM*MEGA; cc.available_ram = cc.attribs.localRAM*MEGA;
st.struct_size = sizeof(CALdevicestatus); st.struct_size = sizeof(CALdevicestatus);
retval = (*__calDeviceOpen)(&dev, device_num); if (!__calDeviceOpen) {
warnings.push_back("calDeviceOpen() missing from CAL library");
return;
}
if (!__calDeviceGetStatus) {
warnings.push_back("calDeviceGetStatus() missing from CAL library");
return;
}
if (!__calDeviceClose) {
warnings.push_back("calDeviceClose() missing from CAL library");
return;
}
retval = (*__calDeviceOpen)(&dev, cc.device_num);
if (retval) { if (retval) {
snprintf(buf, sizeof(buf), snprintf(buf, sizeof(buf),
"[coproc] calDeviceOpen(%d) returned %d", device_num, retval "[coproc] calDeviceOpen(%d) returned %d", cc.device_num, retval
); );
warnings.push_back(buf); warnings.push_back(buf);
return; return;
@ -453,12 +471,12 @@ void COPROC_ATI::get_available_ram(vector<string>& warnings) {
if (retval) { if (retval) {
snprintf(buf, sizeof(buf), snprintf(buf, sizeof(buf),
"[coproc] calDeviceGetStatus(%d) returned %d", "[coproc] calDeviceGetStatus(%d) returned %d",
device_num, retval cc.device_num, retval
); );
warnings.push_back(buf); warnings.push_back(buf);
(*__calDeviceClose)(dev); (*__calDeviceClose)(dev);
return; return;
} }
available_ram = st.availLocalRAM*MEGA; cc.available_ram = st.availLocalRAM*MEGA;
(*__calDeviceClose)(dev); (*__calDeviceClose)(dev);
} }

View File

@ -45,6 +45,8 @@ using std::string;
#include "client_msgs.h" #include "client_msgs.h"
#include "gpu_detect.h" #include "gpu_detect.h"
static void get_available_nvidia_ram(COPROC_NVIDIA &cc, vector<string>& warnings);
// return 1/-1/0 if device 1 is more/less/same capable than device 2. // return 1/-1/0 if device 1 is more/less/same capable than device 2.
// factors (decreasing priority): // factors (decreasing priority):
// - compute capability // - compute capability
@ -237,14 +239,6 @@ void COPROC_NVIDIA::get(
warnings.push_back("cuDeviceComputeCapability() missing from NVIDIA library"); warnings.push_back("cuDeviceComputeCapability() missing from NVIDIA library");
return; return;
} }
if (!__cuCtxCreate) {
warnings.push_back("cuCtxCreate() missing from NVIDIA library");
return;
}
if (!__cuCtxDestroy) {
warnings.push_back("cuCtxDestroy() missing from NVIDIA library");
return;
}
if (!__cuMemAlloc) { if (!__cuMemAlloc) {
warnings.push_back("cuMemAlloc() missing from NVIDIA library"); warnings.push_back("cuMemAlloc() missing from NVIDIA library");
return; return;
@ -253,10 +247,6 @@ void COPROC_NVIDIA::get(
warnings.push_back("cuMemFree() missing from NVIDIA library"); warnings.push_back("cuMemFree() missing from NVIDIA library");
return; return;
} }
if (!__cuMemGetInfo) {
warnings.push_back("cuMemGetInfo() missing from NVIDIA library");
return;
}
retval = (*__cuInit)(0); retval = (*__cuInit)(0);
if (retval) { if (retval) {
@ -340,7 +330,7 @@ void COPROC_NVIDIA::get(
cc.cuda_version = cuda_version; cc.cuda_version = cuda_version;
cc.device_num = j; cc.device_num = j;
cc.set_peak_flops(); cc.set_peak_flops();
cc.get_available_ram(warnings); get_available_nvidia_ram(cc, warnings);
nvidia_gpus.push_back(cc); nvidia_gpus.push_back(cc);
} }
if (!nvidia_gpus.size()) { if (!nvidia_gpus.size()) {
@ -390,18 +380,50 @@ void COPROC_NVIDIA::correlate(
// See how much RAM is available on this GPU. // See how much RAM is available on this GPU.
// //
void COPROC_NVIDIA::get_available_ram(vector<string>& warnings) { // CAUTION: as currently written, this method should be
// called only from COPROC_NVIDIA::get(). If in the
// future you wish to call it from additional places:
// * It must be called from a separate child process on
// dual-GPU laptops (e.g., Macbook Pros) with the results
// communicated to the main client process via IPC or a
// temp file. See the comments about dual-GPU laptops
// in gpu_detect.cpp and main.cpp for more details.
// * The CUDA library must be loaded and cuInit() called
// first.
// * See client/coproc_detect.cpp and cpu_sched.cpp in
// BOINC 6.12.36 for an earlier attempt to call this
// from the scheduler. Note that it was abandoned
// due to repeated calls crashing the driver.
//
static void get_available_nvidia_ram(COPROC_NVIDIA &cc, vector<string>& warnings) {
int retval; int retval;
size_t memfree = 0, memtotal = 0; size_t memfree = 0, memtotal = 0;
int device; int device;
void* ctx; void* ctx;
char buf[256]; char buf[256];
available_ram = prop.totalGlobalMem; cc.available_ram = cc.prop.totalGlobalMem;
retval = (*__cuDeviceGet)(&device, device_num); if (!__cuDeviceGet) {
warnings.push_back("cuDeviceGet() missing from NVIDIA library");
return;
}
if (!__cuCtxCreate) {
warnings.push_back("cuCtxCreate() missing from NVIDIA library");
return;
}
if (!__cuCtxDestroy) {
warnings.push_back("cuCtxDestroy() missing from NVIDIA library");
return;
}
if (!__cuMemGetInfo) {
warnings.push_back("cuMemGetInfo() missing from NVIDIA library");
return;
}
retval = (*__cuDeviceGet)(&device, cc.device_num);
if (retval) { if (retval) {
snprintf(buf, sizeof(buf), snprintf(buf, sizeof(buf),
"[coproc] cuDeviceGet(%d) returned %d", device_num, retval "[coproc] cuDeviceGet(%d) returned %d", cc.device_num, retval
); );
warnings.push_back(buf); warnings.push_back(buf);
return; return;
@ -409,7 +431,7 @@ void COPROC_NVIDIA::get_available_ram(vector<string>& warnings) {
retval = (*__cuCtxCreate)(&ctx, 0, device); retval = (*__cuCtxCreate)(&ctx, 0, device);
if (retval) { if (retval) {
snprintf(buf, sizeof(buf), snprintf(buf, sizeof(buf),
"[coproc] cuCtxCreate(%d) returned %d", device_num, retval "[coproc] cuCtxCreate(%d) returned %d", cc.device_num, retval
); );
warnings.push_back(buf); warnings.push_back(buf);
return; return;
@ -417,22 +439,42 @@ void COPROC_NVIDIA::get_available_ram(vector<string>& warnings) {
retval = (*__cuMemGetInfo)(&memfree, &memtotal); retval = (*__cuMemGetInfo)(&memfree, &memtotal);
if (retval) { if (retval) {
snprintf(buf, sizeof(buf), snprintf(buf, sizeof(buf),
"[coproc] cuMemGetInfo(%d) returned %d", device_num, retval "[coproc] cuMemGetInfo(%d) returned %d", cc.device_num, retval
); );
warnings.push_back(buf); warnings.push_back(buf);
(*__cuCtxDestroy)(ctx); (*__cuCtxDestroy)(ctx);
return; return;
} }
(*__cuCtxDestroy)(ctx); (*__cuCtxDestroy)(ctx);
available_ram = (double) memfree; cc.available_ram = (double) memfree;
} }
// check whether each GPU is running a graphics app (assume yes) // check whether each GPU is running a graphics app (assume yes)
// return true if there's been a change since last time // return true if there's been a change since last time
// //
// CAUTION: this method is not currently used. If you wish
// to call it in the future:
// * It must be called from a separate child process on
// dual-GPU laptops (e.g., Macbook Pros) with the results
// communicated to the main client process via IPC or a
// temp file. See the comments about dual-GPU laptops
// in gpu_detect.cpp and main.cpp for more details.
// * The CUDA library must be loaded and cuInit() called
// first.
//
#if 0
bool COPROC_NVIDIA::check_running_graphics_app() { bool COPROC_NVIDIA::check_running_graphics_app() {
int retval, j; int retval, j;
bool change = false; bool change = false;
if (!__cuDeviceGet) {
warnings.push_back("cuDeviceGet() missing from NVIDIA library");
return;
}
if (!__cuDeviceGetAttribute) {
warnings.push_back("cuDeviceGetAttribute() missing from NVIDIA library");
return;
}
for (j=0; j<count; j++) { for (j=0; j<count; j++) {
bool new_val = true; bool new_val = true;
int device, kernel_timeout; int device, kernel_timeout;
@ -450,4 +492,4 @@ bool COPROC_NVIDIA::check_running_graphics_app() {
} }
return change; return change;
} }
#endif

View File

@ -332,9 +332,7 @@ struct COPROC_NVIDIA : public COPROC {
void description(char* buf, int buflen); void description(char* buf, int buflen);
void clear(); void clear();
int parse(XML_PARSER&); int parse(XML_PARSER&);
void get_available_ram(std::vector<std::string>& warnings);
void set_peak_flops(); void set_peak_flops();
bool check_running_graphics_app();
void fake(int driver_version, double ram, double avail_ram, int count); void fake(int driver_version, double ram, double avail_ram, int count);
}; };
@ -371,7 +369,6 @@ struct COPROC_ATI : public COPROC {
void description(char* buf, int buflen); void description(char* buf, int buflen);
void clear(); void clear();
int parse(XML_PARSER&); int parse(XML_PARSER&);
void get_available_ram(std::vector<std::string>& warnings);
void set_peak_flops(); void set_peak_flops();
void fake(double ram, double avail_ram, int); void fake(double ram, double avail_ram, int);
}; };