From 328d87be2625ce68a04b0d1caf29e3826eab25f3 Mon Sep 17 00:00:00 2001 From: Charlie Fenton Date: Mon, 22 Sep 2014 04:57:23 -0700 Subject: [PATCH] API: Fix a bug in boinc_get_opencl_ids() which incorrectly returned CL_INVALID_DEVICE on some systems. This would happen on hosts with multiple GPUs if: * the host has GPUs from two different vendors with different OpenCL platforms (e.g., NVIDIA and AMD), and * the second platform queried has more GPUs than the first one, and * the requested GPU is a higher numbed one in the second platform. --- api/boinc_opencl.cpp | 125 +++++++++++++++++++++++++++++++------------ lib/app_ipc.cpp | 7 ++- lib/opencl_boinc.h | 2 +- 3 files changed, 97 insertions(+), 37 deletions(-) diff --git a/api/boinc_opencl.cpp b/api/boinc_opencl.cpp index 7b0fa9a7de..908bdf8b4c 100644 --- a/api/boinc_opencl.cpp +++ b/api/boinc_opencl.cpp @@ -41,6 +41,8 @@ #include "boinc_opencl.h" +static int compareBOINCVersionTo(int toMajor, int toMinor, int toRelease); + // A few complicating factors: // Windows & Linux have a separate OpenCL platform for each vendor // (NVIDIA, AMD, Intel). @@ -95,7 +97,7 @@ int boinc_get_opencl_ids_aux( int device_num_for_type = -1; int device_index; - if ((!type) || (!strlen(type))) return CL_DEVICE_NOT_FOUND; + if ((!type) || (!strlen(type))) return CL_INVALID_DEVICE_TYPE; retval = clGetPlatformIDs(MAX_OPENCL_PLATFORMS, platforms, &num_platforms); if (num_platforms == 0) return CL_DEVICE_NOT_FOUND; @@ -110,25 +112,23 @@ int boinc_get_opencl_ids_aux( // Use gpu_opencl_dev_index if available if (opencl_device_index >= 0) { - if (opencl_device_index >= (int)num_devices) { - fprintf(stderr, "Invalid OpenCL GPU index: %d \n", opencl_device_index); - return CL_INVALID_DEVICE; - } + if (opencl_device_index < (int)num_devices) { + device_id = devices[opencl_device_index]; + retval = get_vendor(device_id, vendor, sizeof(vendor)); + if (retval != CL_SUCCESS) continue; - device_id = devices[opencl_device_index]; - retval = get_vendor(device_id, vendor, sizeof(vendor)); - if (retval != CL_SUCCESS) continue; - - if (!strcmp(vendor, type)) { - *device = device_id; - *platform = platforms[platform_index]; - return 0; + if (!strcmp(vendor, type)) { + *device = device_id; + *platform = platforms[platform_index]; + return 0; + } } - + continue; } - // Older versions of init_data.xml don't have gpu_opencl_dev_index field + // Older versions of init_data.xml don't have the gpu_opencl_dev_index + // field so use the value of gpu_device_num. // NOTE: This may return the wrong device on older versions of BOINC if // OpenCL does not recognize all GPU models detected by CUDA or CAL for (device_index=0; device_index<(int)num_devices; ++device_index) { @@ -147,6 +147,9 @@ int boinc_get_opencl_ids_aux( } } + fprintf(stderr, "GPU not found: type=%s, opencl_device_index=%d, device_num=%d\n", + type, opencl_device_index, device_num); + return CL_DEVICE_NOT_FOUND; } @@ -157,14 +160,15 @@ int boinc_get_opencl_ids_aux( // type: may be PROC_TYPE_NVIDIA_GPU, PROC_TYPE_AMD_GPU or PROC_TYPE_INTEL_GPU // (it may also be 0, but then it will fail on older clients.) // -// The argc, argv and type arguments are ignored for 7.0.12 or later clients. +// The argc, argv and type arguments are ignored for 6.13.3 or later clients. // // returns // - 0 if success // - ERR_FOPEN if init_data.xml missing // - ERR_XML_PARSE if can't parse init_data.xml // - CL_INVALID_DEVICE_TYPE if unable to get gpu_type information -// - ERR_NOT_FOUND if unable to get opencl_device_index or gpu device_num +// - CL_INVALID_DEVICE if unable to get opencl_device_index or gpu device_num +// - CL_DEVICE_NOT_FOUND if the requested device was not found // - an OpenCL error number if OpenCL error // int boinc_get_opencl_ids( @@ -203,24 +207,39 @@ int boinc_get_opencl_ids( } if (aid.gpu_opencl_dev_index < 0) { - // Older versions of init_data.xml don't have gpu_opencl_dev_index field - // + if (compareBOINCVersionTo(7,0,12) >= 0) { + // gpu_opencl_dev_index was added in BOINC version 7.0.12. + // A gpu_opencl_dev_index value of -1 in version 7.0.12 or later + // means BOINC client did not assign an OpenCL GPU to this task. + fprintf(stderr, "Illegal value for gpu_opencl_dev_index: %d in BOINC Client %d.%d.%d\n", + aid.gpu_opencl_dev_index, aid.major_version, aid.minor_version, aid.release); + return CL_INVALID_DEVICE; + } + + // Older versions of init_data.xml don't have the gpu_opencl_dev_index + // field so use the value of gpu_device_num if available. gpu_device_num = aid.gpu_device_num; if (gpu_device_num < 0) { - // Even older versions of init_data.xml don't have gpu_device_num field - for (i=0; i or fields +// - CL_INVALID_DEVICE_TYPE if unable to get gpu_type information +// - CL_INVALID_DEVICE if unable to get opencl_device_index or gpu device_num +// - CL_DEVICE_NOT_FOUND if the requested device was not found // - an OpenCL error number if OpenCL error // int boinc_get_opencl_ids(cl_device_id* device, cl_platform_id* platform) { @@ -249,12 +270,29 @@ int boinc_get_opencl_ids(cl_device_id* device, cl_platform_id* platform) { if (!strlen(aid.gpu_type)) { fprintf(stderr, "GPU type not found in %s\n", INIT_DATA_FILE); - return ERR_NOT_FOUND; + return CL_INVALID_DEVICE_TYPE; } - if ((aid.gpu_opencl_dev_index < 0) && (aid.gpu_device_num < 0)) { - fprintf(stderr, "GPU device # not found in %s\n", INIT_DATA_FILE); - return ERR_NOT_FOUND; + if (aid.gpu_opencl_dev_index < 0) { + if (compareBOINCVersionTo(7,0,12) >= 0) { + // gpu_opencl_dev_index was added in BOINC version 7.0.12. + // A gpu_opencl_dev_index value of -1 in version 7.0.12 or + // later means BOINC did not assign an OpenCL GPU to this task. + fprintf(stderr, "Illegal value for gpu_opencl_dev_index: %d in BOINC Client %d.%d.%d\n", + aid.gpu_opencl_dev_index, aid.major_version, aid.minor_version, aid.release); + return CL_INVALID_DEVICE; + } + + if (aid.gpu_device_num < 0) { + if (compareBOINCVersionTo(6,13,3) >= 0) { + // gpu_device_num and gpu_type fields were added in BOINC version 6.13.3. + // A gpu_device_num value of -1 in version 6.13.3 or later means + // BOINC did not assign a GPU to this task. + fprintf(stderr, "Illegal value for gpu_device_num: %d in BOINC Client %d.%d.%d\n", + aid.gpu_device_num, aid.major_version, aid.minor_version, aid.release); + return CL_INVALID_DEVICE; + } + } } retval = boinc_get_opencl_ids_aux( @@ -263,3 +301,20 @@ int boinc_get_opencl_ids(cl_device_id* device, cl_platform_id* platform) { return retval; } + +static int compareBOINCVersionTo(int toMajor, int toMinor, int toRelease) { + APP_INIT_DATA aid; + + boinc_get_init_data(aid); + + if (aid.major_version < toMajor) return -1; + if (aid.major_version > toMajor) return 1; + if (aid.minor_version < toMinor) return -1; + if (aid.minor_version > toMinor) return 1; + if (aid.release < toRelease) return -1; + if (aid.release > toRelease) return 1; + return 0; +} + + + diff --git a/lib/app_ipc.cpp b/lib/app_ipc.cpp index 262ab47e90..737e45c11c 100644 --- a/lib/app_ipc.cpp +++ b/lib/app_ipc.cpp @@ -281,9 +281,14 @@ void APP_INIT_DATA::clear() { fraction_done_start = 0; fraction_done_end = 0; checkpoint_period = 0; + // gpu_type is an empty string for client versions before 6.13.3 without this + // field or (on newer clients) if BOINC did not assign an OpenCL GPU to task. strcpy(gpu_type, ""); + // gpu_device_num < 0 for client versions before 6.13.3 without this field + // or (on newer clients) if BOINC did not assign an OpenCL GPU to task. gpu_device_num = -1; - // -1 means an older version without gpu_opencl_dev_index field + // gpu_opencl_dev_index < 0 for client versions before 7.0.12 without this + // field or (on newer clients) if BOINC did not assign any GPU to task. gpu_opencl_dev_index = -1; gpu_usage = 0; ncpus = 0; diff --git a/lib/opencl_boinc.h b/lib/opencl_boinc.h index 7e16aa6b12..30415523f7 100644 --- a/lib/opencl_boinc.h +++ b/lib/opencl_boinc.h @@ -63,7 +63,7 @@ struct OPENCL_DEVICE_PROP { double peak_flops; // temp used in scan process COPROC_USAGE is_used; // temp used in scan process double opencl_available_ram; // temp used in scan process - int opencl_device_index; // zero-based device number within this COPROC type + int opencl_device_index; // zero-based device number within this OpenCL platform void write_xml(MIOFILE&, const char* tag, bool temp_file=false); int parse(XML_PARSER&, const char* end_tag);