diff --git a/lib/coproc.cpp b/lib/coproc.cpp index 169098b8e0..bdb8a2763e 100644 --- a/lib/coproc.cpp +++ b/lib/coproc.cpp @@ -93,7 +93,7 @@ void COPROCS::summary_string(char* buf, int len) { COPROC_CUDA* cp2 = (COPROC_CUDA*) cp; int mem = (int)(cp2->prop.dtotalGlobalMem/MEGA); sprintf(buf2, "[CUDA|%s|%d|%dMB|%d]", - cp2->prop.name, cp2->count, mem, cp2->drvVersion + cp2->prop.name, cp2->count, mem, cp2->display_driver_version ); strcat(bigbuf, buf2); } @@ -149,8 +149,8 @@ int cuda_compare(COPROC_CUDA& c1, COPROC_CUDA& c2, bool loose) { if (c1.prop.major < c2.prop.major) return -1; if (c1.prop.minor > c2.prop.minor) return 1; if (c1.prop.minor < c2.prop.minor) return -1; - if (c1.drvVersion > c2.drvVersion) return 1; - if (c1.drvVersion < c2.drvVersion) return -1; + if (c1.cuda_version > c2.cuda_version) return 1; + if (c1.cuda_version < c2.cuda_version) return -1; if (loose) { if (c1.prop.totalGlobalMem > 1.4*c2.prop.totalGlobalMem) return 1; if (c1.prop.totalGlobalMem < .7* c2.prop.totalGlobalMem) return -1; @@ -169,17 +169,29 @@ void COPROC_CUDA::get( COPROCS& coprocs, vector& strings, bool use_all // if false, use only those equivalent to most capable ) { - int count; + int count, retval; #ifdef _WIN32 typedef int (__stdcall *PCGDC)(int *count); typedef int (__stdcall *PCGDP)(struct cudaDeviceProp *prop, int device); typedef int (__stdcall *PCGDV)(int* version); + typedef int (__stdcall *PCGDI)(int); + typedef int (__stdcall *PCGDG)(int*, int); + typedef int (__stdcall *PCGDA)(int*, int, int); + typedef int (__stdcall *PCGDN)(char*, int, int); + typedef int (__stdcall *PCGDM)(unsigned int*, int); + typedef int (__stdcall *PCGDCC)(int*, int*, int); PCGDC __cudaGetDeviceCount = NULL; PCGDP __cudaGetDeviceProperties = NULL; PCGDV __cudaGetDriverVersion = NULL; + PCGDI __cudaInit = NULL; + PCGDG __cuDeviceGet = NULL; + PCGDA __cuDeviceGetAttribute = NULL; + PCGDN __cuDeviceGetName = NULL; + PCGDM __cuDeviceTotalMem = NULL; + PCGDCC __cuDeviceComputeCapability = NULL; HMODULE cudalib = LoadLibrary("nvcuda.dll"); if (!cudalib) { @@ -204,7 +216,28 @@ void COPROC_CUDA::get( strings.push_back("Library doesn't have cuDriverGetVersion()"); return; } + __cudaInit = (PCGDI)GetProcAddress(cudalib, "cuInit" ); + __cuDeviceGet = (PCGDG)GetProcAddress(cudalib, "cuDeviceGet" ); + __cuDeviceGetAttribute = (PCGDA)GetProcAddress(cudalib, "cuDeviceGetAttribute" ); + __cuDeviceGetName = (PCGDN)GetProcAddress(cudalib, "cuDeviceGetName" ); + __cuDeviceTotalMem = (PCGDM)GetProcAddress(cudalib, "cuDeviceTotalMem" ); + __cuDeviceComputeCapability = (PCGDCC)GetProcAddress(cudalib, "cuDeviceComputeCapability" ); + #ifndef SIM + NvAPI_Status nvapiStatus; + NvDisplayHandle hDisplay; + NV_DISPLAY_DRIVER_VERSION Version; + memset(&Version, 0, sizeof(Version)); + Version.version = NV_DISPLAY_DRIVER_VERSION_VER; + + NvAPI_Initialize(); + for (int i=0; ; i++) { + nvapiStatus = NvAPI_EnumNvidiaDisplayHandle(i, &hDisplay); + if (nvapiStatus != NVAPI_OK) break; + nvapiStatus = NvAPI_GetDisplayDriverVersion(hDisplay, &Version); + if (nvapiStatus == NVAPI_OK) break; + } +#endif #else void* cudalib; void (*__cudaGetDeviceCount)(int*); @@ -237,20 +270,50 @@ void COPROC_CUDA::get( } #endif - int driver_version; - (*__cudaGetDriverVersion)(&driver_version); + + retval = (*__cudaInit)(0); + + int cuda_version; + retval = (*__cudaGetDriverVersion)(&cuda_version); vector gpus; - (*__cudaGetDeviceCount)(&count); + retval = (*__cudaGetDeviceCount)(&count); int j; unsigned int i; COPROC_CUDA cc; string s; for (j=0; j 100) continue; // e.g. 9999 is an error - cc.drvVersion = driver_version; +#if defined(_WIN32) && !defined(SIM) + cc.display_driver_version = Version.drvVersion; +#else + cc.display_driver_version = 0; +#endif + cc.cuda_version = cuda_version; cc.device_num = j; gpus.push_back(cc); } @@ -294,8 +357,8 @@ void COPROC_CUDA::get( } void COPROC_CUDA::description(char* buf) { - sprintf(buf, "%s (driver version %d, compute capability %d.%d, %.0fMB, est. %.0fGFLOPS)", - prop.name, drvVersion, prop.major, prop.minor, prop.totalGlobalMem/(1024.*1024.), flops_estimate()/1e9 + sprintf(buf, "%s (driver version %d, CUDA version %d, compute capability %d.%d, %.0fMB, est. %.0fGFLOPS)", + prop.name, display_driver_version, cuda_version, prop.major, prop.minor, prop.totalGlobalMem/(1024.*1024.), flops_estimate()/1e9 ); } @@ -337,6 +400,7 @@ void COPROC_CUDA::write_xml(MIOFILE& f) { " %d\n" " %f\n" " %d\n" + " %d\n" " %u\n" " %u\n" " %d\n" @@ -358,7 +422,8 @@ void COPROC_CUDA::write_xml(MIOFILE& f) { req_secs, req_instances, estimated_delay, - drvVersion, + display_driver_version, + cuda_version, (unsigned int)prop.totalGlobalMem, (unsigned int)prop.sharedMemPerBlock, prop.regsPerBlock, @@ -384,6 +449,8 @@ void COPROC_CUDA::clear() { req_secs = 0; req_instances = 0; estimated_delay = -1; // mark as absent + cuda_version = 0; + display_driver_version = 0; strcpy(prop.name, ""); prop.totalGlobalMem = 0; prop.sharedMemPerBlock = 0; @@ -419,7 +486,8 @@ int COPROC_CUDA::parse(FILE* fin) { if (parse_int(buf, "", req_instances)) continue; if (parse_double(buf, "", estimated_delay)) continue; if (parse_str(buf, "", prop.name, sizeof(prop.name))) continue; - if (parse_int(buf, "", drvVersion)) continue; + if (parse_int(buf, "", display_driver_version)) continue; + if (parse_int(buf, "", cuda_version)) continue; if (parse_double(buf, "", prop.dtotalGlobalMem)) continue; if (parse_int(buf, "", (int&)prop.sharedMemPerBlock)) continue; if (parse_int(buf, "", prop.regsPerBlock)) continue; diff --git a/lib/coproc.h b/lib/coproc.h index 37611a40a3..39d8be5e4f 100644 --- a/lib/coproc.h +++ b/lib/coproc.h @@ -174,29 +174,29 @@ struct COPROCS { // struct cudaDeviceProp { char name[256]; - size_t totalGlobalMem; + unsigned int totalGlobalMem; // not used on the server; dtotalGlobalMem is used instead // (since some boards have >= 4GB) - size_t sharedMemPerBlock; + int sharedMemPerBlock; int regsPerBlock; int warpSize; - size_t memPitch; + int memPitch; int maxThreadsPerBlock; int maxThreadsDim[3]; int maxGridSize[3]; int clockRate; - size_t totalConstMem; + int totalConstMem; int major; int minor; - size_t textureAlignment; + int textureAlignment; int deviceOverlap; int multiProcessorCount; - int __cudaReserved[40]; double dtotalGlobalMem; // not defined in client }; struct COPROC_CUDA : public COPROC { - int drvVersion; // display driver version, obtained from NVAPI + int cuda_version; // CUDA runtime version + int display_driver_version; cudaDeviceProp prop; #ifndef _USING_FCGI_ @@ -222,4 +222,27 @@ struct COPROC_CUDA : public COPROC { void fake_cuda(COPROCS&, int); +enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20 +}; + #endif