svn path=/trunk/boinc/; revision=18770

2009-07-29 21:21:52 +00:00 · 2009-07-29 21:21:52 +00:00 · 94e75fd4b1
parent ff5d082e86
commit 94e75fd4b1
2 changed files with 111 additions and 20 deletions
--- a/lib/coproc.cpp
+++ b/lib/coproc.cpp
@ -93,7 +93,7 @@ void COPROCS::summary_string(char* buf, int len) {
            COPROC_CUDA* cp2 = (COPROC_CUDA*) cp;
            int mem = (int)(cp2->prop.dtotalGlobalMem/MEGA);
            sprintf(buf2, "[CUDA|%s|%d|%dMB|%d]",
-                cp2->prop.name, cp2->count, mem, cp2->drvVersion
+                cp2->prop.name, cp2->count, mem, cp2->display_driver_version
            );
            strcat(bigbuf, buf2);
        }
@ -149,8 +149,8 @@ int cuda_compare(COPROC_CUDA& c1, COPROC_CUDA& c2, bool loose) {
    if (c1.prop.major < c2.prop.major) return -1;
    if (c1.prop.minor > c2.prop.minor) return 1;
    if (c1.prop.minor < c2.prop.minor) return -1;
-    if (c1.drvVersion > c2.drvVersion) return 1; 
-    if (c1.drvVersion < c2.drvVersion) return -1; 
+    if (c1.cuda_version > c2.cuda_version) return 1; 
+    if (c1.cuda_version < c2.cuda_version) return -1; 
    if (loose) {
        if (c1.prop.totalGlobalMem > 1.4*c2.prop.totalGlobalMem) return 1;
        if (c1.prop.totalGlobalMem < .7* c2.prop.totalGlobalMem) return -1;
@ -169,17 +169,29 @@ void COPROC_CUDA::get(
    COPROCS& coprocs, vector<string>& strings,
    bool use_all    // if false, use only those equivalent to most capable
 ) {
-    int count;
+    int count, retval;

 #ifdef _WIN32

    typedef int (__stdcall *PCGDC)(int *count);
    typedef int (__stdcall *PCGDP)(struct cudaDeviceProp *prop, int device);
    typedef int (__stdcall *PCGDV)(int* version);
+    typedef int (__stdcall *PCGDI)(int);
+    typedef int (__stdcall *PCGDG)(int*, int);
+    typedef int (__stdcall *PCGDA)(int*, int, int);
+    typedef int (__stdcall *PCGDN)(char*, int, int);
+    typedef int (__stdcall *PCGDM)(unsigned int*, int);
+    typedef int (__stdcall *PCGDCC)(int*, int*, int);

    PCGDC __cudaGetDeviceCount = NULL;
    PCGDP __cudaGetDeviceProperties = NULL;
    PCGDV __cudaGetDriverVersion = NULL;
+    PCGDI __cudaInit = NULL;
+    PCGDG __cuDeviceGet = NULL;
+    PCGDA __cuDeviceGetAttribute = NULL;
+    PCGDN __cuDeviceGetName = NULL;
+    PCGDM __cuDeviceTotalMem = NULL;
+    PCGDCC __cuDeviceComputeCapability = NULL;

    HMODULE cudalib = LoadLibrary("nvcuda.dll");
    if (!cudalib) {
@ -204,7 +216,28 @@ void COPROC_CUDA::get(
        strings.push_back("Library doesn't have cuDriverGetVersion()");
        return;
    }
+    __cudaInit = (PCGDI)GetProcAddress(cudalib, "cuInit" );
+    __cuDeviceGet = (PCGDG)GetProcAddress(cudalib, "cuDeviceGet" );
+    __cuDeviceGetAttribute = (PCGDA)GetProcAddress(cudalib, "cuDeviceGetAttribute" );
+    __cuDeviceGetName = (PCGDN)GetProcAddress(cudalib, "cuDeviceGetName" );
+    __cuDeviceTotalMem = (PCGDM)GetProcAddress(cudalib, "cuDeviceTotalMem" );
+    __cuDeviceComputeCapability = (PCGDCC)GetProcAddress(cudalib, "cuDeviceComputeCapability" );

+    #ifndef SIM
+    NvAPI_Status nvapiStatus;
+    NvDisplayHandle hDisplay;
+    NV_DISPLAY_DRIVER_VERSION Version;
+    memset(&Version, 0, sizeof(Version));
+    Version.version = NV_DISPLAY_DRIVER_VERSION_VER;
+
+    NvAPI_Initialize();
+    for (int i=0; ; i++) {
+        nvapiStatus = NvAPI_EnumNvidiaDisplayHandle(i, &hDisplay);
+        if (nvapiStatus != NVAPI_OK) break;
+        nvapiStatus = NvAPI_GetDisplayDriverVersion(hDisplay, &Version);
+        if (nvapiStatus == NVAPI_OK) break;
+    }
+#endif
 #else
    void* cudalib;
    void (*__cudaGetDeviceCount)(int*);
@ -237,20 +270,50 @@ void COPROC_CUDA::get(
    }
 #endif

-    int driver_version;
-    (*__cudaGetDriverVersion)(&driver_version);
+
+    retval = (*__cudaInit)(0);
+
+    int cuda_version;
+    retval = (*__cudaGetDriverVersion)(&cuda_version);

    vector<COPROC_CUDA> gpus;
-    (*__cudaGetDeviceCount)(&count);
+    retval = (*__cudaGetDeviceCount)(&count);
    int j;
    unsigned int i;
    COPROC_CUDA cc;
    string s;
    for (j=0; j<count; j++) {
-        (*__cudaGetDeviceProperties)(&cc.prop, j);
+        memset(&cc.prop, 0, sizeof(cc.prop));
+        int device;
+        retval = (*__cuDeviceGet)(&device, j);
+        (*__cuDeviceGetName)(cc.prop.name, 256, device);
+        (*__cuDeviceComputeCapability)(&cc.prop.major, &cc.prop.minor, device);
+        (*__cuDeviceTotalMem)(&cc.prop.totalGlobalMem, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.regsPerBlock, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, device);
+        retval = (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
+        retval = (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.totalConstMem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.textureAlignment, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, device);
+        (*__cuDeviceGetAttribute)(&cc.prop.deviceOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, device);
+        retval = (*__cuDeviceGetAttribute)(&cc.prop.multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
+        //retval = (*__cudaGetDeviceProperties)(&cc.prop, device);
        if (cc.prop.major <= 0) continue;  // major == 0 means emulation
        if (cc.prop.major > 100) continue;  // e.g. 9999 is an error
-        cc.drvVersion = driver_version;
+#if defined(_WIN32) && !defined(SIM)
+        cc.display_driver_version = Version.drvVersion;
+#else
+        cc.display_driver_version = 0;
+#endif
+        cc.cuda_version = cuda_version;
        cc.device_num = j;
        gpus.push_back(cc);
    }
@ -294,8 +357,8 @@ void COPROC_CUDA::get(
 }

 void COPROC_CUDA::description(char* buf) {
-	sprintf(buf, "%s (driver version %d, compute capability %d.%d, %.0fMB, est. %.0fGFLOPS)",
-		prop.name, drvVersion, prop.major, prop.minor, prop.totalGlobalMem/(1024.*1024.), flops_estimate()/1e9
+	sprintf(buf, "%s (driver version %d, CUDA version %d, compute capability %d.%d, %.0fMB, est. %.0fGFLOPS)",  // formats a one-line device summary into buf; NOTE(review): sprintf is unbounded and no length is passed in, so the caller must supply a large enough buffer
+		prop.name, display_driver_version, cuda_version, prop.major, prop.minor, prop.totalGlobalMem/(1024.*1024.), flops_estimate()/1e9  // totalGlobalMem is divided by 1024*1024 for the MB field; flops_estimate() is divided by 1e9 for the GFLOPS field
 	);
 }

@ -337,6 +400,7 @@ void COPROC_CUDA::write_xml(MIOFILE& f) {
        "   <req_instances>%d</req_instances>\n"
        "   <estimated_delay>%f</estimated_delay>\n"
        "   <drvVersion>%d</drvVersion>\n"
+        "   <cudaVersion>%d</cudaVersion>\n"
        "   <totalGlobalMem>%u</totalGlobalMem>\n"
        "   <sharedMemPerBlock>%u</sharedMemPerBlock>\n"
        "   <regsPerBlock>%d</regsPerBlock>\n"
@ -358,7 +422,8 @@ void COPROC_CUDA::write_xml(MIOFILE& f) {
        req_secs,
        req_instances,
        estimated_delay,
-        drvVersion,
+        display_driver_version,
+        cuda_version,
        (unsigned int)prop.totalGlobalMem,
        (unsigned int)prop.sharedMemPerBlock,
        prop.regsPerBlock,
@ -384,6 +449,8 @@ void COPROC_CUDA::clear() {
    req_secs = 0;
    req_instances = 0;
    estimated_delay = -1;   // mark as absent
+    cuda_version = 0;
+    display_driver_version = 0;
    strcpy(prop.name, "");
    prop.totalGlobalMem = 0;
    prop.sharedMemPerBlock = 0;
@ -419,7 +486,8 @@ int COPROC_CUDA::parse(FILE* fin) {
        if (parse_int(buf, "<req_instances>", req_instances)) continue;
        if (parse_double(buf, "<estimated_delay>", estimated_delay)) continue;
        if (parse_str(buf, "<name>", prop.name, sizeof(prop.name))) continue;
-        if (parse_int(buf, "<drvVersion>", drvVersion)) continue;
+        if (parse_int(buf, "<drvVersion>", display_driver_version)) continue;
+        if (parse_int(buf, "<cudaVersion>", cuda_version)) continue;
        if (parse_double(buf, "<totalGlobalMem>", prop.dtotalGlobalMem)) continue;
        if (parse_int(buf, "<sharedMemPerBlock>", (int&)prop.sharedMemPerBlock)) continue;
        if (parse_int(buf, "<regsPerBlock>", prop.regsPerBlock)) continue;
--- a/lib/coproc.h
+++ b/lib/coproc.h
@ -174,29 +174,29 @@ struct COPROCS {
 //
 struct cudaDeviceProp {
  char   name[256];
-  size_t totalGlobalMem;
+  unsigned int totalGlobalMem;  // its address is passed to cuDeviceTotalMem in COPROC_CUDA::get(), whose Windows typedef takes unsigned int*
    // not used on the server; dtotalGlobalMem is used instead
    // (since some boards have >= 4GB)
-  size_t sharedMemPerBlock;
+  int sharedMemPerBlock;  // int because its address is passed as int* to cuDeviceGetAttribute; NOTE(review): with these field types (and __cudaReserved removed) the layout presumably no longer matches the CUDA runtime's cudaDeviceProp - confirm before handing this struct to cudaGetDeviceProperties
  int    regsPerBlock;
  int    warpSize;
-  size_t memPitch;
+  int memPitch;  // filled from CU_DEVICE_ATTRIBUTE_MAX_PITCH via cuDeviceGetAttribute(int*, ...)
  int    maxThreadsPerBlock;
  int    maxThreadsDim[3];
  int    maxGridSize[3]; 
  int    clockRate;
-  size_t totalConstMem; 
+  int totalConstMem;  // filled from CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY via cuDeviceGetAttribute(int*, ...)
  int    major;
  int    minor;
-  size_t textureAlignment;
+  int textureAlignment;  // filled from CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT via cuDeviceGetAttribute(int*, ...)
  int    deviceOverlap;
  int    multiProcessorCount;
-  int    __cudaReserved[40];
  double dtotalGlobalMem;   // not defined in client
 };

 struct COPROC_CUDA : public COPROC {
-    int drvVersion;  // display driver version, obtained from NVAPI
+    int cuda_version;  // CUDA runtime version
+    int display_driver_version;
    cudaDeviceProp prop;

 #ifndef _USING_FCGI_
@ -222,4 +222,27 @@ struct COPROC_CUDA : public COPROC {

 void fake_cuda(COPROCS&, int);

+enum  	CUdevice_attribute_enum {  // attribute selectors passed as the second argument to cuDeviceGetAttribute in COPROC_CUDA::get(); values presumably mirror the CUDA driver API header - confirm against cuda.h before changing
+  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,  // stored in cudaDeviceProp::maxThreadsPerBlock
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,  // stored in cudaDeviceProp::maxThreadsDim[0]
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,  // stored in cudaDeviceProp::maxThreadsDim[1]
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,  // stored in cudaDeviceProp::maxThreadsDim[2]
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,  // stored in cudaDeviceProp::maxGridSize[0]
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,  // stored in cudaDeviceProp::maxGridSize[1]
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,  // stored in cudaDeviceProp::maxGridSize[2]
+  CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,  // stored in cudaDeviceProp::sharedMemPerBlock
+  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,  // stored in cudaDeviceProp::totalConstMem
+  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,  // stored in cudaDeviceProp::warpSize
+  CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,  // stored in cudaDeviceProp::memPitch
+  CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,  // stored in cudaDeviceProp::regsPerBlock
+  CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,  // stored in cudaDeviceProp::clockRate
+  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,  // stored in cudaDeviceProp::textureAlignment
+  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,  // stored in cudaDeviceProp::deviceOverlap
+  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,  // stored in cudaDeviceProp::multiProcessorCount
+  CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,  // not referenced in the hunks of this change
+  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,  // not referenced in the hunks of this change
+  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,  // not referenced in the hunks of this change
+  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20  // not referenced in the hunks of this change
+};
+
 #endif