// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2012 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
// Detection of NVIDIA GPUs
#ifdef _WIN32
#include "boinc_win.h"
/* get annotation macros from sal.h */
/* define the ones that don't exist */
#include "sal.h"
/* These are just an annotations. They don't do anything */
#ifndef __success
#define __success(x)
#endif
#ifndef __in
#define __in
#endif
#ifndef __out
#define __out
#endif
#ifndef __in_ecount
#define __in_ecount(x)
#endif
#ifndef __out_ecount
#define __out_ecount(x)
#endif
#ifndef __in_opt
#define __in_opt
#endif
#ifndef __out_opt
#define __out_opt
#endif
#ifndef __inout
#define __inout
#endif
#ifndef __inout_opt
#define __inout_opt
#endif
#ifndef __inout_ecount
#define __inout_ecount(x)
#endif
#ifndef __inout_ecount_full
#define __inout_ecount_full(x)
#endif
#ifndef __inout_ecount_part_opt
#define __inout_ecount_part_opt(x,y)
#endif
#ifndef __inout_ecount_full_opt
#define __inout_ecount_full_opt(x,y)
#endif
#ifndef __out_ecount_full_opt
#define __out_ecount_full_opt(x)
#endif
#include "nvapi.h"
#ifdef _MSC_VER
#define snprintf _snprintf
#endif
#else
#ifdef __APPLE__
// Suppress obsolete warning when building for OS 10.3.9
#define DLOPEN_NO_WARN
#include
#include
#endif
#include "config.h"
#include
#endif
#include
#include
using std::vector;
using std::string;
#include "coproc.h"
#include "util.h"
#include "client_msgs.h"
#include "gpu_detect.h"
static void get_available_nvidia_ram(COPROC_NVIDIA &cc, vector& warnings);
#if !(defined(_WIN32) || defined(__APPLE__))
// Query the installed NVIDIA display driver version through NVML.
//
// Loads libnvidia-ml.so at run time, resolves nvmlInit(),
// nvmlSystemGetDriverVersion() and nvmlShutdown(), asks for the
// driver version string, and unloads the library again.
//
// Returns (int)(100. * atof(version string)), i.e. the leading decimal
// number of the version string scaled by 100 and truncated to int.
// Returns 0 if the library cannot be loaded, if any of the three
// entry points is missing, or if an NVML call returns nonzero.
//
static int nvidia_driver_version() {
    typedef int (*NVML_VOID_FUNC)(void);
    typedef int (*NVML_DRIVER_FUNC)(char*, unsigned int);

    int dri_ver = 0;
    // 81 bytes, zero-filled; only 80 are offered to NVML so the
    // string handed to atof() is always terminated.
    char driver_string[81] = {0};

    void* handle = dlopen("libnvidia-ml.so", RTLD_NOW);
    if (!handle) return 0;

    NVML_DRIVER_FUNC nvml_driver =
        (NVML_DRIVER_FUNC) dlsym(handle, "nvmlSystemGetDriverVersion");
    NVML_VOID_FUNC nvml_init = (NVML_VOID_FUNC) dlsym(handle, "nvmlInit");
    NVML_VOID_FUNC nvml_finish = (NVML_VOID_FUNC) dlsym(handle, "nvmlShutdown");

    // All three entry points are required.
    // nvmlShutdown() is called only when nvmlInit() has succeeded,
    // so init/shutdown calls are always paired.
    if (nvml_driver && nvml_init && nvml_finish && !nvml_init()) {
        if (!nvml_driver(driver_string, 80)) {
            dri_ver = (int) (100. * atof(driver_string));
        }
        nvml_finish();
    }

    dlclose(handle);
    return dri_ver;
}
#endif
// Three-way comparison of a single attribute.
// Uses only > and <, so two values that are neither greater nor less
// than each other compare as equal (0).
//
template <typename T>
static inline int nvidia_cmp_field(const T& a, const T& b) {
    if (a > b) return 1;
    if (a < b) return -1;
    return 0;
}

// Rank two NVIDIA devices.
// Returns 1 if c1 is more capable than c2, -1 if less, 0 if the same.
// The criteria are tried in this order; the first that differs decides:
//   1) compute capability (prop.major, then prop.minor)
//   2) software version (cuda_version)
//   3) available memory (available_ram)
//   4) speed (peak_flops)
//
// If "loose": criterion 4 is skipped, and for criterion 3 the devices
// are considered the same unless c1 has more than 1.4 times,
// or less than 0.7 times, the available memory of c2.
//
int nvidia_compare(COPROC_NVIDIA& c1, COPROC_NVIDIA& c2, bool loose) {
    int result;

    result = nvidia_cmp_field(c1.prop.major, c2.prop.major);
    if (result) return result;

    result = nvidia_cmp_field(c1.prop.minor, c2.prop.minor);
    if (result) return result;

    result = nvidia_cmp_field(c1.cuda_version, c2.cuda_version);
    if (result) return result;

    if (!loose) {
        // strict: exact memory comparison, then peak FLOPS as tie-breaker
        result = nvidia_cmp_field(c1.available_ram, c2.available_ram);
        if (result) return result;

        double flops1 = c1.peak_flops;
        double flops2 = c2.peak_flops;
        return nvidia_cmp_field(flops1, flops2);
    }

    // loose: only a large memory difference counts
    if (c1.available_ram > 1.4*c2.available_ram) return 1;
    if (c1.available_ram < .7*c2.available_ram) return -1;
    return 0;
}
// Device attribute selectors, declared locally under the same names
// the CUDA driver API uses for its CUdevice_attribute enumeration.
// Presumably these are passed as the "attribute" argument of
// cuDeviceGetAttribute() - TODO confirm against the caller.
//
// NOTE(review): only a subset of attributes is listed, and the numbering
// is not contiguous (19 is followed by 33, 34 and 50). The numeric values
// must match the driver's definitions; check cuda.h before editing them.
//
enum CUdevice_attribute_enum {
// per-block / per-grid launch limits
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
// memory and register resources
CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
// clock, alignment and device characteristics
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
// PCI location of the device
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50
};
// Pointers to CUDA driver library entry points, resolved at run time
// rather than linked. Each pointer is named after the driver function
// it stands for, with a "__" prefix.
//
// NOTE(review): in current CUDA headers cuDeviceTotalMem, cuCtxCreate,
// cuCtxDestroy, cuMemAlloc, cuMemFree and cuMemGetInfo are macros for
// "_v2" entry points, and the unsuffixed exports may be older variants
// with 32-bit size/pointer arguments. Confirm that the size_t and
// unsigned int parameter types declared below match the symbols that
// are actually looked up.
//
#ifdef _WIN32
// Windows: one __stdcall function-pointer type per entry point.
typedef int (__stdcall *CUDA_GDC)(int *count);
typedef int (__stdcall *CUDA_GDV)(int* version);
typedef int (__stdcall *CUDA_GDI)(unsigned int);
typedef int (__stdcall *CUDA_GDG)(int*, int);
typedef int (__stdcall *CUDA_GDA)(int*, int, int);
typedef int (__stdcall *CUDA_GDN)(char*, int, int);
typedef int (__stdcall *CUDA_GDM)(size_t*, int);
typedef int (__stdcall *CUDA_GDCC)(int*, int*, int);
typedef int (__stdcall *CUDA_CC)(void**, unsigned int, unsigned int);
typedef int (__stdcall *CUDA_CD)(void*);
typedef int (__stdcall *CUDA_MA)(unsigned int*, size_t);
typedef int (__stdcall *CUDA_MF)(unsigned int);
typedef int (__stdcall *CUDA_MGI)(size_t*, size_t*);
// The pointers start out NULL; COPROC_NVIDIA::get() assigns them
// from GetProcAddress() on nvcuda.dll.
CUDA_GDC __cuDeviceGetCount = NULL;
CUDA_GDV __cuDriverGetVersion = NULL;
CUDA_GDI __cuInit = NULL;
CUDA_GDG __cuDeviceGet = NULL;
CUDA_GDA __cuDeviceGetAttribute = NULL;
CUDA_GDN __cuDeviceGetName = NULL;
CUDA_GDM __cuDeviceTotalMem = NULL;
CUDA_GDCC __cuDeviceComputeCapability = NULL;
CUDA_CC __cuCtxCreate = NULL;
CUDA_CD __cuCtxDestroy = NULL;
CUDA_MA __cuMemAlloc = NULL;
CUDA_MF __cuMemFree = NULL;
CUDA_MGI __cuMemGetInfo = NULL;
#else
// Other platforms: plain function pointers with the same parameter lists.
// They have no explicit initializer (namespace-scope objects are
// zero-initialized).
// cudalib is presumably the handle of the loaded CUDA library - TODO confirm.
void* cudalib;
int (*__cuInit)(unsigned int);
int (*__cuDeviceGetCount)(int*);
int (*__cuDriverGetVersion)(int*);
int (*__cuDeviceGet)(int*, int);
int (*__cuDeviceGetAttribute)(int*, int, int);
int (*__cuDeviceGetName)(char*, int, int);
int (*__cuDeviceTotalMem)(size_t*, int);
int (*__cuDeviceComputeCapability)(int*, int*, int);
int (*__cuCtxCreate)(void**, unsigned int, unsigned int);
int (*__cuCtxDestroy)(void*);
int (*__cuMemAlloc)(unsigned int*, size_t);
int (*__cuMemFree)(unsigned int);
int (*__cuMemGetInfo)(size_t*, size_t*);
#endif
// NVIDIA interfaces are documented here:
// http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/index.html
//
// Detect NVIDIA GPUs through the CUDA driver library.
//
// Queries the CUDA driver version and the device count, then appends
// one COPROC_NVIDIA entry per accepted device to nvidia_gpus.
// Diagnostic and informational messages are appended to "warnings";
// on any failure the function returns early after adding a message.
//
// NOTE(review): the parameter type below is written as "vector&" with no
// template argument; string data is pushed onto it, so vector<string>
// is presumably intended - confirm.
//
void COPROC_NVIDIA::get(
vector& warnings
) {
int cuda_ndevs, retval;
char buf[256];
#ifdef _WIN32
// Windows: load the CUDA driver DLL and look up each entry point by name.
// The results are stored unchecked here; get_available_nvidia_ram()
// tests the pointers it needs before calling through them.
HMODULE cudalib = LoadLibrary("nvcuda.dll");
if (!cudalib) {
warnings.push_back("No NVIDIA library found");
return;
}
__cuDeviceGetCount = (CUDA_GDC)GetProcAddress( cudalib, "cuDeviceGetCount" );
__cuDriverGetVersion = (CUDA_GDV)GetProcAddress( cudalib, "cuDriverGetVersion" );
__cuInit = (CUDA_GDI)GetProcAddress( cudalib, "cuInit" );
__cuDeviceGet = (CUDA_GDG)GetProcAddress( cudalib, "cuDeviceGet" );
__cuDeviceGetAttribute = (CUDA_GDA)GetProcAddress( cudalib, "cuDeviceGetAttribute" );
__cuDeviceGetName = (CUDA_GDN)GetProcAddress( cudalib, "cuDeviceGetName" );
__cuDeviceTotalMem = (CUDA_GDM)GetProcAddress( cudalib, "cuDeviceTotalMem" );
__cuDeviceComputeCapability = (CUDA_GDCC)GetProcAddress( cudalib, "cuDeviceComputeCapability" );
__cuCtxCreate = (CUDA_CC)GetProcAddress( cudalib, "cuCtxCreate" );
__cuCtxDestroy = (CUDA_CD)GetProcAddress( cudalib, "cuCtxDestroy" );
__cuMemAlloc = (CUDA_MA)GetProcAddress( cudalib, "cuMemAlloc" );
__cuMemFree = (CUDA_MF)GetProcAddress( cudalib, "cuMemFree" );
__cuMemGetInfo = (CUDA_MGI)GetProcAddress( cudalib, "cuMemGetInfo" );
#ifndef SIM
// Ask NVAPI for the display driver version. "Version" is copied into
// each device's display_driver_version further down.
// The NVAPI return codes are not checked here.
NvAPI_Initialize();
NvAPI_ShortString ss;
NvU32 Version = 0;
NvAPI_SYS_GetDriverAndBranchVersion(&Version, ss);
#if 0
// NvAPI now provides an API for getting #cores :-)
// But not FLOPs per clock cycle :-(
// Anyway, don't use this for now because server code estimates FLOPS
// based on compute capability, so we may as well do the same
// See http://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/
//
NvPhysicalGpuHandle GPUHandle[NVAPI_MAX_PHYSICAL_GPUS];
NvU32 GpuCount, nc;
NvAPI_EnumPhysicalGPUs(GPUHandle, &GpuCount);
// NOTE(review): the "for" statement on the next line is not well-formed
// as written; part of it (and possibly whole lines after it, including
// preprocessor directives) appears to be missing. Restore this region
// from upstream before building.
for (unsigned int i=0; i (120*60)) break; // Don't retry if system has been up for over 2 minutes
boinc_sleep(1.);
continue;
}
#endif
// NOTE(review): the assignment that gives "retval" its value for this
// test is not present in the text above.
if (retval) {
sprintf(buf, "NVIDIA drivers present but no GPUs found");
warnings.push_back(buf);
return;
}
// cuda_version is not a local of this function (presumably a member of
// this object - confirm). It is copied into every detected device below.
retval = (*__cuDriverGetVersion)(&cuda_version);
if (retval) {
sprintf(buf, "cuDriverGetVersion() returned %d", retval);
warnings.push_back(buf);
return;
}
have_cuda = true;
retval = (*__cuDeviceGetCount)(&cuda_ndevs);
if (retval) {
sprintf(buf, "cuDeviceGetCount() returned %d", retval);
warnings.push_back(buf);
return;
}
// The device count is reported through "warnings" as well, as an
// informational message; "s" is appended unless the count is exactly 1.
sprintf(buf, "NVIDIA library reports %d GPU%s", cuda_ndevs, (cuda_ndevs==1)?"":"s");
warnings.push_back(buf);
int j, itemp;
size_t global_mem = 0;
COPROC_NVIDIA cc;
string s;
// NOTE(review): the "for" statement on the next line is not well-formed
// as written; the loop header and the start of the loop body appear to
// be missing. Restore this region from upstream before building.
for (j=0; j 100) continue; // e.g. 9999 is an error
// Record the display driver version, obtained differently per platform:
// NVAPI (queried above) on Windows, the run-time version of the "cuda"
// library on Mac, and NVML via nvidia_driver_version() elsewhere.
#if defined(_WIN32) && !defined(SIM)
cc.display_driver_version = Version;
#elif defined(__APPLE__)
cc.display_driver_version = NSVersionOfRunTimeLibrary("cuda");
#else
cc.display_driver_version = nvidia_driver_version();
#endif
cc.have_cuda = true;
cc.cuda_version = cuda_version;
cc.device_num = j;
cc.set_peak_flops();
// Fills in cc.available_ram (see get_available_nvidia_ram() below).
get_available_nvidia_ram(cc, warnings);
nvidia_gpus.push_back(cc);
}
if (!nvidia_gpus.size()) {
warnings.push_back("No CUDA-capable NVIDIA GPUs found");
}
}
// Decide which of the detected GPUs in nvidia_gpus are to be used.
//
// First pass: copy the most capable entry of nvidia_gpus into *this,
// using the strict (non-loose) form of nvidia_compare().
// Second pass: mark every entry COPROC_USED or COPROC_UNUSED and, for the
// used ones, record device_num and pci_info in device_nums[] / pci_infos[]
// and count them in "count".
//
// use_all:     if true, every entry that is not excluded by an earlier
//              test in the chain is used; otherwise only entries that
//              nvidia_compare(..., loose=true) reports as equal to *this.
// ignore_devs: NOTE(review): its use is not visible in the text below;
//              presumably a list of device numbers to exclude - confirm.
//              The type is written as "vector&" with no template argument.
//
// Does nothing if nvidia_gpus is empty.
//
void COPROC_NVIDIA::correlate(
bool use_all, // if false, use only those equivalent to most capable
vector& ignore_devs
) {
unsigned int i;
if (!nvidia_gpus.size()) return;
// identify the most capable non-ignored instance
//
bool first = true;
// NOTE(review): the "for" statement on the next line is not well-formed
// as written; the loop header, the start of the loop body and the
// directive matching the "#endif" below appear to be missing.
// Restore this region from upstream before building.
for (i=0; i= 6050) && nvidia_gpus[i].prop.major < 2) {
// Can't use GPUs with compute capability < 2 with CUDA drivers >= 6.5.x
nvidia_gpus[i].is_used = COPROC_UNUSED;
continue;
}
#endif
// The first candidate is taken as is; later ones replace it only if
// strictly more capable.
if (first) {
*this = nvidia_gpus[i];
first = false;
} else if (nvidia_compare(nvidia_gpus[i], *this, false) > 0) {
*this = nvidia_gpus[i];
}
}
// see which other instances are equivalent,
// and set "count", "device_nums", and "pci_infos"
//
count = 0;
// NOTE(review): the "for" statement on the next line is not well-formed
// as written; the loop header and the start of the if/else chain appear
// to be missing. Restore this region from upstream before building.
for (i=0; ihave_opencl && !nvidia_gpus[i].have_opencl) {
nvidia_gpus[i].is_used = COPROC_UNUSED;
} else if (this->have_cuda && !nvidia_gpus[i].have_cuda) {
// the chosen device has CUDA but this one does not: don't use it
nvidia_gpus[i].is_used = COPROC_UNUSED;
#ifdef __APPLE__
} else if (nvidia_gpus[i].is_used == COPROC_UNUSED) {
// Can't use GPUs with compute capability < 2 with CUDA drivers >= 6.5.x
continue;
#endif
} else if (use_all || !nvidia_compare(nvidia_gpus[i], *this, true)) {
// used: record its device number and PCI info in the next free slot
device_nums[count] = nvidia_gpus[i].device_num;
pci_infos[count] = nvidia_gpus[i].pci_info;
count++;
nvidia_gpus[i].is_used = COPROC_USED;
} else {
nvidia_gpus[i].is_used = COPROC_UNUSED;
}
}
}
// See how much RAM is available on this GPU.
//
// CAUTION: as currently written, this method should be
// called only from COPROC_NVIDIA::get(). If in the
// future you wish to call it from additional places:
// * It must be called from a separate child process on
// dual-GPU laptops (e.g., Macbook Pros) with the results
// communicated to the main client process via IPC or a
// temp file. See the comments about dual-GPU laptops
// in gpu_detect.cpp and main.cpp for more details.
// * The CUDA library must be loaded and cuInit() called
// first.
// * See client/coproc_detect.cpp and cpu_sched.cpp in
// BOINC 6.12.36 for an earlier attempt to call this
// from the scheduler. Note that it was abandoned
// due to repeated calls crashing the driver.
//
static void get_available_nvidia_ram(COPROC_NVIDIA &cc, vector& warnings) {
int retval;
size_t memfree = 0, memtotal = 0;
int device;
void* ctx;
char buf[256];
cc.available_ram = cc.prop.totalGlobalMem;
if (!__cuDeviceGet) {
warnings.push_back("cuDeviceGet() missing from NVIDIA library");
return;
}
if (!__cuCtxCreate) {
warnings.push_back("cuCtxCreate() missing from NVIDIA library");
return;
}
if (!__cuCtxDestroy) {
warnings.push_back("cuCtxDestroy() missing from NVIDIA library");
return;
}
if (!__cuMemGetInfo) {
warnings.push_back("cuMemGetInfo() missing from NVIDIA library");
return;
}
retval = (*__cuDeviceGet)(&device, cc.device_num);
if (retval) {
snprintf(buf, sizeof(buf),
"[coproc] cuDeviceGet(%d) returned %d", cc.device_num, retval
);
warnings.push_back(buf);
return;
}
retval = (*__cuCtxCreate)(&ctx, 0, device);
if (retval) {
snprintf(buf, sizeof(buf),
"[coproc] cuCtxCreate(%d) returned %d", cc.device_num, retval
);
warnings.push_back(buf);
return;
}
retval = (*__cuMemGetInfo)(&memfree, &memtotal);
if (retval) {
snprintf(buf, sizeof(buf),
"[coproc] cuMemGetInfo(%d) returned %d", cc.device_num, retval
);
warnings.push_back(buf);
(*__cuCtxDestroy)(ctx);
return;
}
(*__cuCtxDestroy)(ctx);
cc.available_ram = (double) memfree;
}
// check whether each GPU is running a graphics app (assume yes)
// return true if there's been a change since last time
//
// CAUTION: this method is not currently used. If you wish
// to call it in the future:
// * It must be called from a separate child process on
// dual-GPU laptops (e.g., Macbook Pros) with the results
// communicated to the main client process via IPC or a
// temp file. See the comments about dual-GPU laptops
// in gpu_detect.cpp and main.cpp for more details.
// * The CUDA library must be loaded and cuInit() called
// first.
//
#if 0
bool COPROC_NVIDIA::check_running_graphics_app() {
int retval, j;
bool change = false;
if (!__cuDeviceGet) {
warnings.push_back("cuDeviceGet() missing from NVIDIA library");
return;
}
if (!__cuDeviceGetAttribute) {
warnings.push_back("cuDeviceGetAttribute() missing from NVIDIA library");
return;
}
for (j=0; j