mirror of https://github.com/BOINC/boinc.git
client: fix bug in GPU detection
PR #3364 changed the way we clear structures. This introduced a bug: HOST_INFO::clear_host_info() intentionally didn't clear HOST_INFO::coprocs. But it was replaced with HOST_INFO::clear(), which did. This caused the client to lose GPU info. Fix: restore HOST_INFO::clear_host_info(), and add a comment to avoid future errors like this. Also add some comments in GPU detection, which is woefully lacking in them.
This commit is contained in:
parent
0a1d669c59
commit
758ff3e13a
|
@ -18,39 +18,39 @@
|
|||
|
||||
// client-specific GPU code. Mostly GPU detection
|
||||
//
|
||||
// theory of operation:
|
||||
// there are two ways of detecting GPUs:
|
||||
// theory of operation:
|
||||
// there are two ways of detecting GPUs:
|
||||
// - vendor-specific libraries like CUDA and CAL,
|
||||
// which detect only that vendor's GPUs
|
||||
// - OpenCL, which can detect multiple types of GPUs,
|
||||
// including nvidia/amd/intel as well as new types
|
||||
// such as ARM integrated GPUs
|
||||
//
|
||||
// These libraries sometimes crash,
|
||||
// and we've been unable to trap these via signal and exception handlers.
|
||||
// So we do GPU detection in a separate process (boinc --detect_gpus).
|
||||
// This process writes an XML file "coproc_info.xml" containing
|
||||
// These libraries sometimes crash,
|
||||
// and we've been unable to trap these via signal and exception handlers.
|
||||
// So we do GPU detection in a separate process (boinc --detect_gpus).
|
||||
// This process writes an XML file "coproc_info.xml" containing
|
||||
// - lists of GPUs detected via CUDA and CAL
|
||||
// - lists of nvidia/amd/intel GPUs detected via OpenCL
|
||||
// - a list of other GPUs detected via OpenCL
|
||||
//
|
||||
// When the process finishes, the client parses the info file.
|
||||
// Then for each vendor it "correlates" the GPUs, which includes:
|
||||
// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
|
||||
// down the more powerful GPU until all applications which used it exit.
|
||||
// Doing GPU detection in a second, short-lived process
|
||||
// saves battery life on these laptops.
|
||||
//
|
||||
// When the process finishes, the client parses the info file.
|
||||
// Then for each vendor it "correlates" the GPUs, which includes:
|
||||
// - matching up the OpenCL and vendor-specific descriptions, if both exist
|
||||
// - finding the most capable GPU, and seeing which other GPUs
|
||||
// are similar to it in hardware and RAM.
|
||||
// Other GPUs are not used.
|
||||
// - copy these to the COPROCS structure
|
||||
//
|
||||
// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
|
||||
// down the more powerful GPU until all applications which used
|
||||
// it exit. Doing GPU detection in a second, short-lived process
|
||||
// saves battery life on these laptops.
|
||||
//
|
||||
// GPUs can also be explicitly described in cc_config.xml
|
||||
|
||||
// GPUs can also be explicitly described in cc_config.xml
|
||||
|
||||
// uncomment to do GPU detection in same process (for debugging)
|
||||
// comment out to do GPU detection in same process (for debugging)
|
||||
//
|
||||
#define USE_CHILD_PROCESS_TO_DETECT_GPUS 1
|
||||
|
||||
|
@ -90,6 +90,8 @@ void segv_handler(int) {
|
|||
}
|
||||
#endif
|
||||
|
||||
// the following store GPU instances during initialization
|
||||
//
|
||||
vector<COPROC_ATI> ati_gpus;
|
||||
vector<COPROC_NVIDIA> nvidia_gpus;
|
||||
vector<COPROC_INTEL> intel_gpus;
|
||||
|
@ -105,11 +107,16 @@ static char* client_path;
|
|||
static char client_dir[MAXPATHLEN];
|
||||
// current directory at start of client
|
||||
|
||||
// find GPU instances, then correlate (merge) them
|
||||
//
|
||||
void COPROCS::get(
|
||||
bool use_all, vector<string>&descs, vector<string>&warnings,
|
||||
IGNORE_GPU_INSTANCE& ignore_gpu_instance
|
||||
) {
|
||||
#if USE_CHILD_PROCESS_TO_DETECT_GPUS
|
||||
// detect_gpus() can cause crashes even with try/catch,
|
||||
// so do it in a separate process that writes a file
|
||||
//
|
||||
int retval = 0;
|
||||
char buf[256];
|
||||
|
||||
|
@ -135,7 +142,10 @@ void COPROCS::get(
|
|||
correlate_gpus(use_all, descs, ignore_gpu_instance);
|
||||
}
|
||||
|
||||
|
||||
// populate the global vectors
|
||||
// ati_gpus, nvidia_gpus, intel_gpus,
|
||||
// nvidia_opencls, etc.
|
||||
//
|
||||
void COPROCS::detect_gpus(vector<string> &warnings) {
|
||||
#ifdef _WIN32
|
||||
try {
|
||||
|
@ -194,7 +204,12 @@ void COPROCS::detect_gpus(vector<string> &warnings) {
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
// for each GPU type, scan the GPUs we detected
|
||||
// (e.g. the vector nvidia_gpus).
|
||||
// Find the most capable one, and the ones equivalent to it.
|
||||
// Also correlate OpenCL GPUs with CUDA/CAL GPUs.
|
||||
// Then create a single COPROC (with appropriate count)
|
||||
//
|
||||
void COPROCS::correlate_gpus(
|
||||
bool use_all,
|
||||
vector<string> &descs,
|
||||
|
@ -441,6 +456,9 @@ int COPROCS::write_coproc_info_file(vector<string> &warnings) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// if we're using a separate process to find GPUs,
|
||||
// read its output file and create data structures
|
||||
//
|
||||
int COPROCS::read_coproc_info_file(vector<string> &warnings) {
|
||||
MIOFILE mf;
|
||||
int retval;
|
||||
|
|
|
@ -465,7 +465,10 @@ leave:
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Find the most capable instance; copy to *this.
|
||||
// set is_used (USED, UNUSED, IGNORED) for each instance.
|
||||
// Don't use less-capable instances (unless use_all is set)
|
||||
//
|
||||
void COPROC_NVIDIA::correlate(
|
||||
bool use_all, // if false, use only those equivalent to most capable
|
||||
vector<int>& ignore_devs
|
||||
|
|
|
@ -269,7 +269,7 @@ void APP_INIT_DATA::clear() {
|
|||
host_total_credit = 0;
|
||||
host_expavg_credit = 0;
|
||||
resource_share_fraction = 0;
|
||||
host_info.clear();
|
||||
host_info.clear_host_info();
|
||||
proxy_info.clear();
|
||||
global_prefs.defaults();
|
||||
starting_elapsed_time = 0;
|
||||
|
|
|
@ -304,7 +304,7 @@ struct COPROC_NVIDIA : public COPROC {
|
|||
#ifndef _USING_FCGI_
|
||||
void write_xml(MIOFILE&, bool scheduler_rpc);
|
||||
#endif
|
||||
COPROC_NVIDIA(): COPROC() {}
|
||||
COPROC_NVIDIA(): COPROC() {clear();}
|
||||
void get(std::vector<std::string>& warnings);
|
||||
void correlate(
|
||||
bool use_all,
|
||||
|
@ -339,7 +339,7 @@ struct COPROC_ATI : public COPROC {
|
|||
#ifndef _USING_FCGI_
|
||||
void write_xml(MIOFILE&, bool scheduler_rpc);
|
||||
#endif
|
||||
COPROC_ATI(): COPROC() {}
|
||||
COPROC_ATI(): COPROC() {clear();}
|
||||
void get(std::vector<std::string>& warnings);
|
||||
void correlate(
|
||||
bool use_all,
|
||||
|
@ -361,7 +361,7 @@ struct COPROC_INTEL : public COPROC {
|
|||
#ifndef _USING_FCGI_
|
||||
void write_xml(MIOFILE&, bool scheduler_rpc);
|
||||
#endif
|
||||
COPROC_INTEL(): COPROC() {}
|
||||
COPROC_INTEL(): COPROC() {clear();}
|
||||
void get(std::vector<std::string>& warnings);
|
||||
void correlate(
|
||||
bool use_all,
|
||||
|
|
|
@ -1052,7 +1052,7 @@ void CC_STATE::clear() {
|
|||
results.clear();
|
||||
platforms.clear();
|
||||
executing_as_daemon = false;
|
||||
host_info.clear();
|
||||
host_info.clear_host_info();
|
||||
have_nvidia = false;
|
||||
have_ati = false;
|
||||
}
|
||||
|
|
|
@ -39,8 +39,54 @@
|
|||
|
||||
#include "hostinfo.h"
|
||||
|
||||
HOST_INFO::HOST_INFO() {
|
||||
clear_host_info();
|
||||
}
|
||||
|
||||
// this must NOT clear coprocs
|
||||
// (initialization logic assumes that; clearing coprocs loses the detected GPU info)
|
||||
//
|
||||
void HOST_INFO::clear_host_info() {
|
||||
timezone = 0;
|
||||
safe_strcpy(domain_name, "");
|
||||
safe_strcpy(serialnum, "");
|
||||
safe_strcpy(ip_addr, "");
|
||||
safe_strcpy(host_cpid, "");
|
||||
|
||||
p_ncpus = 0;
|
||||
safe_strcpy(p_vendor, "");
|
||||
safe_strcpy(p_model, "");
|
||||
safe_strcpy(p_features, "");
|
||||
p_fpops = 0;
|
||||
p_iops = 0;
|
||||
p_membw = 0;
|
||||
p_calculated = 0;
|
||||
p_vm_extensions_disabled = false;
|
||||
|
||||
m_nbytes = 0;
|
||||
m_cache = 0;
|
||||
m_swap = 0;
|
||||
|
||||
d_total = 0;
|
||||
d_free = 0;
|
||||
|
||||
safe_strcpy(os_name, "");
|
||||
safe_strcpy(os_version, "");
|
||||
|
||||
wsl_available = false;
|
||||
#ifdef _WIN64
|
||||
wsls.clear();
|
||||
#endif
|
||||
|
||||
safe_strcpy(product_name, "");
|
||||
safe_strcpy(mac_address, "");
|
||||
|
||||
safe_strcpy(virtualbox_version, "");
|
||||
num_opencl_cpu_platforms = 0;
|
||||
}
|
||||
|
||||
int HOST_INFO::parse(XML_PARSER& xp, bool static_items_only) {
|
||||
clear();
|
||||
clear_host_info();
|
||||
while (!xp.get_tag()) {
|
||||
if (xp.match_tag("/host_info")) return 0;
|
||||
if (xp.parse_double("p_fpops", p_fpops)) {
|
||||
|
|
|
@ -95,12 +95,9 @@ public:
|
|||
int num_opencl_cpu_platforms;
|
||||
OPENCL_CPU_PROP opencl_cpu_prop[MAX_OPENCL_CPU_PLATFORMS];
|
||||
|
||||
HOST_INFO(int){}
|
||||
HOST_INFO(){}
|
||||
void clear() {
|
||||
static const HOST_INFO x(0);
|
||||
*this = x;
|
||||
}
|
||||
void clear_host_info();
|
||||
HOST_INFO();
|
||||
|
||||
int parse(XML_PARSER&, bool static_items_only = false);
|
||||
int write(MIOFILE&, bool include_net_info, bool include_coprocs);
|
||||
int parse_cpu_benchmarks(FILE*);
|
||||
|
|
Loading…
Reference in New Issue