Merge pull request #3443 from BOINC/dpa_hostinfo_fix

client: fix bug in GPU detection
This commit is contained in:
Vitalii Koshura 2020-02-05 10:50:12 +01:00 committed by GitHub
commit 2efc0964ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 95 additions and 31 deletions

View File

@ -426,7 +426,7 @@ int main(int argc, char** argv) {
}
} else if (!strcmp(cmd, "--set_host_info")) {
HOST_INFO h;
h.clear();
h.clear_host_info();
char* pn = next_arg(argc, argv, i);
safe_strcpy(h.product_name, pn);
retval = rpc.set_host_info(h);

View File

@ -18,39 +18,39 @@
// client-specific GPU code. Mostly GPU detection
//
// theory of operation:
// there are two ways of detecting GPUs:
// theory of operation:
// there are two ways of detecting GPUs:
// - vendor-specific libraries like CUDA and CAL,
// which detect only that vendor's GPUs
// - OpenCL, which can detect multiple types of GPUs,
// including nvidia/amd/intel as well as new types
// such as ARM integrated GPUs
//
// These libraries sometimes crash,
// and we've been unable to trap these via signal and exception handlers.
// So we do GPU detection in a separate process (boinc --detect_gpus)
// This process writes an XML file "coproc_info.xml" containing
// These libraries sometimes crash,
// and we've been unable to trap these via signal and exception handlers.
// So we do GPU detection in a separate process (boinc --detect_gpus)
// This process writes an XML file "coproc_info.xml" containing
// - lists of GPUs detected via CUDA and CAL
// - lists of nvidia/amd/intel GPUs detected via OpenCL
// - a list of other GPUs detected via OpenCL
//
// When the process finishes, the client parses the info file.
// Then for each vendor it "correlates" the GPUs, which includes:
// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
// down the more powerful GPU until all applications which used them exit.
// Doing GPU detection in a second, short-lived process
// saves battery life on these laptops.
//
// When the process finishes, the client parses the info file.
// Then for each vendor it "correlates" the GPUs, which includes:
// - matching up the OpenCL and vendor-specific descriptions, if both exist
// - finding the most capable GPU, and seeing which other GPUs
// are similar to it in hardware and RAM.
// Other GPUs are not used.
// - copy these to the COPROCS structure
//
// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
// down the more powerful GPU until all applications which used
// them exit. Doing GPU detection in a second, short-lived process
// saves battery life on these laptops.
//
// GPUs can also be explicitly described in cc_config.xml
// GPUs can also be explicitly described in cc_config.xml
// uncomment to do GPU detection in same process (for debugging)
// comment out to do GPU detection in same process (for debugging)
//
#define USE_CHILD_PROCESS_TO_DETECT_GPUS 1
@ -90,6 +90,8 @@ void segv_handler(int) {
}
#endif
// the following store GPU instances during initialization
//
vector<COPROC_ATI> ati_gpus;
vector<COPROC_NVIDIA> nvidia_gpus;
vector<COPROC_INTEL> intel_gpus;
@ -105,11 +107,16 @@ static char* client_path;
static char client_dir[MAXPATHLEN];
// current directory at start of client
// find GPU instances, then correlate (merge) them
//
void COPROCS::get(
bool use_all, vector<string>&descs, vector<string>&warnings,
IGNORE_GPU_INSTANCE& ignore_gpu_instance
) {
#if USE_CHILD_PROCESS_TO_DETECT_GPUS
// detect_gpus() can cause crashes even with try/catch,
// so do it in a separate process that writes a file
//
int retval = 0;
char buf[256];
@ -135,7 +142,10 @@ void COPROCS::get(
correlate_gpus(use_all, descs, ignore_gpu_instance);
}
// populate the global vectors
// ati_gpus, nvidia_gpus, intel_gpus,
// nvidia_opencls, etc.
//
void COPROCS::detect_gpus(vector<string> &warnings) {
#ifdef _WIN32
try {
@ -194,7 +204,12 @@ void COPROCS::detect_gpus(vector<string> &warnings) {
#endif
}
// for each GPU type, scan the GPUs we detected
// (e.g. the vector nvidia_gpus).
// Find the most capable one, and the ones equivalent to it.
// Also correlate OpenCL GPUs with CUDA/CAL GPUs.
// Then create a single COPROC (with appropriate count)
//
void COPROCS::correlate_gpus(
bool use_all,
vector<string> &descs,
@ -441,6 +456,9 @@ int COPROCS::write_coproc_info_file(vector<string> &warnings) {
return 0;
}
// if we're using a separate process to find GPUs,
// read its output file and create data structures
//
int COPROCS::read_coproc_info_file(vector<string> &warnings) {
MIOFILE mf;
int retval;

View File

@ -465,7 +465,10 @@ leave:
#endif
}
// Find the most capable instance; copy to *this.
// set is_used (USED, UNUSED, IGNORED) for each instance.
// Don't use less-capable instances (unless use_all is set)
//
void COPROC_NVIDIA::correlate(
bool use_all, // if false, use only those equivalent to most capable
vector<int>& ignore_devs

View File

@ -269,7 +269,7 @@ void APP_INIT_DATA::clear() {
host_total_credit = 0;
host_expavg_credit = 0;
resource_share_fraction = 0;
host_info.clear();
host_info.clear_host_info();
proxy_info.clear();
global_prefs.defaults();
starting_elapsed_time = 0;

View File

@ -304,7 +304,7 @@ struct COPROC_NVIDIA : public COPROC {
#ifndef _USING_FCGI_
void write_xml(MIOFILE&, bool scheduler_rpc);
#endif
COPROC_NVIDIA(): COPROC() {}
COPROC_NVIDIA(): COPROC() {clear();}
void get(std::vector<std::string>& warnings);
void correlate(
bool use_all,
@ -339,7 +339,7 @@ struct COPROC_ATI : public COPROC {
#ifndef _USING_FCGI_
void write_xml(MIOFILE&, bool scheduler_rpc);
#endif
COPROC_ATI(): COPROC() {}
COPROC_ATI(): COPROC() {clear();}
void get(std::vector<std::string>& warnings);
void correlate(
bool use_all,
@ -361,7 +361,7 @@ struct COPROC_INTEL : public COPROC {
#ifndef _USING_FCGI_
void write_xml(MIOFILE&, bool scheduler_rpc);
#endif
COPROC_INTEL(): COPROC() {}
COPROC_INTEL(): COPROC() {clear();}
void get(std::vector<std::string>& warnings);
void correlate(
bool use_all,

View File

@ -1052,7 +1052,7 @@ void CC_STATE::clear() {
results.clear();
platforms.clear();
executing_as_daemon = false;
host_info.clear();
host_info.clear_host_info();
have_nvidia = false;
have_ati = false;
}

View File

@ -39,8 +39,54 @@
#include "hostinfo.h"
// Default constructor: start with every host field in its cleared state
// by delegating to clear_host_info().
//
HOST_INFO::HOST_INFO() {
clear_host_info();
}
// Reset the host description fields to their empty state:
// numeric fields become 0, flags become false, strings become "".
//
// this must NOT clear coprocs
// (initialization logic assumes that)
//
void HOST_INFO::clear_host_info() {
// network / identity
timezone = 0;
safe_strcpy(domain_name, "");
safe_strcpy(serialnum, "");
safe_strcpy(ip_addr, "");
safe_strcpy(host_cpid, "");
// processor
p_ncpus = 0;
safe_strcpy(p_vendor, "");
safe_strcpy(p_model, "");
safe_strcpy(p_features, "");
p_fpops = 0;
p_iops = 0;
p_membw = 0;
p_calculated = 0;
p_vm_extensions_disabled = false;
// memory
m_nbytes = 0;
m_cache = 0;
m_swap = 0;
// disk
d_total = 0;
d_free = 0;
// operating system
safe_strcpy(os_name, "");
safe_strcpy(os_version, "");
wsl_available = false;
#ifdef _WIN64
// wsls is only cleared in 64-bit Windows builds
wsls.clear();
#endif
// miscellaneous
safe_strcpy(product_name, "");
safe_strcpy(mac_address, "");
safe_strcpy(virtualbox_version, "");
// only the platform count is reset here;
// the opencl_cpu_prop entries themselves are not touched
num_opencl_cpu_platforms = 0;
}
int HOST_INFO::parse(XML_PARSER& xp, bool static_items_only) {
clear();
clear_host_info();
while (!xp.get_tag()) {
if (xp.match_tag("/host_info")) return 0;
if (xp.parse_double("p_fpops", p_fpops)) {

View File

@ -97,12 +97,9 @@ public:
int num_opencl_cpu_platforms;
OPENCL_CPU_PROP opencl_cpu_prop[MAX_OPENCL_CPU_PLATFORMS];
HOST_INFO(int){}
HOST_INFO(){}
void clear() {
static const HOST_INFO x(0);
*this = x;
}
void clear_host_info();
HOST_INFO();
int parse(XML_PARSER&, bool static_items_only = false);
int write(MIOFILE&, bool include_net_info, bool include_coprocs);
int parse_cpu_benchmarks(FILE*);