From 758ff3e13aaeb1424006848cca7aadac4f5b7c82 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 28 Jan 2020 14:22:28 -0800 Subject: [PATCH] client: fix bug in GPU detection PR #3364 changed the way we clear structures. This introduced a bug: HOST_INFO::clear_host_info() intentionally didn't clear HOST_INFO::coprocs. But it was replaced with HOST_INFO::clear(), which did. This caused the client to lose GPU info. Fix: restore HOST_INFO::clear_host_info(), and add a comment to avoid future errors like this. Also add some comments in GPU detection, which is woefully lacking in them. --- client/gpu_detect.cpp | 52 +++++++++++++++++++++++++------------- client/gpu_nvidia.cpp | 5 +++- lib/app_ipc.cpp | 2 +- lib/coproc.h | 6 ++--- lib/gui_rpc_client_ops.cpp | 2 +- lib/hostinfo.cpp | 48 ++++++++++++++++++++++++++++++++++- lib/hostinfo.h | 9 +++---- 7 files changed, 94 insertions(+), 30 deletions(-) diff --git a/client/gpu_detect.cpp b/client/gpu_detect.cpp index 01b8870a7a..15650a2030 100644 --- a/client/gpu_detect.cpp +++ b/client/gpu_detect.cpp @@ -18,39 +18,39 @@ // client-specific GPU code. Mostly GPU detection // -// theory of operation: -// there are two ways of detecting GPUs: +// theory of operation: +// there are two ways of detecting GPUs: // - vendor-specific libraries like CUDA and CAL, // which detect only that vendor's GPUs // - OpenCL, which can detect multiple types of GPUs, // including nvidia/amd/intel as well was new types // such as ARM integrated GPUs // -// These libraries sometimes crash, -// and we've been unable to trap these via signal and exception handlers. -// So we do GPU detection in a separate process (boinc --detect_gpus) -// This process writes an XML file "coproc_info.xml" containing +// These libraries sometimes crash, +// and we've been unable to trap these via signal and exception handlers. +// So we do GPU detection in a separate process (boinc --detect_gpus) +// This process writes an XML file "coproc_info.xml" containing // - lists of GPU detected via CUDA and CAL // - lists of nvidia/amd/intel GPUs detected via OpenCL // - a list of other GPUs detected via OpenCL // -// When the process finishes, the client parses the info file. -// Then for each vendor it "correlates" the GPUs, which includes: +// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power +// down the more powerful GPU until all applications which used them exit. +// Doing GPU detection in a second, short-lived process +// saves battery life on these laptops. +// +// When the process finishes, the client parses the info file. +// Then for each vendor it "correlates" the GPUs, which includes: // - matching up the OpenCL and vendor-specific descriptions, if both exist // - finding the most capable GPU, and seeing which other GPUs // are similar to it in hardware and RAM. // Other GPUs are not used. // - copy these to the COPROCS structure // -// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power -// down the more powerful GPU until all applications which used -// them exit. Doing GPU detection in a second, short-lived process -// saves battery life on these laptops. -// -// GPUs can also be explicitly described in cc_config.xml +// GPUs can also be explicitly described in cc_config.xml -// uncomment to do GPU detection in same process (for debugging) +// comment out to do GPU detection in same process (for debugging) // #define USE_CHILD_PROCESS_TO_DETECT_GPUS 1 @@ -90,6 +90,8 @@ void segv_handler(int) { } #endif +// the following store GPU instances during initialization +// vector ati_gpus; vector nvidia_gpus; vector intel_gpus; @@ -105,11 +107,16 @@ static char* client_path; static char client_dir[MAXPATHLEN]; // current directory at start of client +// find GPU instances, then correlate (merge) them +// void COPROCS::get( bool use_all, vector&descs, vector&warnings, IGNORE_GPU_INSTANCE& ignore_gpu_instance ) { #if USE_CHILD_PROCESS_TO_DETECT_GPUS + // detect_gpus() can cause crashes even with try/catch, + // so do it in a separate process that writes a file + // int retval = 0; char buf[256]; @@ -135,7 +142,10 @@ void COPROCS::get( correlate_gpus(use_all, descs, ignore_gpu_instance); } - +// populate the global vectors +// ati_gpus, nvidia_gpus, intel_gpus, +// nvidia_opencls, etc. +// void COPROCS::detect_gpus(vector &warnings) { #ifdef _WIN32 try { @@ -194,7 +204,12 @@ void COPROCS::detect_gpus(vector &warnings) { #endif } - +// for each GPU type, scan the GPUs we detected +// (e.g. the vector nvidia_gpus). +// Find the most capable one, and the ones equivalent to it. +// Also correlate OpenCL GPUs with CUDA/CAL GPUs. +// Then create a single COPROC (with appropriate count) +// void COPROCS::correlate_gpus( bool use_all, vector &descs, @@ -441,6 +456,9 @@ int COPROCS::write_coproc_info_file(vector &warnings) { return 0; } +// if we're using a separate process to find GPUs, +// read its output file and create data structures +// int COPROCS::read_coproc_info_file(vector &warnings) { MIOFILE mf; int retval; diff --git a/client/gpu_nvidia.cpp b/client/gpu_nvidia.cpp index defe9ecee3..5d6ddc2a01 100644 --- a/client/gpu_nvidia.cpp +++ b/client/gpu_nvidia.cpp @@ -465,7 +465,10 @@ leave: #endif } - +// Find the most capable instance; copy to *this. +// set is_used (USED, UNUSED, IGNORED) for each instance. +// Don't use less-capable instances (unless use_all is set) +// void COPROC_NVIDIA::correlate( bool use_all, // if false, use only those equivalent to most capable vector& ignore_devs diff --git a/lib/app_ipc.cpp b/lib/app_ipc.cpp index 849d03d03a..1ef96a3e7c 100644 --- a/lib/app_ipc.cpp +++ b/lib/app_ipc.cpp @@ -269,7 +269,7 @@ void APP_INIT_DATA::clear() { host_total_credit = 0; host_expavg_credit = 0; resource_share_fraction = 0; - host_info.clear(); + host_info.clear_host_info(); proxy_info.clear(); global_prefs.defaults(); starting_elapsed_time = 0; diff --git a/lib/coproc.h b/lib/coproc.h index 7c0905469e..3d4746986b 100644 --- a/lib/coproc.h +++ b/lib/coproc.h @@ -304,7 +304,7 @@ struct COPROC_NVIDIA : public COPROC { #ifndef _USING_FCGI_ void write_xml(MIOFILE&, bool scheduler_rpc); #endif - COPROC_NVIDIA(): COPROC() {} + COPROC_NVIDIA(): COPROC() {clear();} void get(std::vector& warnings); void correlate( bool use_all, @@ -339,7 +339,7 @@ struct COPROC_ATI : public COPROC { #ifndef _USING_FCGI_ void write_xml(MIOFILE&, bool scheduler_rpc); #endif - COPROC_ATI(): COPROC() {} + COPROC_ATI(): COPROC() {clear();} void get(std::vector& warnings); void correlate( bool use_all, @@ -361,7 +361,7 @@ struct COPROC_INTEL : public COPROC { #ifndef _USING_FCGI_ void write_xml(MIOFILE&, bool scheduler_rpc); #endif - COPROC_INTEL(): COPROC() {} + COPROC_INTEL(): COPROC() {clear();} void get(std::vector& warnings); void correlate( bool use_all, diff --git a/lib/gui_rpc_client_ops.cpp b/lib/gui_rpc_client_ops.cpp index 546175294f..4d5c2572ff 100644 --- a/lib/gui_rpc_client_ops.cpp +++ b/lib/gui_rpc_client_ops.cpp @@ -1052,7 +1052,7 @@ void CC_STATE::clear() { results.clear(); platforms.clear(); executing_as_daemon = false; - host_info.clear(); + host_info.clear_host_info(); have_nvidia = false; have_ati = false; } diff --git a/lib/hostinfo.cpp b/lib/hostinfo.cpp index 1512b06282..6686d31711 100644 --- a/lib/hostinfo.cpp +++ b/lib/hostinfo.cpp @@ -39,8 +39,54 @@ #include "hostinfo.h" +HOST_INFO::HOST_INFO() { + clear_host_info(); +} + +// this must NOT clear coprocs +// (initialization logic assumes that) +// +void HOST_INFO::clear_host_info() { + timezone = 0; + safe_strcpy(domain_name, ""); + safe_strcpy(serialnum, ""); + safe_strcpy(ip_addr, ""); + safe_strcpy(host_cpid, ""); + + p_ncpus = 0; + safe_strcpy(p_vendor, ""); + safe_strcpy(p_model, ""); + safe_strcpy(p_features, ""); + p_fpops = 0; + p_iops = 0; + p_membw = 0; + p_calculated = 0; + p_vm_extensions_disabled = false; + + m_nbytes = 0; + m_cache = 0; + m_swap = 0; + + d_total = 0; + d_free = 0; + + safe_strcpy(os_name, ""); + safe_strcpy(os_version, ""); + + wsl_available = false; +#ifdef _WIN64 + wsls.clear(); +#endif + + safe_strcpy(product_name, ""); + safe_strcpy(mac_address, ""); + + safe_strcpy(virtualbox_version, ""); + num_opencl_cpu_platforms = 0; +} + int HOST_INFO::parse(XML_PARSER& xp, bool static_items_only) { - clear(); + clear_host_info(); while (!xp.get_tag()) { if (xp.match_tag("/host_info")) return 0; if (xp.parse_double("p_fpops", p_fpops)) { diff --git a/lib/hostinfo.h b/lib/hostinfo.h index c7134394fe..3bb65b061a 100644 --- a/lib/hostinfo.h +++ b/lib/hostinfo.h @@ -95,12 +95,9 @@ public: int num_opencl_cpu_platforms; OPENCL_CPU_PROP opencl_cpu_prop[MAX_OPENCL_CPU_PLATFORMS]; - HOST_INFO(int){} - HOST_INFO(){} - void clear() { - static const HOST_INFO x(0); - *this = x; - } + void clear_host_info(); + HOST_INFO(); + int parse(XML_PARSER&, bool static_items_only = false); int write(MIOFILE&, bool include_net_info, bool include_coprocs); int parse_cpu_benchmarks(FILE*);