client: fix bug in GPU detection

PR #3364 changed the way we clear structures. This introduced a bug: HOST_INFO::clear_host_info() intentionally didn't clear HOST_INFO::coprocs. But it was replaced with HOST_INFO::clear(), which did. This caused the client to lose GPU info. Fix: restore HOST_INFO::clear_host_info(), and add a comment to avoid future errors like this. Also add some comments in GPU detection, which is woefully lacking in them.
2020-01-28 14:22:28 -08:00 · 2020-01-28 14:22:28 -08:00 · 758ff3e13a
parent 0a1d669c59
commit 758ff3e13a
7 changed files with 94 additions and 30 deletions
--- a/client/gpu_detect.cpp
+++ b/client/gpu_detect.cpp
@ -18,39 +18,39 @@

 // client-specific GPU code.  Mostly GPU detection
 //
-//  theory of operation:
-//  there are two ways of detecting GPUs:
+// theory of operation:
+// there are two ways of detecting GPUs:
 //  - vendor-specific libraries like CUDA and CAL,
 //      which detect only that vendor's GPUs
 //  - OpenCL, which can detect multiple types of GPUs,
 //      including nvidia/amd/intel as well as new types
 //      such as ARM integrated GPUs
 //
-//  These libraries sometimes crash,
-//  and we've been unable to trap these via signal and exception handlers.
-//  So we do GPU detection in a separate process (boinc --detect_gpus)
-//  This process writes an XML file "coproc_info.xml" containing
+// These libraries sometimes crash,
+// and we've been unable to trap these via signal and exception handlers.
+// So we do GPU detection in a separate process (boinc --detect_gpus)
+// This process writes an XML file "coproc_info.xml" containing
 //  - lists of GPUs detected via CUDA and CAL
 //  - lists of nvidia/amd/intel GPUs detected via OpenCL
 //  - a list of other GPUs detected via OpenCL
 //
-//  When the process finishes, the client parses the info file.
-//  Then for each vendor it "correlates" the GPUs, which includes:
+// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
+// down the more powerful GPU until all applications that used it exit.
+// Doing GPU detection in a second, short-lived process
+// saves battery life on these laptops.
+//
+// When the process finishes, the client parses the info file.
+// Then for each vendor it "correlates" the GPUs, which includes:
 //  - matching up the OpenCL and vendor-specific descriptions, if both exist
 //  - finding the most capable GPU, and seeing which other GPUs
 //      are similar to it in hardware and RAM.
 //      Other GPUs are not used.
 //  - copy these to the COPROCS structure
 //
-//  Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
-//  down the more powerful GPU until all applications which used
-//  them exit. Doing GPU detection in a second, short-lived process
-//  saves battery life on these laptops.
-//
-//  GPUs can also be explicitly described in cc_config.xml

+// GPUs can also be explicitly described in cc_config.xml

-// uncomment to do GPU detection in same process (for debugging)
+// comment out to do GPU detection in same process (for debugging)
 //
 #define USE_CHILD_PROCESS_TO_DETECT_GPUS 1

@ -90,6 +90,8 @@ void segv_handler(int) {
 }
 #endif

+// the following store GPU instances during initialization
+//
 vector<COPROC_ATI> ati_gpus;
 vector<COPROC_NVIDIA> nvidia_gpus;
 vector<COPROC_INTEL> intel_gpus;
@ -105,11 +107,16 @@ static char* client_path;
 static char client_dir[MAXPATHLEN];
    // current directory at start of client

+// find GPU instances, then correlate (merge) them
+//
 void COPROCS::get(
    bool use_all, vector<string>&descs, vector<string>&warnings,
    IGNORE_GPU_INSTANCE& ignore_gpu_instance
 ) {
 #if USE_CHILD_PROCESS_TO_DETECT_GPUS
+    // detect_gpus() can cause crashes even with try/catch,
+    // so do it in a separate process that writes a file
+    //
    int retval = 0;
    char buf[256];

@ -135,7 +142,10 @@ void COPROCS::get(
    correlate_gpus(use_all, descs, ignore_gpu_instance);
 }

-
+// populate the global vectors
+// ati_gpus, nvidia_gpus, intel_gpus,
+// nvidia_opencls, etc.
+//
 void COPROCS::detect_gpus(vector<string> &warnings) {
 #ifdef _WIN32
    try {
@ -194,7 +204,12 @@ void COPROCS::detect_gpus(vector<string> &warnings) {
 #endif
 }

-
+// for each GPU type, scan the GPUs we detected
+// (e.g. the vector nvidia_gpus).
+// Find the most capable one, and the ones equivalent to it.
+// Also correlate OpenCL GPUs with CUDA/CAL GPUs.
+// Then create a single COPROC (with appropriate count)
+//
 void COPROCS::correlate_gpus(
    bool use_all,
    vector<string> &descs,
@ -441,6 +456,9 @@ int COPROCS::write_coproc_info_file(vector<string> &warnings) {
    return 0;
 }

+// if we're using a separate process to find GPUs,
+// read its output file and create data structures
+//
 int COPROCS::read_coproc_info_file(vector<string> &warnings) {
    MIOFILE mf;
    int retval;
--- a/client/gpu_nvidia.cpp
+++ b/client/gpu_nvidia.cpp
@ -465,7 +465,10 @@ leave:
 #endif
 }

-
+// Find the most capable instance; copy to *this.
+// set is_used (USED, UNUSED, IGNORED) for each instance.
+// Don't use less-capable instances (unless use_all is set)
+//
 void COPROC_NVIDIA::correlate(
    bool use_all,    // if false, use only those equivalent to most capable
    vector<int>& ignore_devs
--- a/lib/app_ipc.cpp
+++ b/lib/app_ipc.cpp
@ -269,7 +269,7 @@ void APP_INIT_DATA::clear() {
    host_total_credit = 0;
    host_expavg_credit = 0;
    resource_share_fraction = 0;
-    host_info.clear();
+    host_info.clear_host_info();
    proxy_info.clear();
    global_prefs.defaults();
    starting_elapsed_time = 0;
--- a/lib/coproc.h
+++ b/lib/coproc.h
@ -304,7 +304,7 @@ struct COPROC_NVIDIA : public COPROC {
 #ifndef _USING_FCGI_
    void write_xml(MIOFILE&, bool scheduler_rpc);
 #endif
-    COPROC_NVIDIA(): COPROC() {}
+    COPROC_NVIDIA(): COPROC() {clear();}
    void get(std::vector<std::string>& warnings);
    void correlate(
        bool use_all,
@ -339,7 +339,7 @@ struct COPROC_ATI : public COPROC {
 #ifndef _USING_FCGI_
    void write_xml(MIOFILE&, bool scheduler_rpc);
 #endif
-    COPROC_ATI(): COPROC() {}
+    COPROC_ATI(): COPROC() {clear();}
    void get(std::vector<std::string>& warnings);
    void correlate(
        bool use_all,
@ -361,7 +361,7 @@ struct COPROC_INTEL : public COPROC {
 #ifndef _USING_FCGI_
    void write_xml(MIOFILE&, bool scheduler_rpc);
 #endif
-    COPROC_INTEL(): COPROC() {}
+    COPROC_INTEL(): COPROC() {clear();}
    void get(std::vector<std::string>& warnings);
    void correlate(
        bool use_all,
--- a/lib/gui_rpc_client_ops.cpp
+++ b/lib/gui_rpc_client_ops.cpp
@ -1052,7 +1052,7 @@ void CC_STATE::clear() {
    results.clear();
    platforms.clear();
    executing_as_daemon = false;
-    host_info.clear();
+    host_info.clear_host_info();
    have_nvidia = false;
    have_ati = false;
 }
--- a/lib/hostinfo.cpp
+++ b/lib/hostinfo.cpp
@ -39,8 +39,54 @@

 #include "hostinfo.h"

+HOST_INFO::HOST_INFO() {
+    clear_host_info();  // start with zeroed/empty fields; this call does not touch coprocs
+}
+
+// Reset the host description fields to zero/false/"". This must NOT clear
+// coprocs: initialization logic assumes it survives (clearing it lost GPU info).
+//
+void HOST_INFO::clear_host_info() {
+    timezone = 0;
+    safe_strcpy(domain_name, "");
+    safe_strcpy(serialnum, "");
+    safe_strcpy(ip_addr, "");
+    safe_strcpy(host_cpid, "");
+
+    p_ncpus = 0;
+    safe_strcpy(p_vendor, "");
+    safe_strcpy(p_model, "");
+    safe_strcpy(p_features, "");
+    p_fpops = 0;
+    p_iops = 0;
+    p_membw = 0;
+    p_calculated = 0;
+    p_vm_extensions_disabled = false;
+
+    m_nbytes = 0;
+    m_cache = 0;
+    m_swap = 0;
+
+    d_total = 0;
+    d_free = 0;
+
+    safe_strcpy(os_name, "");
+    safe_strcpy(os_version, "");
+
+    wsl_available = false;
+#ifdef _WIN64
+    wsls.clear();   // only compiled when _WIN64 is defined
+#endif
+
+    safe_strcpy(product_name, "");
+    safe_strcpy(mac_address, "");
+
+    safe_strcpy(virtualbox_version, "");
+    num_opencl_cpu_platforms = 0;   // only the count is reset; opencl_cpu_prop[] is not touched here
+}
+
 int HOST_INFO::parse(XML_PARSER& xp, bool static_items_only) {
-    clear();
+    clear_host_info();
    while (!xp.get_tag()) {
        if (xp.match_tag("/host_info")) return 0;
        if (xp.parse_double("p_fpops", p_fpops)) {
--- a/lib/hostinfo.h
+++ b/lib/hostinfo.h
@ -95,12 +95,9 @@ public:
    int num_opencl_cpu_platforms;
    OPENCL_CPU_PROP opencl_cpu_prop[MAX_OPENCL_CPU_PLATFORMS];

-    HOST_INFO(int){}
-    HOST_INFO(){}
-    void clear() {
-        static const HOST_INFO x(0);
-        *this = x;
-    }
+    void clear_host_info();
+    HOST_INFO();
+
    int parse(XML_PARSER&, bool static_items_only = false);
    int write(MIOFILE&, bool include_net_info, bool include_coprocs);
    int parse_cpu_benchmarks(FILE*);