From 758ff3e13aaeb1424006848cca7aadac4f5b7c82 Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Tue, 28 Jan 2020 14:22:28 -0800
Subject: [PATCH] client: fix bug in GPU detection

PR #3364 changed the way we clear structures.
This introduced a bug: HOST_INFO::clear_host_info()
intentionally didn't clear HOST_INFO::coprocs.
But it was replaced with HOST_INFO::clear(), which did.
This caused the client to lose GPU info.

Fix: restore HOST_INFO::clear_host_info(), and add a comment
to avoid future errors like this.

Also add some comments in GPU detection,
which is woefully lacking in them.
---
 client/gpu_detect.cpp      | 52 +++++++++++++++++++++++++-------------
 client/gpu_nvidia.cpp      |  5 +++-
 lib/app_ipc.cpp            |  2 +-
 lib/coproc.h               |  6 ++---
 lib/gui_rpc_client_ops.cpp |  2 +-
 lib/hostinfo.cpp           | 48 ++++++++++++++++++++++++++++++++++-
 lib/hostinfo.h             |  9 +++----
 7 files changed, 94 insertions(+), 30 deletions(-)
diff --git a/client/gpu_detect.cpp b/client/gpu_detect.cpp
index 01b8870a7a..15650a2030 100644
--- a/client/gpu_detect.cpp
+++ b/client/gpu_detect.cpp
@@ -18,39 +18,39 @@
 
 // client-specific GPU code.  Mostly GPU detection
 //
-//  theory of operation:
-//  there are two ways of detecting GPUs:
+// theory of operation:
+// there are two ways of detecting GPUs:
 //  - vendor-specific libraries like CUDA and CAL,
 //      which detect only that vendor's GPUs
 //  - OpenCL, which can detect multiple types of GPUs,
 //      including nvidia/amd/intel as well was new types
 //      such as ARM integrated GPUs
 //
-//  These libraries sometimes crash,
-//  and we've been unable to trap these via signal and exception handlers.
-//  So we do GPU detection in a separate process (boinc --detect_gpus)
-//  This process writes an XML file "coproc_info.xml" containing
+// These libraries sometimes crash,
+// and we've been unable to trap these via signal and exception handlers.
+// So we do GPU detection in a separate process (boinc --detect_gpus)
+// This process writes an XML file "coproc_info.xml" containing
 //  - lists of GPU detected via CUDA and CAL
 //  - lists of nvidia/amd/intel GPUs detected via OpenCL
 //  - a list of other GPUs detected via OpenCL
 //
-//  When the process finishes, the client parses the info file.
-//  Then for each vendor it "correlates" the GPUs, which includes:
+// Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
+// down the more powerful GPU until all applications which used them exit.
+// Doing GPU detection in a second, short-lived process
+// saves battery life on these laptops.
+//
+// When the process finishes, the client parses the info file.
+// Then for each vendor it "correlates" the GPUs, which includes:
 //  - matching up the OpenCL and vendor-specific descriptions, if both exist
 //  - finding the most capable GPU, and seeing which other GPUs
 //      are similar to it in hardware and RAM.
 //      Other GPUs are not used.
 //  - copy these to the COPROCS structure
 //
-//  Also, some dual-GPU laptops (e.g., Macbook Pro) don't power
-//  down the more powerful GPU until all applications which used
-//  them exit. Doing GPU detection in a second, short-lived process
-//  saves battery life on these laptops.
-//
-//  GPUs can also be explicitly described in cc_config.xml
 
+// GPUs can also be explicitly described in cc_config.xml
 
-// uncomment to do GPU detection in same process (for debugging)
+// comment out to do GPU detection in same process (for debugging)
 //
 #define USE_CHILD_PROCESS_TO_DETECT_GPUS 1
 
@@ -90,6 +90,8 @@ void segv_handler(int) {
 }
 #endif
 
+// the following store GPU instances during initialization
+//
 vector<COPROC_ATI> ati_gpus;
 vector<COPROC_NVIDIA> nvidia_gpus;
 vector<COPROC_INTEL> intel_gpus;
@@ -105,11 +107,16 @@ static char* client_path;
 static char client_dir[MAXPATHLEN];
     // current directory at start of client
 
+// find GPU instances, then correlate (merge) them
+//
 void COPROCS::get(
     bool use_all, vector<string>&descs, vector<string>&warnings,
     IGNORE_GPU_INSTANCE& ignore_gpu_instance
 ) {
 #if USE_CHILD_PROCESS_TO_DETECT_GPUS
+    // detect_gpus() can cause crashes even with try/catch,
+    // so do it in a separate process that writes a file
+    //
     int retval = 0;
     char buf[256];
 
@@ -135,7 +142,10 @@ void COPROCS::get(
     correlate_gpus(use_all, descs, ignore_gpu_instance);
 }
 
-
+// populate the global vectors
+// ati_gpus, nvidia_gpus, intel_gpus,
+// nvidia_opencls, etc.
+//
 void COPROCS::detect_gpus(vector<string> &warnings) {
 #ifdef _WIN32
     try {
@@ -194,7 +204,12 @@ void COPROCS::detect_gpus(vector<string> &warnings) {
 #endif
 }
 
-
+// for each GPU type, scan the GPUs we detected
+// (e.g. the vector nvidia_gpus).
+// Find the most capable one, and the ones equivalent to it.
+// Also correlate OpenCL GPUs with CUDA/CAL GPUs.
+// Then create a single COPROC (with appropriate count)
+//
 void COPROCS::correlate_gpus(
     bool use_all,
     vector<string> &descs,
@@ -441,6 +456,9 @@ int COPROCS::write_coproc_info_file(vector<string> &warnings) {
     return 0;
 }
 
+// if we're using a separate process to find GPUs,
+// read its output file and create data structures
+//
 int COPROCS::read_coproc_info_file(vector<string> &warnings) {
     MIOFILE mf;
     int retval;
diff --git a/client/gpu_nvidia.cpp b/client/gpu_nvidia.cpp
index defe9ecee3..5d6ddc2a01 100644
--- a/client/gpu_nvidia.cpp
+++ b/client/gpu_nvidia.cpp
@@ -465,7 +465,10 @@ leave:
 #endif
 }
 
-
+// Find the most capable instance; copy to *this.
+// set is_used (USED, UNUSED, IGNORED) for each instance.
+// Don't use less-capable instances (unless use_all is set)
+//
 void COPROC_NVIDIA::correlate(
     bool use_all,    // if false, use only those equivalent to most capable
     vector<int>& ignore_devs
diff --git a/lib/app_ipc.cpp b/lib/app_ipc.cpp
index 849d03d03a..1ef96a3e7c 100644
--- a/lib/app_ipc.cpp
+++ b/lib/app_ipc.cpp
@@ -269,7 +269,7 @@ void APP_INIT_DATA::clear() {
     host_total_credit = 0;
     host_expavg_credit = 0;
     resource_share_fraction = 0;
-    host_info.clear();
+    host_info.clear_host_info();
     proxy_info.clear();
     global_prefs.defaults();
     starting_elapsed_time = 0;
diff --git a/lib/coproc.h b/lib/coproc.h
index 7c0905469e..3d4746986b 100644
--- a/lib/coproc.h
+++ b/lib/coproc.h
@@ -304,7 +304,7 @@ struct COPROC_NVIDIA : public COPROC {
 #ifndef _USING_FCGI_
     void write_xml(MIOFILE&, bool scheduler_rpc);
 #endif
-    COPROC_NVIDIA(): COPROC() {}
+    COPROC_NVIDIA(): COPROC() {clear();}
     void get(std::vector<std::string>& warnings);
     void correlate(
         bool use_all,
@@ -339,7 +339,7 @@ struct COPROC_ATI : public COPROC {
 #ifndef _USING_FCGI_
     void write_xml(MIOFILE&, bool scheduler_rpc);
 #endif
-    COPROC_ATI(): COPROC() {}
+    COPROC_ATI(): COPROC() {clear();}
     void get(std::vector<std::string>& warnings);
     void correlate(
         bool use_all,
@@ -361,7 +361,7 @@ struct COPROC_INTEL : public COPROC {
 #ifndef _USING_FCGI_
     void write_xml(MIOFILE&, bool scheduler_rpc);
 #endif
-    COPROC_INTEL(): COPROC() {}
+    COPROC_INTEL(): COPROC() {clear();}
     void get(std::vector<std::string>& warnings);
     void correlate(
         bool use_all,
diff --git a/lib/gui_rpc_client_ops.cpp b/lib/gui_rpc_client_ops.cpp
index 546175294f..4d5c2572ff 100644
--- a/lib/gui_rpc_client_ops.cpp
+++ b/lib/gui_rpc_client_ops.cpp
@@ -1052,7 +1052,7 @@ void CC_STATE::clear() {
     results.clear();
     platforms.clear();
     executing_as_daemon = false;
-    host_info.clear();
+    host_info.clear_host_info();
     have_nvidia = false;
     have_ati = false;
 }
diff --git a/lib/hostinfo.cpp b/lib/hostinfo.cpp
index 1512b06282..6686d31711 100644
--- a/lib/hostinfo.cpp
+++ b/lib/hostinfo.cpp
@@ -39,8 +39,54 @@
 
 #include "hostinfo.h"
 
+HOST_INFO::HOST_INFO() {
+    clear_host_info();
+}
+
+// this must NOT clear coprocs
+// (initialization logic assumes that)
+//
+void HOST_INFO::clear_host_info() {
+    timezone = 0;
+    safe_strcpy(domain_name, "");
+    safe_strcpy(serialnum, "");
+    safe_strcpy(ip_addr, "");
+    safe_strcpy(host_cpid, "");
+
+    p_ncpus = 0;
+    safe_strcpy(p_vendor, "");
+    safe_strcpy(p_model, "");
+    safe_strcpy(p_features, "");
+    p_fpops = 0;
+    p_iops = 0;
+    p_membw = 0;
+    p_calculated = 0;
+    p_vm_extensions_disabled = false;
+
+    m_nbytes = 0;
+    m_cache = 0;
+    m_swap = 0;
+
+    d_total = 0;
+    d_free = 0;
+
+    safe_strcpy(os_name, "");
+    safe_strcpy(os_version, "");
+
+    wsl_available = false;
+#ifdef _WIN64
+    wsls.clear();
+#endif
+
+    safe_strcpy(product_name, "");
+    safe_strcpy(mac_address, "");
+
+    safe_strcpy(virtualbox_version, "");
+    num_opencl_cpu_platforms = 0;
+}
+
 int HOST_INFO::parse(XML_PARSER& xp, bool static_items_only) {
-    clear();
+    clear_host_info();
     while (!xp.get_tag()) {
         if (xp.match_tag("/host_info")) return 0;
         if (xp.parse_double("p_fpops", p_fpops)) {
diff --git a/lib/hostinfo.h b/lib/hostinfo.h
index c7134394fe..3bb65b061a 100644
--- a/lib/hostinfo.h
+++ b/lib/hostinfo.h
@@ -95,12 +95,9 @@ public:
     int num_opencl_cpu_platforms;
     OPENCL_CPU_PROP opencl_cpu_prop[MAX_OPENCL_CPU_PLATFORMS];
 
-    HOST_INFO(int){}
-    HOST_INFO(){}
-    void clear() {
-        static const HOST_INFO x(0);
-        *this = x;
-    }
+    void clear_host_info();
+    HOST_INFO();
+
     int parse(XML_PARSER&, bool static_items_only = false);
     int write(MIOFILE&, bool include_net_info, bool include_coprocs);
     int parse_cpu_benchmarks(FILE*);