From 90f863f08ca86ad20af2e66e6ab06f26123afd58 Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Tue, 21 Apr 2009 08:11:28 +0000
Subject: [PATCH] - partial checkin so I can edit locally (bad network
 connection)

svn path=/trunk/boinc/; revision=17852
---
 checkin_notes  |  6 ++++++
 lib/coproc.cpp | 24 ++++++++++++++++++------
 lib/coproc.h   | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 6 deletions(-)
diff --git a/checkin_notes b/checkin_notes
index 226c1bae6e..edddc7e078 100644
--- a/checkin_notes
+++ b/checkin_notes
@@ -4025,3 +4025,9 @@ David  20 Apr 2009
 
     sched/
         db_purge.cpp
+
+David  20 Apr 2009
+    - partial checkin so I can edit locally (bad network connection)
+
+    lib/
+        coproc.cpp,h
diff --git a/lib/coproc.cpp b/lib/coproc.cpp
index d65f1d0666..ecb8453874 100644
--- a/lib/coproc.cpp
+++ b/lib/coproc.cpp
@@ -142,6 +142,24 @@ COPROC* COPROCS::lookup(const char* type) {
 
 #endif
 
+// Return 1/-1/0 if c1 is more/less/equally capable than c2, comparing (in order)
+// compute capability (major, minor), driver version, global memory, est. FLOPS.
+int cuda_compare(COPROC_CUDA& c1, COPROC_CUDA& c2) {
+    if (c1.prop.major > c2.prop.major) return 1;
+    if (c1.prop.major < c2.prop.major) return -1;
+    if (c1.prop.minor > c2.prop.minor) return 1;
+    if (c1.prop.minor < c2.prop.minor) return -1;
+    if (c1.drvVersion > c2.drvVersion) return 1;
+    if (c1.drvVersion < c2.drvVersion) return -1;
+    if (c1.prop.dtotalGlobalMem > c2.prop.dtotalGlobalMem) return 1;
+    if (c1.prop.dtotalGlobalMem < c2.prop.dtotalGlobalMem) return -1;
+    double s1 = c1.flops_estimate();
+    double s2 = c2.flops_estimate();    // must be c2: using c1 twice made this tie-break dead code
+    if (s1 > s2) return 1;
+    if (s1 < s2) return -1;
+    return 0;
+}
+
 void COPROC_CUDA::get(COPROCS& coprocs, vector<string>& strings) {
     int count;
 
@@ -220,12 +238,6 @@ void COPROC_CUDA::get(COPROCS& coprocs, vector<string>& strings) {
     }
 #endif
 
-    // NOTE: our design is slightly flawed:
-    // there's no provision for having two coprocs of type CUDA.
-    // So on systems with two GPUs of different hardware types
-    // we have to count them as two of the same type.
-    // Pick the fastest.
-    //
     (*__cudaGetDeviceCount)(&count);
     int real_count = 0;
     COPROC_CUDA cc, cc2;
diff --git a/lib/coproc.h b/lib/coproc.h
index 1c9f582bac..d651a5f4da 100644
--- a/lib/coproc.h
+++ b/lib/coproc.h
@@ -15,6 +15,50 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
 
+// Structures representing coprocessors (e.g. GPUs);
+// used in both client and server.
+//
+// Notes:
+//
+// 1) The use of "CUDA" is misleading; it really means "NVIDIA GPU".
+// 2) The design treats each resource type as a pool of identical devices;
+//  for example, there is a single "CUDA long-term debt" per project,
+//  and a scheduler request contains a request (#instances, instance-seconds)
+//  for CUDA jobs.
+//  In reality, the instances of a resource type can have different properties:
+//  In the case of CUDA, "compute capability", driver version, RAM, speed, etc.
+//  How to resolve this discrepancy?
+//
+//  Prior to 21 Apr 09 we identified the fastest instance
+//  and pretended that the others were identical to it.
+//  This approach has a serious flaw:
+//  suppose that the fastest instance has characteristics
+//  (version, RAM etc.) that satisfy the project's requirements,
+//  but other instances do not.
+//  Then BOINC executes jobs on GPUs that can't handle them,
+//  the jobs fail, the host is punished, etc.
+//
+//  We could treat each GPU as a separate resource,
+//  with its own set of debts, backoffs, etc.
+//  However, this would imply tying jobs to instances,
+//  which is undesirable from a scheduling viewpoint.
+//  It would also be a big code change in both client and server.
+//
+//  Instead, (as of 21 Apr 09) our approach is to identify a
+//  "most capable" instance, which in the case of CUDA is based on
+//  a) compute capability
+//  b) driver version
+//  c) RAM size
+//  d) est. FLOPS
+//  (in decreasing priority).
+//  We ignore and don't use any instances that are less capable
+//  on any of these axes.
+//
+//  This design avoids running coprocessor apps on instances
+//  that are incapable of handling them, and it involves no server changes.
+//  Its drawback is that, on systems with multiple and differing GPUs,
+//  it may not use some GPUs that actually could be used.
+
 #ifndef _COPROC_
 #define _COPROC_
 
@@ -46,6 +90,11 @@ struct COPROC {
     //
     void* owner[MAX_COPROC_INSTANCES];
 
+    // the device number of each instance
+    // These are not sequential if we omit instances (see above)
+    //
+    int device_num[MAX_COPROC_INSTANCES];
+
 #ifndef _USING_FCGI_
     virtual void write_xml(MIOFILE&);
 #endif