- partial checkin so I can edit locally (bad network connection)

svn path=/trunk/boinc/; revision=17852
2009-04-21 08:11:28 +00:00 · 2009-04-21 08:11:28 +00:00 · 90f863f08c
parent 5c23fb9f2a
commit 90f863f08c
3 changed files with 73 additions and 6 deletions
--- a/6
+++ b/6
@ -4025,3 +4025,9 @@ David  20 Apr 2009

    sched/
        db_purge.cpp
+
+David  20 Apr 2009
+    - partial checkin so I can edit locally (bad network connection)
+
+    lib/
+        coproc.cpp,h
--- a/lib/coproc.cpp
+++ b/lib/coproc.cpp
@ -142,6 +142,24 @@ COPROC* COPROCS::lookup(const char* type) {

 #endif

+// Compare the capability of two CUDA devices.
+// Return 1/-1/0 if device 1 is more/less/same capable than device 2.
+//
+// The criteria are applied in decreasing priority;
+// the first one on which the devices differ decides the result:
+// a) prop.major, then prop.minor (compute capability)
+// b) drvVersion (driver version)
+// c) prop.dtotalGlobalMem (RAM size)
+// d) flops_estimate() (est. FLOPS)
+//
+int cuda_compare(COPROC_CUDA& c1, COPROC_CUDA& c2) {
+    if (c1.prop.major > c2.prop.major) return 1;
+    if (c1.prop.major < c2.prop.major) return -1;
+    if (c1.prop.minor > c2.prop.minor) return 1;
+    if (c1.prop.minor < c2.prop.minor) return -1;
+    if (c1.drvVersion > c2.drvVersion) return 1;
+    if (c1.drvVersion < c2.drvVersion) return -1;
+    if (c1.prop.dtotalGlobalMem > c2.prop.dtotalGlobalMem) return 1;
+    if (c1.prop.dtotalGlobalMem < c2.prop.dtotalGlobalMem) return -1;
+
+    // Final tie-breaker: estimated speed.
+    // Each estimate must come from its own device;
+    // otherwise the two values are always equal and this step is a no-op.
+    //
+    double s1 = c1.flops_estimate();
+    double s2 = c2.flops_estimate();
+    if (s1 > s2) return 1;
+    if (s1 < s2) return -1;
+    return 0;
+}
+
 void COPROC_CUDA::get(COPROCS& coprocs, vector<string>& strings) {
    int count;

@ -220,12 +238,6 @@ void COPROC_CUDA::get(COPROCS& coprocs, vector<string>& strings) {
    }
 #endif

-    // NOTE: our design is slightly flawed:
-    // there's no provision for having two coprocs of type CUDA.
-    // So on systems with two GPUs of different hardware types
-    // we have to count them as two of the same type.
-    // Pick the fastest.
-    //
    (*__cudaGetDeviceCount)(&count);
    int real_count = 0;
    COPROC_CUDA cc, cc2;
--- a/lib/coproc.h
+++ b/lib/coproc.h
@ -15,6 +15,50 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

+// Structures representing coprocessors (e.g. GPUs);
+// used in both client and server.
+//
+// Notes:
+//
+// 1) The use of "CUDA" is misleading; it really means "NVIDIA GPU".
+// 2) The design treats each resource type as a pool of identical devices;
+//  for example, there is a single "CUDA long-term debt" per project,
+//  and a scheduler request contains a request (#instances, instance-seconds)
+//  for CUDA jobs.
+//  In reality, the instances of a resource type can have different properties:
+//  In the case of CUDA, "compute capability", driver version, RAM, speed, etc.
+//  How to resolve this discrepancy?
+//
+//  Prior to 21 Apr 09 we identified the fastest instance
+//  and pretended that the others were identical to it.
+//  This approach has a serious flaw:
+//  suppose that the fastest instance has characteristics
+//  (version, RAM etc.) that satisfy the project's requirements,
+//  but other instances do not.
+//  Then BOINC executes jobs on GPUs that can't handle them,
+//  the jobs fail, the host is punished, etc.
+//
+//  We could treat each GPU as a separate resource,
+//  with its own set of debts, backoffs, etc.
+//  However, this would imply tying jobs to instances,
+//  which is undesirable from a scheduling viewpoint.
+//  It would also be a big code change in both client and server.
+//
+//  Instead, (as of 21 Apr 09) our approach is to identify a
+//  "most capable" instance, which in the case of CUDA is based on
+//  a) compute capability
+//  b) driver version
+//  c) RAM size
+//  d) est. FLOPS
+//  (in decreasing priority).
+//  We ignore and don't use any instances that are less capable
+//  on any of these axes.
+//
+//  This design avoids running coprocessor apps on instances
+//  that are incapable of handling them, and it involves no server changes.
+//  Its drawback is that, on systems with multiple and differing GPUs,
+//  it may not use some GPUs that actually could be used.
+
 #ifndef _COPROC_
 #define _COPROC_

@ -46,6 +90,11 @@ struct COPROC {
    //
    void* owner[MAX_COPROC_INSTANCES];

+    // the device number of each instance
+    // These are not sequential if we omit instances (see above)
+    //
+    int device_num[MAX_COPROC_INSTANCES];
+
 #ifndef _USING_FCGI_
    virtual void write_xml(MIOFILE&);
 #endif