- partial checkin so I can edit locally (bad network connection)

svn path=/trunk/boinc/; revision=17852
This commit is contained in:
David Anderson 2009-04-21 08:11:28 +00:00
parent 5c23fb9f2a
commit 90f863f08c
3 changed files with 73 additions and 6 deletions

View File

@ -4025,3 +4025,9 @@ David 20 Apr 2009
sched/
db_purge.cpp
David 20 Apr 2009
- partial checkin so I can edit locally (bad network connection)
lib/
coproc.cpp,h

View File

@ -142,6 +142,24 @@ COPROC* COPROCS::lookup(const char* type) {
#endif
// Compare the capability of two CUDA devices.
//
// Returns 1 if c1 is more capable than c2, -1 if it is less capable,
// and 0 if the two are equivalent on every criterion.
//
// Criteria are checked in decreasing priority; a later criterion is
// consulted only when all earlier ones are tied:
//   1) compute capability (prop.major, then prop.minor)
//   2) driver version (drvVersion)
//   3) total global memory (prop.dtotalGlobalMem)
//   4) estimated FLOPS (flops_estimate())
//
int cuda_compare(COPROC_CUDA& c1, COPROC_CUDA& c2) {
    // compute capability: major version dominates minor
    if (c1.prop.major > c2.prop.major) return 1;
    if (c1.prop.major < c2.prop.major) return -1;
    if (c1.prop.minor > c2.prop.minor) return 1;
    if (c1.prop.minor < c2.prop.minor) return -1;

    // driver version
    if (c1.drvVersion > c2.drvVersion) return 1;
    if (c1.drvVersion < c2.drvVersion) return -1;

    // RAM size
    if (c1.prop.dtotalGlobalMem > c2.prop.dtotalGlobalMem) return 1;
    if (c1.prop.dtotalGlobalMem < c2.prop.dtotalGlobalMem) return -1;

    // estimated speed: each estimate must come from its own device,
    // otherwise this tie-breaker can never distinguish the two
    double s1 = c1.flops_estimate();
    double s2 = c2.flops_estimate();
    if (s1 > s2) return 1;
    if (s1 < s2) return -1;
    return 0;
}
void COPROC_CUDA::get(COPROCS& coprocs, vector<string>& strings) {
int count;
@ -220,12 +238,6 @@ void COPROC_CUDA::get(COPROCS& coprocs, vector<string>& strings) {
}
#endif
// NOTE: our design is slightly flawed:
// there's no provision for having two coprocs of type CUDA.
// So on systems with two GPUs of different hardware types
// we have to count them as two of the same type.
// Pick the fastest.
//
(*__cudaGetDeviceCount)(&count);
int real_count = 0;
COPROC_CUDA cc, cc2;

View File

@ -15,6 +15,50 @@
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
// Structures representing coprocessors (e.g. GPUs);
// used in both client and server.
//
// Notes:
//
// 1) The use of "CUDA" is misleading; it really means "NVIDIA GPU".
// 2) The design treats each resource type as a pool of identical devices;
// for example, there is a single "CUDA long-term debt" per project,
// and a scheduler request contains a request (#instances, instance-seconds)
// for CUDA jobs.
// In reality, the instances of a resource type can have different properties:
// In the case of CUDA, "compute capability", driver version, RAM, speed, etc.
// How to resolve this discrepancy?
//
// Prior to 21 Apr 09 we identified the fastest instance
// and pretended that the others were identical to it.
// This approach has a serious flaw:
// suppose that the fastest instance has characteristics
// (version, RAM etc.) that satisfy the project's requirements,
// but other instances do not.
// Then BOINC executes jobs on GPUs that can't handle them,
// the jobs fail, the host is punished, etc.
//
// We could treat each GPU as a separate resource,
// with its own set of debts, backoffs, etc.
// However, this would imply tying jobs to instances,
// which is undesirable from a scheduling viewpoint.
// It would also be a big code change in both client and server.
//
// Instead, (as of 21 Apr 09) our approach is to identify a
// "most capable" instance, which in the case of CUDA is based on
// a) compute capability
// b) driver version
// c) RAM size
// d) est. FLOPS
// (in decreasing priority).
// We ignore and don't use any instances that are less capable
// on any of these axes.
//
// This design avoids running coprocessor apps on instances
// that are incapable of handling them, and it involves no server changes.
// Its drawback is that, on systems with multiple and differing GPUs,
// it may not use some GPUs that actually could be used.
#ifndef _COPROC_
#define _COPROC_
@ -46,6 +90,11 @@ struct COPROC {
//
void* owner[MAX_COPROC_INSTANCES];
// the device number of each instance
// These are not sequential if we omit instances (see above)
//
int device_num[MAX_COPROC_INSTANCES];
#ifndef _USING_FCGI_
virtual void write_xml(MIOFILE&);
#endif