// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

// Structures representing coprocessors (e.g. GPUs);
// used in both client and server.
//
// Notes:
//
// 1) The use of "CUDA" is misleading; it really means "NVIDIA GPU".
// 2) The design treats each resource type as a pool of identical devices;
//  for example, there is a single "CUDA long-term debt" per project,
//  and a scheduler request contains a request (#instances, instance-seconds)
//  for CUDA jobs.
//  In reality, the instances of a resource type can have different properties:
//  In the case of CUDA, "compute capability", driver version, RAM, speed, etc.
//  How to resolve this discrepancy?
//
//  Prior to 21 Apr 09 we identified the fastest instance
//  and pretended that the others were identical to it.
//  This approach has a serious flaw:
//  suppose that the fastest instance has characteristics
//  (version, RAM etc.) that satisfy the project's requirements,
//  but other instances do not.
//  Then BOINC executes jobs on GPUs that can't handle them,
//  the jobs fail, the host is punished, etc.
//
//  We could treat each GPU as a separate resource,
//  with its own set of debts, backoffs, etc.
//  However, this would imply tying jobs to instances,
//  which is undesirable from a scheduling viewpoint.
//  It would also be a big code change in both client and server.
//
//  Instead, (as of 21 Apr 09) our approach is to identify a
//  "most capable" instance, which in the case of CUDA is based on
//  a) compute capability
//  b) driver version
//  c) RAM size
//  d) est. FLOPS
//  (in decreasing priority).
//  We ignore and don't use any instances that are less capable
//  on any of these axes.
//
//  This design avoids running coprocessor apps on instances
//  that are incapable of handling them, and it involves no server changes.
//  Its drawback is that, on systems with multiple and differing GPUs,
//  it may not use some GPUs that actually could be used.

#ifndef _COPROC_
#define _COPROC_

// NOTE(review): the header names of the next three #include directives are
// missing in this copy of the file (angle-bracket text appears to have been
// stripped); restore them from upstream before building.
#include
#include
#include

#ifdef _USING_FCGI_
#include "boinc_fcgi.h"
#endif

#include "miofile.h"
#include "error_numbers.h"
#include "parse.h"
#include "cal_boinc.h"
#include "cl_boinc.h"

// When nonzero, COPROC additionally carries the per-instance
// available_ram_temp[] array (see the #if inside COPROC).
#define DEFER_ON_GPU_AVAIL_RAM 0

// Length of the per-instance arrays in COPROC.
#define MAX_COPROC_INSTANCES 64
#define MAX_RSC 8 // max # of processing resources types
#define MAX_OPENCL_PLATFORMS 16

// Type strings handed to the COPROC constructor by
// COPROC_NVIDIA and COPROC_ATI respectively.
#define GPU_TYPE_NVIDIA "NVIDIA"
#define GPU_TYPE_ATI "ATI"

// Value type of the is_used members ("temp used in scan process")
// in OPENCL_DEVICE_PROP, COPROC_NVIDIA and COPROC_ATI.
enum COPROC_USAGE {
    COPROC_IGNORED,
    COPROC_UNUSED,
    COPROC_USED
};

// represents a requirement for a coproc.
// This is a parsed version of XML elements (used in client only).
// NOTE(review): the element names in the original comment were lost in this
// copy ("... of the elements in an ..."); presumably XML tag names - confirm
// against upstream.
//
struct COPROC_REQ {
    char type[256];     // must be unique
    double count;
    int parse(XML_PARSER&);
};

// there's some duplication between the values in
// the OPENCL_DEVICE_PROP struct and the NVIDIA/ATI structs
//
struct OPENCL_DEVICE_PROP {
    cl_device_id device_id;
    char name[256];                     // Device name
    char vendor[256];                   // Device vendor (NVIDIA, ATI, AMD, etc.)
    cl_uint vendor_id;                  // OpenCL ID of device vendor
    cl_bool available;                  // Is this device available?
    cl_device_fp_config half_fp_config; // Half precision capabilities
    cl_device_fp_config single_fp_config;   // Single precision
    cl_device_fp_config double_fp_config;   // Double precision
    cl_bool endian_little;              // TRUE if little-endian
    cl_device_exec_capabilities execution_capabilities;
    char extensions[1024];              // List of device extensions
    cl_ulong global_mem_size;           // in bytes
    cl_ulong local_mem_size;
    cl_uint max_clock_frequency;        // in MHz
    cl_uint max_compute_units;
    char opencl_platform_version[64];   // Version of OpenCL supported by
                                        // the device's platform
    char opencl_device_version[64];     // OpenCL version supported by device;
                                        // example: "OpenCL 1.1 beta"
    int opencl_device_version_int;      // same, encoded as e.g. 101
    int get_device_version_int();       // call this to encode
    char opencl_driver_version[32];     // For example: "CLH 1.0"
    int device_num;                     // temp used in scan process
    double peak_flops;                  // temp used in scan process
    COPROC_USAGE is_used;               // temp used in scan process
    double opencl_available_ram;        // temp used in scan process
    int opencl_device_index;            // temp used in scan process

    // XML output is compiled out when _USING_FCGI_ is defined.
#ifndef _USING_FCGI_
    void write_xml(MIOFILE&);
#endif
    int parse(XML_PARSER&);
    void description(char* buf, const char* type);
};

// represents a set of identical coprocessors on a particular computer.
// Abstract class;
// objects will always be a derived class (COPROC_CUDA, COPROC_ATI)
// (the derived classes declared in this file are named COPROC_NVIDIA
// and COPROC_ATI).
// Used in both client and server.
//
struct COPROC {
    char type[256];     // must be unique
    int count;          // how many are present
    double peak_flops;
    double used;        // how many are in use (used by client)
    bool have_cuda;     // True if this GPU supports CUDA on this computer
    bool have_cal;      // True if this GPU supports CAL on this computer
    bool have_opencl;   // True if this GPU supports openCL on this computer
    double available_ram;
    bool specified_in_config;
        // If true, this coproc was listed in cc_config.xml
        // rather than being detected by the client.

    // the following are used in both client and server for work-fetch info
    //
    double req_secs;
        // how many instance-seconds of work requested
    double req_instances;
        // client is requesting enough jobs to use this many instances
    double estimated_delay;
        // resource will be saturated for this long

    // temps used in client (enforce_schedule())
    // to keep track of what fraction of each instance is in use
    // during instance assignment
    //
    double usage[MAX_COPROC_INSTANCES];
    double pending_usage[MAX_COPROC_INSTANCES];

    // the device number of each instance
    // These are not sequential if we omit instances (see above)
    //
    int device_nums[MAX_COPROC_INSTANCES];
    int device_num;     // temp used in scan process
    cl_device_id opencl_device_ids[MAX_COPROC_INSTANCES];
    int opencl_device_count;
    int opencl_device_indexes[MAX_COPROC_INSTANCES];
    bool running_graphics_app[MAX_COPROC_INSTANCES];
        // is this GPU running a graphics app (NVIDIA only)
#if DEFER_ON_GPU_AVAIL_RAM
    double available_ram_temp[MAX_COPROC_INSTANCES];
        // used during job scheduling
#endif

    double last_print_time;

    OPENCL_DEVICE_PROP opencl_prop;

    // XML output is compiled out when _USING_FCGI_ is defined.
#ifndef _USING_FCGI_
    void write_xml(MIOFILE&);
    void write_request(MIOFILE&);
#endif
    int parse(XML_PARSER&);

    // Resets the scalar members field by field.
    // NOTE(review): available_ram is assigned twice below (-1, then 0);
    // the later assignment wins - confirm which default is intended.
    inline void clear() {
        // can't just memcpy() - trashes vtable
        type[0] = 0;
        count = 0;
        peak_flops = 0;
        used = 0;
        have_cuda = false;
        have_cal = false;
        have_opencl = false;
        specified_in_config = false;
        available_ram = -1;
        req_secs = 0;
        req_instances = 0;
        opencl_device_count = 0;
        estimated_delay = 0;
        available_ram = 0;
        // NOTE(review): this copy of the file is damaged here. The text
        // between the loop condition below and the parameter list of a later
        // member declaration is missing (the rest of clear() and whatever
        // members followed it), and the std::vector element types are gone
        // too. The remaining tokens are kept verbatim and are not valid C++
        // as shown; restore the lost text from upstream.
        for (int i=0; i &opencls, std::vector& ignore_dev );
    void find_best_opencls(
        bool use_all,
        std::vector &opencls,
        std::vector& ignore_dev
    );
};

// based on cudaDeviceProp from /usr/local/cuda/include/driver_types.h
// doesn't have to match exactly since we get the attributes one at a time.
// struct CUDA_DEVICE_PROP { char name[256]; int deviceHandle; unsigned int totalGlobalMem; // not used on the server; dtotalGlobalMem is used instead // (since some boards have >= 4GB) int sharedMemPerBlock; int regsPerBlock; int warpSize; int memPitch; int maxThreadsPerBlock; int maxThreadsDim[3]; int maxGridSize[3]; int clockRate; int totalConstMem; int major; // compute capability int minor; int textureAlignment; int deviceOverlap; int multiProcessorCount; double dtotalGlobalMem; // not defined in client }; struct COPROC_NVIDIA : public COPROC { int cuda_version; // CUDA runtime version int display_driver_version; CUDA_DEVICE_PROP prop; COPROC_USAGE is_used; // temp used in scan process #ifndef _USING_FCGI_ void write_xml(MIOFILE&, bool include_request); #endif COPROC_NVIDIA(): COPROC(GPU_TYPE_NVIDIA){} void get( bool use_all, std::vector&, std::vector&, std::vector& ignore_devs ); void description(char*); void clear(); int parse(XML_PARSER&); void get_available_ram(); void set_peak_flops(); bool check_running_graphics_app(); void fake(int driver_version, double ram, double avail_ram, int count); }; // encode a 3-part version as // 10000000*major + 10000*minor + release // Note: ATI release #s can exceed 1000 // inline int ati_version_int(int major, int minor, int release) { return major*10000000 + minor*10000 + release; } struct COPROC_ATI : public COPROC { char name[256]; char version[50]; int version_num; // CAL version (not driver version) encoded as an int bool atirt_detected; bool amdrt_detected; CALdeviceattribs attribs; CALdeviceinfo info; COPROC_USAGE is_used; // temp used in scan process #ifndef _USING_FCGI_ void write_xml(MIOFILE&, bool include_request); #endif COPROC_ATI(): COPROC(GPU_TYPE_ATI){} void get( bool use_all, std::vector&, std::vector&, std::vector& ignore_devs ); void description(char*); void clear(); int parse(XML_PARSER&); void get_available_ram(); void set_peak_flops(); void fake(double ram, double avail_ram, int); }; struct COPROCS { 
int n_rsc; COPROC coprocs[MAX_RSC]; COPROC_NVIDIA nvidia; COPROC_ATI ati; void write_xml(MIOFILE& out, bool include_request); void get( bool use_all, std::vector &descs, std::vector &warnings, std::vector& ignore_nvidia_dev, std::vector& ignore_ati_dev ); void get_opencl( bool use_all, std::vector& descs, std::vector &warnings, std::vector& ignore_nvidia_dev, std::vector& ignore_ati_dev ); cl_int get_opencl_info( OPENCL_DEVICE_PROP& prop, cl_uint device_index, std::vector& warnings ); int parse(XML_PARSER&); void summary_string(char* buf, int len); // Copy a coproc set, possibly setting usage to zero. // used in round-robin simulator and CPU scheduler, // to avoid messing w/ master copy // void clone(COPROCS& c, bool copy_used) { n_rsc = c.n_rsc; for (int i=0; i 0); } inline bool have_ati() { return (ati.count > 0); } int add(COPROC& c) { if (n_rsc >= MAX_RSC) return ERR_BUFFER_OVERFLOW; for (int i=1; i