// boinc/client/work_fetch.h

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
// Work fetch logic for CPU, GPU, and other processing resources.
// See http://boinc.berkeley.edu/trac/wiki/GpuWorkFetch

#ifndef _WORK_FETCH_
#define _WORK_FETCH_

#include <cstring>  // for memset()
#include <vector>
#include <deque>

#define RSC_TYPE_ANY -1
#define RSC_TYPE_CPU 0

// reasons for not fetching work
//
#define CANT_FETCH_WORK_NON_CPU_INTENSIVE 1
#define CANT_FETCH_WORK_SUSPENDED_VIA_GUI 2
#define CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING 3
#define CANT_FETCH_WORK_MIN_RPC_TIME 4
#define CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK 5
#define CANT_FETCH_WORK_DOWNLOAD_STALLED 6
#define CANT_FETCH_WORK_RESULT_SUSPENDED 7
#define CANT_FETCH_WORK_TOO_MANY_UPLOADS 8
#define CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY 9
#define CANT_FETCH_WORK_DONT_NEED 10
#define CANT_FETCH_WORK_TOO_MANY_RUNNABLE 11

inline const char* cant_fetch_work_string(int reason) {
    switch (reason) {
    case CANT_FETCH_WORK_NON_CPU_INTENSIVE:
        return "non CPU intensive";
    case CANT_FETCH_WORK_SUSPENDED_VIA_GUI:
        return "suspended via Manager";
    case CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING:
        return "master URL fetch pending";
    case CANT_FETCH_WORK_MIN_RPC_TIME:
        return "scheduler RPC backoff";
    case CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK:
        return "\"no new tasks\" requested via Manager";
    case CANT_FETCH_WORK_DOWNLOAD_STALLED:
        return "some download is stalled";
    case CANT_FETCH_WORK_RESULT_SUSPENDED:
        return "some task is suspended via Manager";
    case CANT_FETCH_WORK_TOO_MANY_UPLOADS:
        return "too many uploads in progress";
    case CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY:
        return "project is not highest priority";
    case CANT_FETCH_WORK_DONT_NEED:
        return "don't need";
    case CANT_FETCH_WORK_TOO_MANY_RUNNABLE:
        return "too many runnable tasks";
    }
    return "";
}
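
// Example (illustrative sketch, not from the original header): how the
// reason code might be turned into a user-visible message.  Assumes the
// client's msg_printf()/MSG_INFO logging facility and PROJECT::pwf;
// the actual logging lives in work_fetch.cpp.
//
//     if (p->pwf.cant_fetch_work_reason) {
//         msg_printf(p, MSG_INFO, "can't fetch work: %s",
//             cant_fetch_work_string(p->pwf.cant_fetch_work_reason)
//         );
//     }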

struct PROJECT;
struct RESULT;
struct ACTIVE_TASK;
struct RSC_WORK_FETCH;
struct SCHEDULER_REPLY;
struct APP_VERSION;

// state per (resource, project) pair
//
struct RSC_PROJECT_WORK_FETCH {
    // the following are persistent (saved in state file)
    double backoff_time;
    double backoff_interval;

    // the following used by REC accounting
    double secs_this_rec_interval;
    inline void reset_rec_accounting() {
        secs_this_rec_interval = 0;
    }

    double queue_est;
        // an estimate of instance-secs of queued work
    bool anon_skip;
        // set if this project is anonymous platform
        // and it has no app version that uses this resource
    double fetchable_share;
        // this project's share relative to projects from which
        // we could probably get work for this resource;
        // determines how many instances this project deserves
    int n_runnable_jobs;
    double sim_nused;
        // # of instances used at this point in the simulation
    double nused_total;     // sum of instances over all runnable jobs
    int ncoprocs_excluded;
        // number of excluded instances
    int non_excluded_instances;
        // bitmap of non-excluded instances
        // (i.e. instances this project's jobs can run on)
    int deadlines_missed;
    int deadlines_missed_copy;
        // copy of the above used during schedule_cpus()
    std::deque<RESULT*> pending;
    std::deque<RESULT*>::iterator pending_iter;

    RSC_PROJECT_WORK_FETCH() {
        backoff_time = 0;
        backoff_interval = 0;
        secs_this_rec_interval = 0;
        queue_est = 0;
        anon_skip = false;
        fetchable_share = 0;
        n_runnable_jobs = 0;
        sim_nused = 0;
        nused_total = 0;
        ncoprocs_excluded = 0;
        non_excluded_instances = 0;
        deadlines_missed = 0;
        deadlines_missed_copy = 0;
        may_have_work = false;
    }
    inline void reset() {
        backoff_time = 0;
        backoff_interval = 0;
    }
    bool may_have_work;
    bool compute_may_have_work(PROJECT*, int rsc_type);
    void resource_backoff(PROJECT*, const char*);
    void rr_init(PROJECT*, int rsc_type);
    void clear_backoff() {
        backoff_time = 0;
        backoff_interval = 0;
    }
};
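
// Illustrative sketch (assumption, not from this header): how the backoff
// fields are typically used.  backoff_time is an absolute time before which
// the client won't ask this project for work of this resource type;
// backoff_interval is the length of the current backoff period, grown by
// resource_backoff() when a request yields no work and cleared by
// clear_backoff() when work arrives.  Assumes the client's global
// gstate.now clock; the exact policy is in work_fetch.cpp.
//
//     RSC_PROJECT_WORK_FETCH& rpwf = rsc_work_fetch[rsc_type].project_state(p);
//     if (gstate.now < rpwf.backoff_time) {
//         // still backed off; skip this (project, resource) pair
//     }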

// estimate the time a resource will be saturated
// with high-priority jobs.
//
struct BUSY_TIME_ESTIMATOR {
    std::vector<double> busy_time;
    int ninstances;

    inline void reset() {
        for (int i=0; i<ninstances; i++) {
            busy_time[i] = 0;
        }
    }
    inline void init(int n) {
        ninstances = n;
        busy_time.resize(n);
        reset();
    }

    // called for each high-priority job.
    // Find the least-busy instance, and put this job
    // on that and following instances
    //
    inline void update(double dur, double nused) {
        if (ninstances==0) return;
        int i, j;
        if (nused < 1) return;
        double best = 0;
        int ibest = 0;
        for (i=0; i<ninstances; i++) {
            if (!i || busy_time[i] < best) {
                best = busy_time[i];
                ibest = i;
            }
        }
        int inused = (int) nused;   // ignore fractional usage
        for (i=0; i<inused; i++) {
            j = (ibest + i) % ninstances;
            busy_time[j] += dur;
        }
    }

    // the overall busy time is the busy time of
    // the least busy instance
    //
    inline double get_busy_time() {
        double best = 0;
        for (int i=0; i<ninstances; i++) {
            if (!i || busy_time[i] < best) {
                best = busy_time[i];
            }
        }
        return best;
    }
};
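
// Worked example (added for illustration; not in the original header),
// for a resource with 2 instances:
//
//     BUSY_TIME_ESTIMATOR bte;
//     bte.init(2);                     // busy_time = {0, 0}
//     bte.update(100, 1);              // instance 0 least busy -> {100, 0}
//     bte.update(50, 1);               // instance 1 least busy -> {100, 50}
//     bte.update(200, 2);              // starts at instance 1, wraps -> {300, 250}
//     double t = bte.get_busy_time();  // 250: busy time of the least busy instance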

// per-resource state
//
struct RSC_WORK_FETCH {
    int rsc_type;
    int ninstances;
    double relative_speed;      // total FLOPS relative to CPU total FLOPS
    bool has_exclusions;

    // the following used/set by rr_simulation():
    //
    double shortfall;
        // seconds of idle instances between now and now+work_buf_total()
    double nidle_now;
    double sim_nused;
    int sim_used_instances;
        // bitmap of instances used in simulation,
        // taking into account GPU exclusions
    int sim_excluded_instances;
        // bitmap of instances not used (i.e. starved because of exclusion)
    double total_fetchable_share;
        // total RS of projects from which we could fetch jobs for this device
    double saturated_time;
        // estimated time until resource is not saturated
        // used to calculate work request
    double deadline_missed_instances;
        // instance count for jobs that miss deadline
    BUSY_TIME_ESTIMATOR busy_time_estimator;
#ifdef SIM
    double estimated_delay;
#endif

    void init(int t, int n, double sp) {
        rsc_type = t;
        ninstances = n;
        relative_speed = sp;
        busy_time_estimator.init(n);
    }

    // the following specify the work request for this resource
    //
    double req_secs;
    double req_instances;

    // REC accounting
    double secs_this_rec_interval;
    inline void reset_rec_accounting() {
        secs_this_rec_interval = 0;
    }

    // temp in choose_project()
    PROJECT* found_project;     // a project able to ask for this work

    void rr_init();
    void update_stats(double sim_now, double dt, double buf_end);
    void update_busy_time(double dur, double nused);
    void supplement(PROJECT*);
    RSC_PROJECT_WORK_FETCH& project_state(PROJECT*);
    void print_state(const char*);
    void clear_request();
    void set_request(PROJECT*);
    void set_request_excluded(PROJECT*);
    bool may_have_work(PROJECT*);
    bool can_fetch(PROJECT*);
    bool backed_off(PROJECT*);
    bool uses_starved_excluded_instances(PROJECT*);

    RSC_WORK_FETCH() {
        rsc_type = 0;
        ninstances = 0;
        relative_speed = 0;
        shortfall = 0;
        nidle_now = 0;
        sim_nused = 0;
        total_fetchable_share = 0;
        saturated_time = 0;
        deadline_missed_instances = 0;
        has_exclusions = false;
    }
};
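
// Illustrative initialization (a sketch under assumptions, not the client's
// actual code; see WORK_FETCH::init() in work_fetch.cpp).  Index 0 is the CPU;
// GPU types use the same indices as the COPROCS array.  ncpus, ngpus,
// gpu_flops and cpu_flops are hypothetical local values here.
//
//     rsc_work_fetch[0].init(RSC_TYPE_CPU, ncpus, 1);
//         // CPU is the reference, so its relative speed is 1
//     rsc_work_fetch[1].init(1, ngpus, gpu_flops/cpu_flops);
//         // a GPU type: total GPU FLOPS relative to total CPU FLOPS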

// per-project state
//
struct PROJECT_WORK_FETCH {
    double rec;
        // recent estimated credit
    double rec_time;
        // when it was last updated
    double rec_temp;
        // temporary copy used during schedule_cpus() and work fetch
    double rec_temp_save;
        // temporary used during RR simulation
    int cant_fetch_work_reason;
    int compute_cant_fetch_work_reason(PROJECT*);
    int n_runnable_jobs;

    PROJECT_WORK_FETCH() {
        memset(this, 0, sizeof(*this));
    }
    void reset(PROJECT*);
};

// global work fetch state
//
struct WORK_FETCH {
    void setup();
    PROJECT* choose_project();
        // Find a project to ask for work.
    PROJECT* non_cpu_intensive_project_needing_work();
    void piggyback_work_request(PROJECT*);
        // we're going to contact this project anyway;
        // piggyback a work request if appropriate.
    void accumulate_inst_sec(ACTIVE_TASK*, double dt);
    void write_request(FILE*, PROJECT*);
    void handle_reply(
        PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*> new_results
    );
    void set_initial_work_request(PROJECT*);
    void set_all_requests(PROJECT*);
    void set_all_requests_hyst(PROJECT*, int rsc_type);
    void print_state();
    void init();
    void compute_cant_fetch_work_reason();
    void rr_init();
    void clear_request();
    void compute_shares();
    void clear_backoffs(APP_VERSION&);
    void request_string(char*);
    bool requested_work();
};
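
// Illustrative sketch (assumption; the authoritative sequence is in
// work_fetch.cpp and cs_scheduler.cpp) of one work-fetch decision:
//
//     work_fetch.rr_init();                       // clear per-pass state
//     // ... round-robin simulation fills shortfall, nidle_now, saturated_time ...
//     work_fetch.compute_shares();                // fetchable_share per (project, resource)
//     PROJECT* p = work_fetch.choose_project();   // project to ask, with req_secs/req_instances set
//     if (p) {
//         // the request is written into the scheduler RPC via write_request()
//     }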

extern RSC_WORK_FETCH rsc_work_fetch[MAX_RSC];
extern WORK_FETCH work_fetch;

extern void set_no_rsc_config();
extern void project_priority_init(bool for_work_fetch);
extern double project_priority(PROJECT*);
extern void adjust_rec_sched(RESULT*);
extern void adjust_rec_work_fetch(RESULT*);
extern double total_peak_flops();

#endif