2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2007-02-21 16:58:29 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2008-08-06 18:36:30 +00:00
|
|
|
// Copyright (C) 2008 University of California
|
2007-02-21 16:58:29 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2007-02-21 16:58:29 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2007-02-21 16:58:29 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2007-02-21 16:58:29 +00:00
|
|
|
|
2010-05-11 19:10:29 +00:00
|
|
|
#include "cpp.h"
|
|
|
|
|
|
|
|
#ifdef _WIN32
|
|
|
|
#include "boinc_win.h"
|
|
|
|
#else
|
|
|
|
#include "config.h"
|
|
|
|
#endif
|
|
|
|
|
2009-03-26 16:56:20 +00:00
|
|
|
#include "util.h"
|
|
|
|
|
2010-01-09 16:41:17 +00:00
|
|
|
#include "client_msgs.h"
|
2012-04-30 21:00:28 +00:00
|
|
|
#include "client_state.h"
|
|
|
|
#include "project.h"
|
|
|
|
#include "result.h"
|
2009-09-18 20:49:54 +00:00
|
|
|
#include "scheduler_op.h"
|
2012-04-30 21:00:28 +00:00
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
#include "work_fetch.h"
|
|
|
|
|
2007-02-21 16:58:29 +00:00
|
|
|
using std::vector;
|
2008-12-31 23:07:59 +00:00
|
|
|
|
2011-02-18 01:06:00 +00:00
|
|
|
bool use_hyst_fetch = true;
|
2010-12-25 19:05:57 +00:00
|
|
|
|
2011-03-25 03:44:09 +00:00
|
|
|
RSC_WORK_FETCH rsc_work_fetch[MAX_RSC];
|
2008-12-31 23:07:59 +00:00
|
|
|
WORK_FETCH work_fetch;
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
// Criteria under which work may be fetched for a resource
// (used by the non-hysteresis fetch policy).
//
#define FETCH_IF_IDLE_INSTANCE          0
    // If resource has an idle instance,
    // get work for it from the project with greatest LTD,
    // even if it's overworked.
#define FETCH_IF_MAJOR_SHORTFALL        1
    // If resource is saturated for less than work_buf_min(),
    // get work for it from the project with greatest LTD,
    // even if it's overworked.
#define FETCH_IF_MINOR_SHORTFALL        2
    // If resource is saturated for less than work_buf_total(),
    // get work for it from the non-overworked project with greatest LTD.
#define FETCH_IF_PROJECT_STARVED        3
    // If any project is not overworked and has too few jobs
    // to use its instance share,
    // get work from the one with greatest LTD.

// Return a short human-readable label for a fetch criterion
// (used in log messages); "unknown" for any other value.
//
static const char* criterion_name(int criterion) {
    if (criterion == FETCH_IF_IDLE_INSTANCE) {
        return "idle instance";
    }
    if (criterion == FETCH_IF_MAJOR_SHORTFALL) {
        return "major shortfall";
    }
    if (criterion == FETCH_IF_MINOR_SHORTFALL) {
        return "minor shortfall";
    }
    if (criterion == FETCH_IF_PROJECT_STARVED) {
        return "starved";
    }
    return "unknown";
}
|
|
|
|
|
2010-11-09 19:04:24 +00:00
|
|
|
// Return true if work of the given resource type must not be fetched
// from the project because one of its per-resource flags is set:
// no_rsc_pref, no_rsc_config, no_rsc_apps or no_rsc_ams.
//
inline bool dont_fetch(PROJECT* p, int rsc_type) {
    return p->no_rsc_pref[rsc_type]
        || p->no_rsc_config[rsc_type]
        || p->no_rsc_apps[rsc_type]
        || p->no_rsc_ams[rsc_type];
}
|
|
|
|
|
2011-07-29 00:07:20 +00:00
|
|
|
// if the configuration file disallows the use of a GPU type
|
|
|
|
// for a project, set a flag to that effect
|
|
|
|
//
|
|
|
|
void set_no_rsc_config() {
|
|
|
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT& p = *gstate.projects[i];
|
|
|
|
for (int j=1; j<coprocs.n_rsc; j++) {
|
|
|
|
bool allowed[MAX_COPROC_INSTANCES];
|
|
|
|
memset(allowed, 0, sizeof(allowed));
|
|
|
|
COPROC& c = coprocs.coprocs[j];
|
|
|
|
for (int k=0; k<c.count; k++) {
|
|
|
|
allowed[c.device_nums[k]] = true;
|
|
|
|
}
|
|
|
|
for (unsigned int k=0; k<config.exclude_gpus.size(); k++) {
|
|
|
|
EXCLUDE_GPU& e = config.exclude_gpus[k];
|
|
|
|
if (strcmp(e.url.c_str(), p.master_url)) continue;
|
2011-07-29 17:54:49 +00:00
|
|
|
if (!e.type.empty() && strcmp(e.type.c_str(), c.type)) continue;
|
2011-07-29 00:07:20 +00:00
|
|
|
if (!e.appname.empty()) continue;
|
2011-07-29 17:54:49 +00:00
|
|
|
if (e.device_num < 0) {
|
2011-07-29 00:07:20 +00:00
|
|
|
memset(allowed, 0, sizeof(allowed));
|
|
|
|
break;
|
|
|
|
}
|
2011-07-29 17:54:49 +00:00
|
|
|
allowed[e.device_num] = false;
|
2011-07-29 00:07:20 +00:00
|
|
|
}
|
|
|
|
p.no_rsc_config[j] = true;
|
|
|
|
for (int k=0; k<c.count; k++) {
|
2012-04-22 06:28:47 +00:00
|
|
|
if (allowed[c.device_nums[k]]) {
|
2011-07-29 00:07:20 +00:00
|
|
|
p.no_rsc_config[j] = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
// does the project have a downloading or runnable job?
|
|
|
|
//
|
|
|
|
static bool has_a_job(PROJECT* p) {
|
|
|
|
for (unsigned int j=0; j<gstate.results.size(); j++) {
|
|
|
|
RESULT* rp = gstate.results[j];
|
|
|
|
if (rp->project != p) continue;
|
|
|
|
if (rp->state() <= RESULT_FILES_DOWNLOADED) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Does the project have an app version
// whose gpu_usage.rsc_type is the given resource type?
//
inline bool has_coproc_app(PROJECT* p, int rsc_type) {
    for (unsigned int idx=0; idx<gstate.app_versions.size(); idx++) {
        APP_VERSION* ver = gstate.app_versions[idx];
        if (ver->project == p && ver->gpu_usage.rsc_type == rsc_type) {
            return true;
        }
    }
    return false;
}
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
/////////////// RSC_PROJECT_WORK_FETCH ///////////////
|
|
|
|
|
2009-09-28 04:24:18 +00:00
|
|
|
// Could we currently ask the project for work of this resource type?
// False if fetching is blocked by one of the dont_fetch() flags,
// if the project's rsc_defer_sched flag is set for the resource,
// or if the backoff time has not yet passed.
//
bool RSC_PROJECT_WORK_FETCH::compute_may_have_work(PROJECT* p, int rsc_type) {
    const bool blocked = dont_fetch(p, rsc_type) || p->rsc_defer_sched[rsc_type];
    if (blocked) {
        return false;
    }
    return backoff_time < gstate.now;
}
|
|
|
|
|
2009-09-28 04:24:18 +00:00
|
|
|
// Reset the per-(project, resource) state:
// recompute may_have_work and zero the counters and flags.
//
void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT* p, int rsc_type) {
    may_have_work = compute_may_have_work(p, rsc_type);

    // flags
    has_runnable_jobs = false;

    // counters and shares
    fetchable_share = 0;
    sim_nused = 0;
    nused_total = 0;
    deadlines_missed = 0;
}
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
// Back off work fetch for this (project, resource) pair.
// The backoff interval starts at WF_MIN_BACKOFF_INTERVAL and doubles on
// each call, capped at WF_MAX_BACKOFF_INTERVAL.
// The backoff time is set to now plus (0.5 + drand()) times the interval.
// "name" is the resource name used in the debug log message.
//
void RSC_PROJECT_WORK_FETCH::backoff(PROJECT* p, const char* name) {
    if (!backoff_interval) {
        // first backoff
        backoff_interval = WF_MIN_BACKOFF_INTERVAL;
    } else {
        backoff_interval *= 2;
        if (backoff_interval > WF_MAX_BACKOFF_INTERVAL) {
            backoff_interval = WF_MAX_BACKOFF_INTERVAL;
        }
    }

    // randomize the delay around the interval
    //
    double delay = (.5 + drand())*backoff_interval;
    backoff_time = gstate.now + delay;

    if (!log_flags.work_fetch_debug) return;
    msg_printf(p, MSG_INFO,
        "[work_fetch] backing off %s %.0f sec", name, delay
    );
}
|
|
|
|
|
|
|
|
/////////////// RSC_WORK_FETCH ///////////////
|
|
|
|
|
|
|
|
// Return the project's work-fetch state for this resource type
//
RSC_PROJECT_WORK_FETCH& RSC_WORK_FETCH::project_state(PROJECT* p) {
    RSC_PROJECT_WORK_FETCH& state = p->rsc_pwf[rsc_type];
    return state;
}
|
|
|
|
|
|
|
|
// Could the project currently be asked for work for this resource?
// False if fetching is blocked by a dont_fetch() flag
// or the project's backoff time for this resource has not passed.
//
bool RSC_WORK_FETCH::may_have_work(PROJECT* p) {
    if (dont_fetch(p, rsc_type)) {
        return false;
    }
    return project_state(p).backoff_time < gstate.now;
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
void RSC_WORK_FETCH::rr_init() {
|
|
|
|
shortfall = 0;
|
|
|
|
nidle_now = 0;
|
2009-02-02 18:47:34 +00:00
|
|
|
sim_nused = 0;
|
2009-01-03 06:01:17 +00:00
|
|
|
total_fetchable_share = 0;
|
2009-03-19 16:55:04 +00:00
|
|
|
deadline_missed_instances = 0;
|
2009-07-17 18:29:10 +00:00
|
|
|
saturated_time = 0;
|
2009-09-03 20:31:04 +00:00
|
|
|
busy_time_estimator.reset();
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
|
|
|
|
2009-02-02 18:45:00 +00:00
|
|
|
void RSC_WORK_FETCH::accumulate_shortfall(double d_time) {
|
|
|
|
double idle = ninstances - sim_nused;
|
2009-06-02 23:26:12 +00:00
|
|
|
if (idle > 1e-6) {
|
2011-01-12 00:47:51 +00:00
|
|
|
//msg_printf(0, MSG_INFO, "adding shortfall %d %f", rsc_type, idle*d_time);
|
2008-12-31 23:07:59 +00:00
|
|
|
shortfall += idle*d_time;
|
|
|
|
}
|
2009-06-02 22:53:57 +00:00
|
|
|
#if 0
|
|
|
|
msg_printf(0, MSG_INFO, "accum shortf (%s): idle %f dt %f sf %f",
|
|
|
|
rsc_name(rsc_type), idle, d_time, shortfall
|
|
|
|
);
|
|
|
|
#endif
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
|
2009-07-17 18:29:10 +00:00
|
|
|
void RSC_WORK_FETCH::update_saturated_time(double dt) {
|
2009-06-02 23:26:12 +00:00
|
|
|
double idle = ninstances - sim_nused;
|
2009-07-17 18:29:10 +00:00
|
|
|
if (idle < 1e-6) {
|
|
|
|
saturated_time = dt;
|
2009-01-30 21:25:24 +00:00
|
|
|
}
|
2009-07-17 18:29:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void RSC_WORK_FETCH::update_busy_time(double dur, double nused) {
|
2009-09-03 20:31:04 +00:00
|
|
|
busy_time_estimator.update(dur, nused);
|
2009-01-30 21:25:24 +00:00
|
|
|
}
|
|
|
|
|
2010-01-15 23:08:55 +00:00
|
|
|
// Is the project's duration correction factor outside [0.02, 80]?
// Always false for projects with dont_use_dcf set.
//
static bool wacky_dcf(PROJECT* p) {
    if (p->dont_use_dcf) {
        return false;
    }
    const double factor = p->duration_correction_factor;
    if (factor < 0.02) return true;
    if (factor > 80.0) return true;
    return false;
}
|
|
|
|
|
2011-11-04 08:15:04 +00:00
|
|
|
// If this resource is below min buffer level,
|
|
|
|
// return the highest-priority project that may have jobs for it.
|
|
|
|
//
|
2012-02-18 23:19:06 +00:00
|
|
|
PROJECT* RSC_WORK_FETCH::choose_project_hyst(bool enforce_hyst) {
|
2010-12-23 23:39:30 +00:00
|
|
|
PROJECT* pbest = NULL;
|
2012-02-18 23:19:06 +00:00
|
|
|
if (enforce_hyst) {
|
|
|
|
if (saturated_time > gstate.work_buf_min()) return NULL;
|
|
|
|
}
|
2012-02-24 16:42:38 +00:00
|
|
|
if (saturated_time > gstate.work_buf_total()) return NULL;
|
|
|
|
|
2010-12-23 23:39:30 +00:00
|
|
|
for (unsigned i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
2012-07-02 03:43:05 +00:00
|
|
|
if (p->pwf.cant_fetch_work_reason) continue;
|
2010-12-23 23:39:30 +00:00
|
|
|
if (!project_state(p).may_have_work) continue;
|
2011-10-01 16:23:28 +00:00
|
|
|
|
2012-02-16 04:54:09 +00:00
|
|
|
// if project has zero resource share,
|
|
|
|
// only fetch work if a device is idle
|
|
|
|
//
|
|
|
|
if (p->resource_share == 0 && nidle_now == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-10-01 16:23:28 +00:00
|
|
|
// if project has excluded GPUs of this type,
|
|
|
|
// and it has runnable jobs for this type,
|
|
|
|
// don't fetch work for it.
|
|
|
|
// TODO: THIS IS CRUDE. Making it smarter would require
|
|
|
|
// computing shortfall etc. on a per-project basis
|
|
|
|
//
|
|
|
|
if (rsc_type) {
|
2012-03-04 04:58:41 +00:00
|
|
|
if (p->ncoprocs_excluded[rsc_type] == ninstances) {
|
|
|
|
continue;
|
|
|
|
}
|
2011-10-01 16:23:28 +00:00
|
|
|
if (p->ncoprocs_excluded[rsc_type]
|
|
|
|
&& p->rsc_pwf[rsc_type].has_runnable_jobs
|
|
|
|
){
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-12-23 23:39:30 +00:00
|
|
|
RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
|
|
|
|
if (rpwf.anon_skip) continue;
|
|
|
|
if (pbest) {
|
2011-10-03 06:18:58 +00:00
|
|
|
if (pbest->sched_priority > p->sched_priority) {
|
2010-12-23 23:39:30 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pbest = p;
|
|
|
|
}
|
|
|
|
if (!pbest) return NULL;
|
|
|
|
work_fetch.clear_request();
|
2010-12-30 22:41:50 +00:00
|
|
|
work_fetch.set_all_requests_hyst(pbest, rsc_type);
|
2010-12-23 23:39:30 +00:00
|
|
|
return pbest;
|
|
|
|
}
|
|
|
|
|
2009-02-20 21:44:39 +00:00
|
|
|
// Choose the best project to ask for work for this resource,
|
2009-02-23 20:35:52 +00:00
|
|
|
// given the specific criterion
|
2009-01-08 00:41:15 +00:00
|
|
|
//
|
2009-02-23 20:35:52 +00:00
|
|
|
PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
|
2008-12-31 23:07:59 +00:00
|
|
|
PROJECT* pbest = NULL;
|
2007-02-21 16:58:29 +00:00
|
|
|
|
2009-02-23 20:35:52 +00:00
|
|
|
switch (criterion) {
|
2009-03-25 16:59:59 +00:00
|
|
|
case FETCH_IF_IDLE_INSTANCE:
|
|
|
|
if (nidle_now == 0) return NULL;
|
|
|
|
break;
|
2009-03-18 15:53:02 +00:00
|
|
|
case FETCH_IF_MAJOR_SHORTFALL:
|
2009-07-17 18:29:10 +00:00
|
|
|
if (saturated_time > gstate.work_buf_min()) return NULL;
|
2009-02-23 20:35:52 +00:00
|
|
|
break;
|
2009-03-18 15:53:02 +00:00
|
|
|
case FETCH_IF_MINOR_SHORTFALL:
|
2009-07-17 18:29:10 +00:00
|
|
|
if (saturated_time > gstate.work_buf_total()) return NULL;
|
2009-02-23 20:35:52 +00:00
|
|
|
break;
|
2009-03-19 16:55:04 +00:00
|
|
|
case FETCH_IF_PROJECT_STARVED:
|
|
|
|
if (deadline_missed_instances >= ninstances) return NULL;
|
|
|
|
break;
|
2009-02-23 20:35:52 +00:00
|
|
|
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
for (unsigned i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
2012-07-02 03:43:05 +00:00
|
|
|
if (p->pwf.cant_fetch_work_reason) continue;
|
2009-01-08 00:41:15 +00:00
|
|
|
if (!project_state(p).may_have_work) continue;
|
2009-02-20 21:44:39 +00:00
|
|
|
RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
|
2010-02-11 22:19:22 +00:00
|
|
|
if (rpwf.anon_skip) continue;
|
2009-02-23 20:35:52 +00:00
|
|
|
switch (criterion) {
|
2009-03-18 15:53:02 +00:00
|
|
|
case FETCH_IF_MINOR_SHORTFALL:
|
2010-01-15 23:08:55 +00:00
|
|
|
if (wacky_dcf(p)) continue;
|
2010-01-28 05:21:14 +00:00
|
|
|
if (!p->resource_share) continue;
|
2010-01-15 23:08:55 +00:00
|
|
|
break;
|
|
|
|
case FETCH_IF_MAJOR_SHORTFALL:
|
|
|
|
if (wacky_dcf(p)) continue;
|
2010-01-28 05:21:14 +00:00
|
|
|
if (!p->resource_share) continue;
|
2009-02-23 20:35:52 +00:00
|
|
|
break;
|
|
|
|
case FETCH_IF_PROJECT_STARVED:
|
2011-10-03 06:18:58 +00:00
|
|
|
if (p->sched_priority < 0) continue;
|
2011-01-12 00:47:51 +00:00
|
|
|
if (rpwf.nused_total >= ninstances) continue;
|
2010-01-28 05:21:14 +00:00
|
|
|
if (!p->resource_share) continue;
|
2009-02-23 20:35:52 +00:00
|
|
|
break;
|
2009-01-28 04:58:01 +00:00
|
|
|
}
|
2010-01-07 21:00:42 +00:00
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
if (pbest) {
|
2010-01-28 05:21:14 +00:00
|
|
|
if (!p->resource_share) {
|
|
|
|
continue;
|
|
|
|
}
|
2011-10-03 06:18:58 +00:00
|
|
|
if (pbest->sched_priority > p->sched_priority) {
|
2010-11-23 19:39:47 +00:00
|
|
|
continue;
|
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
2008-12-31 23:07:59 +00:00
|
|
|
pbest = p;
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
2009-02-23 20:35:52 +00:00
|
|
|
if (!pbest) return NULL;
|
|
|
|
|
|
|
|
// decide how much work to request from each resource
|
|
|
|
//
|
|
|
|
work_fetch.clear_request();
|
|
|
|
switch (criterion) {
|
2009-03-25 16:59:59 +00:00
|
|
|
case FETCH_IF_IDLE_INSTANCE:
|
2009-03-18 15:53:02 +00:00
|
|
|
case FETCH_IF_MAJOR_SHORTFALL:
|
2011-09-13 22:58:39 +00:00
|
|
|
set_request(pbest);
|
2010-01-27 19:14:29 +00:00
|
|
|
break;
|
2009-12-24 20:40:27 +00:00
|
|
|
case FETCH_IF_PROJECT_STARVED:
|
2011-09-13 22:58:39 +00:00
|
|
|
set_request(pbest);
|
2009-02-23 20:35:52 +00:00
|
|
|
break;
|
2009-03-18 15:53:02 +00:00
|
|
|
case FETCH_IF_MINOR_SHORTFALL:
|
2009-12-24 20:40:27 +00:00
|
|
|
// in this case, potentially request work for all resources
|
|
|
|
//
|
2011-10-03 06:18:58 +00:00
|
|
|
if (pbest->sched_priority < 0) {
|
2011-09-13 22:58:39 +00:00
|
|
|
set_request(pbest);
|
2011-01-12 00:47:51 +00:00
|
|
|
} else {
|
|
|
|
work_fetch.set_all_requests(pbest);
|
|
|
|
}
|
2009-02-23 20:35:52 +00:00
|
|
|
break;
|
|
|
|
}
|
2010-01-07 21:00:42 +00:00
|
|
|
// in principle there should be a nonzero request.
|
|
|
|
// check, just in case
|
|
|
|
//
|
|
|
|
if (!req_secs && !req_instances) {
|
2010-01-19 23:01:09 +00:00
|
|
|
if (log_flags.work_fetch_debug) {
|
|
|
|
msg_printf(pbest, MSG_INFO,
|
2010-04-29 20:32:51 +00:00
|
|
|
"[work_fetch] error: project chosen but zero request"
|
2010-01-19 23:01:09 +00:00
|
|
|
);
|
|
|
|
}
|
2010-01-07 21:00:42 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-01-27 19:06:40 +00:00
|
|
|
if (log_flags.work_fetch_debug) {
|
|
|
|
msg_printf(pbest, MSG_INFO,
|
2010-04-29 20:32:51 +00:00
|
|
|
"[work_fetch] chosen: %s %s: %.2f inst, %.2f sec",
|
2010-01-27 19:06:40 +00:00
|
|
|
criterion_name(criterion), rsc_name(rsc_type),
|
|
|
|
req_instances, req_secs
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
return pbest;
|
|
|
|
}
|
|
|
|
|
2010-01-12 21:53:40 +00:00
|
|
|
// request this project's share of shortfall and instances.
|
|
|
|
// don't request anything if project is overworked or backed off.
|
2009-12-24 20:40:27 +00:00
|
|
|
//
|
2011-09-13 22:58:39 +00:00
|
|
|
void RSC_WORK_FETCH::set_request(PROJECT* p) {
|
2011-07-08 19:22:22 +00:00
|
|
|
if (dont_fetch(p, rsc_type)) return;
|
|
|
|
|
2010-08-05 20:42:13 +00:00
|
|
|
// if backup project, fetch 1 job per idle instance
|
|
|
|
//
|
|
|
|
if (p->resource_share == 0) {
|
|
|
|
req_instances = nidle_now;
|
|
|
|
req_secs = 1;
|
|
|
|
return;
|
|
|
|
}
|
2010-06-02 17:50:47 +00:00
|
|
|
if (config.fetch_minimal_work) {
|
|
|
|
req_instances = ninstances;
|
|
|
|
req_secs = 1;
|
|
|
|
return;
|
|
|
|
}
|
2009-02-23 21:34:13 +00:00
|
|
|
RSC_PROJECT_WORK_FETCH& w = project_state(p);
|
|
|
|
if (!w.may_have_work) return;
|
2010-02-11 22:19:22 +00:00
|
|
|
if (w.anon_skip) return;
|
2010-01-12 21:53:40 +00:00
|
|
|
if (shortfall) {
|
2010-01-15 23:08:55 +00:00
|
|
|
if (wacky_dcf(p)) {
|
2010-01-12 21:53:40 +00:00
|
|
|
// if project's DCF is too big or small,
|
|
|
|
// its completion time estimates are useless; just ask for 1 second
|
|
|
|
//
|
|
|
|
req_secs = 1;
|
|
|
|
} else {
|
2011-09-12 17:57:31 +00:00
|
|
|
req_secs = shortfall;
|
2010-01-12 21:53:40 +00:00
|
|
|
}
|
2009-12-24 20:40:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// our share of the idle instances
|
|
|
|
//
|
2011-10-19 06:37:03 +00:00
|
|
|
req_instances = nidle_now * w.fetchable_share;
|
2009-12-24 20:40:27 +00:00
|
|
|
|
2011-10-19 06:37:03 +00:00
|
|
|
if (log_flags.work_fetch_debug) {
|
|
|
|
msg_printf(0, MSG_INFO,
|
2011-12-06 04:21:27 +00:00
|
|
|
"[work_fetch] set_request(): ninst %d nused_total %f nidle_now %f fetch share %f req_inst %f",
|
2011-10-19 06:37:03 +00:00
|
|
|
ninstances, w.nused_total, nidle_now, w.fetchable_share, req_instances
|
|
|
|
);
|
|
|
|
}
|
2010-01-07 21:00:42 +00:00
|
|
|
if (req_instances && !req_secs) {
|
|
|
|
req_secs = 1;
|
|
|
|
}
|
2009-12-24 20:40:27 +00:00
|
|
|
}
|
|
|
|
|
2009-04-03 21:55:26 +00:00
|
|
|
void RSC_WORK_FETCH::print_state(const char* name) {
|
2008-12-31 23:07:59 +00:00
|
|
|
msg_printf(0, MSG_INFO,
|
2011-09-12 17:57:31 +00:00
|
|
|
"[work_fetch] %s: shortfall %.2f nidle %.2f saturated %.2f busy %.2f",
|
2008-12-31 23:07:59 +00:00
|
|
|
name,
|
2011-09-12 17:57:31 +00:00
|
|
|
shortfall, nidle_now, saturated_time,
|
|
|
|
busy_time_estimator.get_busy_time()
|
2008-12-31 23:07:59 +00:00
|
|
|
);
|
|
|
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
2009-01-21 20:51:33 +00:00
|
|
|
if (p->non_cpu_intensive) continue;
|
2008-12-31 23:07:59 +00:00
|
|
|
RSC_PROJECT_WORK_FETCH& pwf = project_state(p);
|
2011-07-08 19:22:22 +00:00
|
|
|
bool no_rsc_pref = p->no_rsc_pref[rsc_type];
|
2011-07-29 00:07:20 +00:00
|
|
|
bool no_rsc_config = p->no_rsc_config[rsc_type];
|
2011-07-08 19:22:22 +00:00
|
|
|
bool no_rsc_apps = p->no_rsc_apps[rsc_type];
|
|
|
|
bool no_rsc_ams = p->no_rsc_ams[rsc_type];
|
2010-11-23 19:39:47 +00:00
|
|
|
double bt = pwf.backoff_time>gstate.now?pwf.backoff_time-gstate.now:0;
|
|
|
|
msg_printf(p, MSG_INFO,
|
2012-02-20 19:59:36 +00:00
|
|
|
"[work_fetch] %s: fetch share %.3f rsc backoff (dt %.2f, inc %.2f)%s%s%s%s",
|
2010-11-23 19:39:47 +00:00
|
|
|
name,
|
2012-02-15 06:59:35 +00:00
|
|
|
pwf.fetchable_share, bt, pwf.backoff_interval,
|
2011-07-08 19:22:22 +00:00
|
|
|
no_rsc_pref?" (blocked by prefs)":"",
|
|
|
|
no_rsc_apps?" (no apps)":"",
|
2011-07-29 00:07:20 +00:00
|
|
|
no_rsc_ams?" (blocked by account manager)":"",
|
|
|
|
no_rsc_config?" (blocked by configuration file)":""
|
2010-11-23 19:39:47 +00:00
|
|
|
);
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
// Zero this resource's request fields
//
void RSC_WORK_FETCH::clear_request() {
    req_instances = 0;
    req_secs = 0;
}
|
|
|
|
|
|
|
|
/////////////// PROJECT_WORK_FETCH ///////////////
|
|
|
|
|
2012-07-02 03:43:05 +00:00
|
|
|
// Determine whether work fetch from the project is currently ruled out.
// Returns a CANT_FETCH_WORK_* code for the first blocking condition found,
// or false (zero) if none applies.
// Callers treat a nonzero result as "skip this project",
// so the no-reason case must return zero.
//
// NOTE(review): the return type here is bool,
// so distinct reason codes collapse to "true";
// this assumes all CANT_FETCH_WORK_* codes are nonzero - confirm.
// Returning int would preserve the code
// but requires changing the declaration as well.
//
bool PROJECT_WORK_FETCH::compute_cant_fetch_work_reason(PROJECT* p) {
    if (p->non_cpu_intensive) return CANT_FETCH_WORK_NON_CPU_INTENSIVE;
    if (p->suspended_via_gui) return CANT_FETCH_WORK_SUSPENDED_VIA_GUI;
    if (p->master_url_fetch_pending) return CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING;
    if (p->min_rpc_time > gstate.now) return CANT_FETCH_WORK_MIN_RPC_TIME;
    if (p->dont_request_more_work) return CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK;
    if (p->some_download_stalled()) return CANT_FETCH_WORK_DOWNLOAD_STALLED;
    if (p->some_result_suspended()) return CANT_FETCH_WORK_RESULT_SUSPENDED;
    if (p->too_many_uploading_results) return CANT_FETCH_WORK_TOO_MANY_UPLOADS;

    // nothing prevents fetching work
    //
    return false;
}
|
|
|
|
|
|
|
|
// Reset the project's per-resource work-fetch state for every resource
//
void PROJECT_WORK_FETCH::reset(PROJECT* p) {
    const int n = coprocs.n_rsc;
    for (int rsc=0; rsc<n; rsc++) {
        RSC_PROJECT_WORK_FETCH& state = p->rsc_pwf[rsc];
        state.reset();
    }
}
|
|
|
|
|
|
|
|
/////////////// WORK_FETCH ///////////////
|
|
|
|
|
|
|
|
void WORK_FETCH::rr_init() {
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
rsc_work_fetch[i].rr_init();
|
|
|
|
}
|
2008-12-31 23:07:59 +00:00
|
|
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
2012-07-02 03:43:05 +00:00
|
|
|
p->pwf.cant_fetch_work_reason = p->pwf.compute_cant_fetch_work_reason(p);
|
2010-01-05 16:36:42 +00:00
|
|
|
p->pwf.has_runnable_jobs = false;
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int j=0; j<coprocs.n_rsc; j++) {
|
|
|
|
p->rsc_pwf[j].rr_init(p, j);
|
2010-01-05 16:36:42 +00:00
|
|
|
}
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
|
2010-12-23 23:39:30 +00:00
|
|
|
// if the given project is highest-priority among the projects
|
|
|
|
// eligible for the resource, set request fields
|
|
|
|
//
|
|
|
|
void RSC_WORK_FETCH::supplement(PROJECT* pp) {
|
2011-10-03 06:18:58 +00:00
|
|
|
double x = pp->sched_priority;
|
2010-12-23 23:39:30 +00:00
|
|
|
for (unsigned i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
|
|
|
if (p == pp) continue;
|
2012-07-02 03:43:05 +00:00
|
|
|
if (p->pwf.cant_fetch_work_reason) continue;
|
2010-12-23 23:39:30 +00:00
|
|
|
if (!project_state(p).may_have_work) continue;
|
|
|
|
RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
|
|
|
|
if (rpwf.anon_skip) continue;
|
2011-10-03 06:18:58 +00:00
|
|
|
if (p->sched_priority > x) {
|
2010-12-23 23:39:30 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// didn't find a better project; ask for work
|
2011-11-04 08:15:04 +00:00
|
|
|
//
|
2011-09-13 22:58:39 +00:00
|
|
|
set_request(pp);
|
2010-12-23 23:39:30 +00:00
|
|
|
}
|
|
|
|
|
2011-11-04 08:15:04 +00:00
|
|
|
// we're going to ask the given project for work of the given type.
|
|
|
|
// (or -1 if none)
|
|
|
|
// Set requests for this type and perhaps other types
|
|
|
|
//
|
2010-12-30 22:41:50 +00:00
|
|
|
void WORK_FETCH::set_all_requests_hyst(PROJECT* p, int rsc_type) {
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
if (i == rsc_type) {
|
2011-09-13 22:58:39 +00:00
|
|
|
rsc_work_fetch[i].set_request(p);
|
2011-03-25 03:44:09 +00:00
|
|
|
} else {
|
2012-04-20 23:51:20 +00:00
|
|
|
// don't fetch work for a resource if the buffer is above max
|
|
|
|
//
|
|
|
|
if (rsc_work_fetch[i].saturated_time > gstate.work_buf_total()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i>0 && !gpus_usable) {
|
|
|
|
continue;
|
2011-03-25 03:44:09 +00:00
|
|
|
}
|
2012-04-20 23:51:20 +00:00
|
|
|
rsc_work_fetch[i].supplement(p);
|
2010-12-23 23:39:30 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
// Set request fields for the project on every resource;
// resources other than index 0 are included only if gpus_usable is set.
//
void WORK_FETCH::set_all_requests(PROJECT* p) {
    for (int rsc=0; rsc<coprocs.n_rsc; rsc++) {
        if (rsc != 0 && !gpus_usable) continue;
        rsc_work_fetch[rsc].set_request(p);
    }
}
|
|
|
|
|
2010-01-05 16:36:42 +00:00
|
|
|
void WORK_FETCH::print_state() {
|
2010-04-29 20:32:51 +00:00
|
|
|
msg_printf(0, MSG_INFO, "[work_fetch] ------- start work fetch state -------");
|
|
|
|
msg_printf(0, MSG_INFO, "[work_fetch] target work buffer: %.2f + %.2f sec",
|
2010-01-05 16:36:42 +00:00
|
|
|
gstate.work_buf_min(), gstate.work_buf_additional()
|
|
|
|
);
|
|
|
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
2012-02-15 06:59:35 +00:00
|
|
|
char buf[256];
|
2010-01-05 16:36:42 +00:00
|
|
|
PROJECT* p = gstate.projects[i];
|
|
|
|
if (p->non_cpu_intensive) continue;
|
2012-02-15 06:59:35 +00:00
|
|
|
if (p->min_rpc_time > gstate.now) {
|
|
|
|
sprintf(buf, " (project backoff %.2f)", p->min_rpc_time - gstate.now);
|
|
|
|
} else {
|
|
|
|
strcpy(buf, "");
|
|
|
|
}
|
2012-02-20 19:59:36 +00:00
|
|
|
msg_printf(p, MSG_INFO, "[work_fetch] REC %.3f priority %.6f%s%s%s%s%s",
|
2012-02-15 06:59:35 +00:00
|
|
|
p->pwf.rec,
|
|
|
|
p->sched_priority,
|
|
|
|
buf,
|
|
|
|
p->suspended_via_gui?" (susp via GUI)":"",
|
|
|
|
p->master_url_fetch_pending?" (master fetch pending)":"",
|
|
|
|
p->dont_request_more_work?" (no new tasks)":"",
|
|
|
|
p->too_many_uploading_results?" (too many uploads)":""
|
|
|
|
);
|
|
|
|
}
|
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
rsc_work_fetch[i].print_state(rsc_name(i));
|
2009-01-27 00:16:31 +00:00
|
|
|
}
|
2010-04-29 20:32:51 +00:00
|
|
|
msg_printf(0, MSG_INFO, "[work_fetch] ------- end work fetch state -------");
|
2010-01-05 16:36:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void WORK_FETCH::clear_request() {
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
rsc_work_fetch[i].clear_request();
|
|
|
|
}
|
2009-01-27 00:16:31 +00:00
|
|
|
}
|
|
|
|
|
2009-09-08 18:42:24 +00:00
|
|
|
// we're going to contact this project for reasons other than work fetch;
|
2009-02-20 21:44:39 +00:00
|
|
|
// decide if we should piggy-back a work fetch request.
|
2009-01-03 06:01:17 +00:00
|
|
|
//
|
|
|
|
void WORK_FETCH::compute_work_request(PROJECT* p) {
|
2009-02-18 16:21:30 +00:00
|
|
|
clear_request();
|
2010-06-02 17:50:47 +00:00
|
|
|
if (config.fetch_minimal_work && gstate.had_or_requested_work) return;
|
2009-02-18 16:21:30 +00:00
|
|
|
if (p->dont_request_more_work) return;
|
2009-01-27 00:16:31 +00:00
|
|
|
if (p->non_cpu_intensive) {
|
2009-02-18 16:21:30 +00:00
|
|
|
if (!has_a_job(p)) {
|
2011-03-25 03:44:09 +00:00
|
|
|
rsc_work_fetch[0].req_secs = 1;
|
2009-01-27 00:16:31 +00:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-02-18 23:19:06 +00:00
|
|
|
PROJECT* bestp = choose_project(false);
|
2011-12-06 04:21:27 +00:00
|
|
|
if (p != bestp) {
|
|
|
|
clear_request();
|
|
|
|
}
|
2009-01-03 06:01:17 +00:00
|
|
|
}
|
|
|
|
|
2009-01-21 20:28:00 +00:00
|
|
|
// see if there's a fetchable non-CPU-intensive project without work
|
|
|
|
//
|
|
|
|
PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() {
|
2009-01-21 20:51:33 +00:00
|
|
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
|
|
|
if (!p->non_cpu_intensive) continue;
|
|
|
|
if (!p->can_request_work()) continue;
|
2011-03-25 03:44:09 +00:00
|
|
|
if (p->rsc_pwf[0].backoff_time > gstate.now) continue;
|
2009-01-27 00:16:31 +00:00
|
|
|
if (has_a_job(p)) continue;
|
|
|
|
clear_request();
|
2011-03-25 03:44:09 +00:00
|
|
|
rsc_work_fetch[0].req_secs = 1;
|
2009-01-27 00:16:31 +00:00
|
|
|
return p;
|
2009-01-21 20:51:33 +00:00
|
|
|
}
|
|
|
|
return 0;
|
2009-01-21 20:28:00 +00:00
|
|
|
}
|
|
|
|
|
2009-01-03 06:01:17 +00:00
|
|
|
// choose a project to fetch work from,
|
|
|
|
// and set the request fields of resource objects
|
2007-02-21 16:58:29 +00:00
|
|
|
//
|
2012-02-18 23:19:06 +00:00
|
|
|
PROJECT* WORK_FETCH::choose_project(bool enforce_hyst) {
|
2011-10-16 01:29:57 +00:00
|
|
|
PROJECT* p;
|
2009-01-08 00:41:15 +00:00
|
|
|
|
2010-01-21 00:14:56 +00:00
|
|
|
if (log_flags.work_fetch_debug) {
|
2011-04-21 17:04:42 +00:00
|
|
|
msg_printf(0, MSG_INFO, "[work_fetch] work fetch start");
|
2010-01-21 00:14:56 +00:00
|
|
|
}
|
|
|
|
|
2009-01-21 20:51:33 +00:00
|
|
|
p = non_cpu_intensive_project_needing_work();
|
|
|
|
if (p) return p;
|
2009-01-21 20:28:00 +00:00
|
|
|
|
2009-01-08 00:41:15 +00:00
|
|
|
gstate.compute_nuploading_results();
|
|
|
|
|
2011-09-12 17:01:54 +00:00
|
|
|
rr_simulation();
|
2011-03-08 02:28:46 +00:00
|
|
|
compute_shares();
|
2011-10-19 06:37:03 +00:00
|
|
|
project_priority_init(true);
|
2011-10-15 20:28:26 +00:00
|
|
|
|
2011-12-06 04:21:27 +00:00
|
|
|
// Decrement the priority of projects that have a lot of work queued.
|
|
|
|
// Specifically, subtract
|
|
|
|
// (FLOPs queued for P)/(FLOPs of max queue)
|
|
|
|
// which will generally be between 0 and 1.
|
|
|
|
// This is a little arbitrary but I can't think of anything better.
|
2011-10-15 20:28:26 +00:00
|
|
|
//
|
2011-12-06 04:21:27 +00:00
|
|
|
double max_queued_flops = gstate.work_buf_total()*total_peak_flops();
|
2011-07-13 19:46:03 +00:00
|
|
|
for (unsigned int i=0; i<gstate.results.size(); i++) {
|
|
|
|
RESULT* rp = gstate.results[i];
|
2011-12-06 04:21:27 +00:00
|
|
|
p = rp->project;
|
|
|
|
p->sched_priority -= rp->estimated_flops_remaining()/max_queued_flops;
|
2011-07-13 19:46:03 +00:00
|
|
|
}
|
2009-01-03 06:01:17 +00:00
|
|
|
|
2011-10-16 01:29:57 +00:00
|
|
|
p = 0;
|
2011-01-07 20:23:22 +00:00
|
|
|
if (use_hyst_fetch) {
|
2011-03-25 03:44:09 +00:00
|
|
|
if (gpus_usable) {
|
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
2012-02-18 23:19:06 +00:00
|
|
|
p = rsc_work_fetch[i].choose_project_hyst(enforce_hyst);
|
2011-03-25 03:44:09 +00:00
|
|
|
if (p) break;
|
|
|
|
}
|
2010-12-23 23:39:30 +00:00
|
|
|
}
|
|
|
|
if (!p) {
|
2012-02-18 23:19:06 +00:00
|
|
|
p = rsc_work_fetch[0].choose_project_hyst(enforce_hyst);
|
2010-12-23 23:39:30 +00:00
|
|
|
}
|
2010-12-30 22:41:50 +00:00
|
|
|
} else {
|
2011-03-25 03:44:09 +00:00
|
|
|
if (gpus_usable) {
|
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
p = rsc_work_fetch[i].choose_project(FETCH_IF_IDLE_INSTANCE);
|
|
|
|
if (p) break;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-03-25 16:59:59 +00:00
|
|
|
if (!p) {
|
2011-03-25 03:44:09 +00:00
|
|
|
p = rsc_work_fetch[0].choose_project(FETCH_IF_IDLE_INSTANCE);
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
2011-03-25 03:44:09 +00:00
|
|
|
|
|
|
|
if (!p && gpus_usable) {
|
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
p = rsc_work_fetch[i].choose_project(FETCH_IF_MAJOR_SHORTFALL);
|
|
|
|
if (p) break;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-02-23 20:35:52 +00:00
|
|
|
if (!p) {
|
2011-03-25 03:44:09 +00:00
|
|
|
p = rsc_work_fetch[0].choose_project(FETCH_IF_MAJOR_SHORTFALL);
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
2011-03-25 03:44:09 +00:00
|
|
|
|
|
|
|
if (!p && gpus_usable) {
|
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
p = rsc_work_fetch[i].choose_project(FETCH_IF_MINOR_SHORTFALL);
|
|
|
|
if (p) break;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-02-23 20:35:52 +00:00
|
|
|
if (!p) {
|
2011-03-25 03:44:09 +00:00
|
|
|
p = rsc_work_fetch[0].choose_project(FETCH_IF_MINOR_SHORTFALL);
|
2009-02-20 21:44:39 +00:00
|
|
|
}
|
2011-03-25 03:44:09 +00:00
|
|
|
|
|
|
|
if (!p && gpus_usable) {
|
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
p = rsc_work_fetch[i].choose_project(FETCH_IF_PROJECT_STARVED);
|
|
|
|
if (p) break;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-02-20 21:44:39 +00:00
|
|
|
if (!p) {
|
2011-03-25 03:44:09 +00:00
|
|
|
p = rsc_work_fetch[0].choose_project(FETCH_IF_PROJECT_STARVED);
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
2010-12-30 22:41:50 +00:00
|
|
|
}
|
2009-01-03 06:01:17 +00:00
|
|
|
|
2007-02-21 16:58:29 +00:00
|
|
|
if (log_flags.work_fetch_debug) {
|
2008-12-31 23:07:59 +00:00
|
|
|
print_state();
|
2009-09-02 21:36:25 +00:00
|
|
|
if (!p) {
|
2010-04-29 20:32:51 +00:00
|
|
|
msg_printf(0, MSG_INFO, "[work_fetch] No project chosen for work fetch");
|
2007-07-11 20:13:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
return p;
|
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
// Charge dt seconds of running time of the given task to the
// per-project and global "secs_this_debt_interval" accumulators.
// Resource 0 is always charged dt*avg_ncpus.
// If the app version's gpu_usage.rsc_type is nonzero,
// that resource is also charged dt*gpu_usage.usage.
//
void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) {
    APP_VERSION* app_ver = atp->result->avp;
    PROJECT* proj = atp->result->project;

    double inst_secs = dt*app_ver->avg_ncpus;
    proj->rsc_pwf[0].secs_this_debt_interval += inst_secs;
    rsc_work_fetch[0].secs_this_debt_interval += inst_secs;

    int rsc_type = app_ver->gpu_usage.rsc_type;
    if (rsc_type == 0) return;

    inst_secs = dt*app_ver->gpu_usage.usage;
    proj->rsc_pwf[rsc_type].secs_this_debt_interval += inst_secs;
    rsc_work_fetch[rsc_type].secs_this_debt_interval += inst_secs;
}
|
2007-02-21 16:58:29 +00:00
|
|
|
|
2009-01-03 06:01:17 +00:00
|
|
|
// find total and per-project resource shares for each resource
|
|
|
|
//
|
|
|
|
void WORK_FETCH::compute_shares() {
|
|
|
|
unsigned int i;
|
|
|
|
PROJECT* p;
|
|
|
|
for (i=0; i<gstate.projects.size(); i++) {
|
|
|
|
p = gstate.projects[i];
|
|
|
|
if (p->non_cpu_intensive) continue;
|
2012-07-02 03:43:05 +00:00
|
|
|
if (p->pwf.cant_fetch_work_reason) continue;
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int j=0; j<coprocs.n_rsc; j++) {
|
|
|
|
if (p->rsc_pwf[j].may_have_work) {
|
|
|
|
rsc_work_fetch[j].total_fetchable_share += p->resource_share;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-01-03 06:01:17 +00:00
|
|
|
}
|
|
|
|
for (i=0; i<gstate.projects.size(); i++) {
|
|
|
|
p = gstate.projects[i];
|
|
|
|
if (p->non_cpu_intensive) continue;
|
2012-07-02 03:43:05 +00:00
|
|
|
if (p->pwf.cant_fetch_work_reason) continue;
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int j=0; j<coprocs.n_rsc; j++) {
|
|
|
|
if (p->rsc_pwf[j].may_have_work) {
|
|
|
|
p->rsc_pwf[j].fetchable_share = rsc_work_fetch[j].total_fetchable_share?p->resource_share/rsc_work_fetch[j].total_fetchable_share:1;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-01-03 06:01:17 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-09-27 20:34:47 +00:00
|
|
|
void WORK_FETCH::request_string(char* buf) {
|
|
|
|
char buf2[256];
|
|
|
|
sprintf(buf,
|
2011-02-23 23:11:59 +00:00
|
|
|
"[work_fetch] request: CPU (%.2f sec, %.2f inst)",
|
2011-03-25 03:44:09 +00:00
|
|
|
rsc_work_fetch[0].req_secs, rsc_work_fetch[0].req_instances
|
2010-09-27 20:34:47 +00:00
|
|
|
);
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
sprintf(buf2, " %s (%.2f sec, %.2f inst)",
|
|
|
|
rsc_name(i), rsc_work_fetch[i].req_secs, rsc_work_fetch[i].req_instances
|
2010-09-27 20:34:47 +00:00
|
|
|
);
|
|
|
|
strcat(buf, buf2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-09-02 21:36:25 +00:00
|
|
|
void WORK_FETCH::write_request(FILE* f, PROJECT* p) {
|
2011-03-25 03:44:09 +00:00
|
|
|
double work_req = rsc_work_fetch[0].req_secs;
|
2009-08-31 03:42:01 +00:00
|
|
|
|
|
|
|
// if project is anonymous platform, set the overall work req
|
2009-09-02 21:36:25 +00:00
|
|
|
// to the max of the requests of resource types for which we have versions.
|
2009-08-31 03:42:01 +00:00
|
|
|
// Otherwise projects with old schedulers won't send us work.
|
2009-09-02 21:36:25 +00:00
|
|
|
// THIS CAN BE REMOVED AT SOME POINT
|
2009-08-31 03:42:01 +00:00
|
|
|
//
|
2009-09-02 21:36:25 +00:00
|
|
|
if (p->anonymous_platform) {
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
if (has_coproc_app(p, i)) {
|
|
|
|
if (rsc_work_fetch[i].req_secs > work_req) {
|
|
|
|
work_req = rsc_work_fetch[i].req_secs;
|
|
|
|
}
|
2009-09-02 21:36:25 +00:00
|
|
|
}
|
2009-08-31 03:42:01 +00:00
|
|
|
}
|
|
|
|
}
|
2008-12-31 23:07:59 +00:00
|
|
|
fprintf(f,
|
2009-01-11 05:14:47 +00:00
|
|
|
" <work_req_seconds>%f</work_req_seconds>\n"
|
2009-01-03 06:01:17 +00:00
|
|
|
" <cpu_req_secs>%f</cpu_req_secs>\n"
|
2010-01-05 16:36:42 +00:00
|
|
|
" <cpu_req_instances>%f</cpu_req_instances>\n"
|
2009-02-27 18:46:00 +00:00
|
|
|
" <estimated_delay>%f</estimated_delay>\n",
|
2009-08-31 03:42:01 +00:00
|
|
|
work_req,
|
2011-03-25 03:44:09 +00:00
|
|
|
rsc_work_fetch[0].req_secs,
|
|
|
|
rsc_work_fetch[0].req_instances,
|
|
|
|
rsc_work_fetch[0].req_secs?rsc_work_fetch[0].busy_time_estimator.get_busy_time():0
|
2008-12-31 23:07:59 +00:00
|
|
|
);
|
2009-09-02 21:36:25 +00:00
|
|
|
if (log_flags.work_fetch_debug) {
|
2010-09-27 20:34:47 +00:00
|
|
|
char buf[256];
|
|
|
|
request_string(buf);
|
2009-09-02 21:36:25 +00:00
|
|
|
msg_printf(p, MSG_INFO, buf);
|
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
|
2009-01-03 06:01:17 +00:00
|
|
|
// we just got a scheduler reply with the given jobs; update backoffs
|
2007-02-21 16:58:29 +00:00
|
|
|
//
|
2009-09-16 17:34:19 +00:00
|
|
|
void WORK_FETCH::handle_reply(
|
2011-03-25 03:44:09 +00:00
|
|
|
PROJECT* p, SCHEDULER_REPLY*, vector<RESULT*> new_results
|
2009-09-16 17:34:19 +00:00
|
|
|
) {
|
2011-03-25 03:44:09 +00:00
|
|
|
bool got_rsc[MAX_RSC];
|
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
got_rsc[i] = false;
|
2009-09-16 17:34:19 +00:00
|
|
|
}
|
|
|
|
|
2009-01-03 06:01:17 +00:00
|
|
|
// if didn't get any jobs, back off on requested resource types
|
|
|
|
//
|
|
|
|
if (!new_results.size()) {
|
2009-02-09 22:00:31 +00:00
|
|
|
// but not if RPC was requested by project
|
|
|
|
//
|
|
|
|
if (p->sched_rpc_pending != RPC_REASON_PROJECT_REQ) {
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
if (rsc_work_fetch[i].req_secs) {
|
|
|
|
p->rsc_pwf[i].backoff(p, rsc_name(i));
|
|
|
|
}
|
2009-02-09 22:00:31 +00:00
|
|
|
}
|
2009-01-03 06:01:17 +00:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2008-12-31 23:30:38 +00:00
|
|
|
|
2009-01-03 06:01:17 +00:00
|
|
|
// if we did get jobs, clear backoff on resource types
|
|
|
|
//
|
2011-06-12 20:58:43 +00:00
|
|
|
for (unsigned int i=0; i<new_results.size(); i++) {
|
2008-12-31 23:07:59 +00:00
|
|
|
RESULT* rp = new_results[i];
|
2011-03-25 03:44:09 +00:00
|
|
|
got_rsc[rp->avp->gpu_usage.rsc_type] = true;
|
|
|
|
}
|
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
if (got_rsc[i]) p->rsc_pwf[i].clear_backoff();
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-02-27 18:46:00 +00:00
|
|
|
// set up for initial RPC.
|
|
|
|
// arrange to always get one job, even if we don't need it or can't handle it.
|
|
|
|
// (this is probably what user wants)
|
|
|
|
//
|
2009-01-10 00:48:22 +00:00
|
|
|
void WORK_FETCH::set_initial_work_request() {
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
|
|
|
rsc_work_fetch[i].req_secs = 1;
|
|
|
|
rsc_work_fetch[i].req_instances = 0;
|
|
|
|
rsc_work_fetch[i].busy_time_estimator.reset();
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
|
|
|
|
2009-01-08 00:41:15 +00:00
|
|
|
// called once, at client startup
|
|
|
|
//
|
2008-12-31 23:07:59 +00:00
|
|
|
void WORK_FETCH::init() {
|
2011-03-25 03:44:09 +00:00
|
|
|
rsc_work_fetch[0].init(0, gstate.ncpus, 1);
|
2011-01-12 00:47:51 +00:00
|
|
|
double cpu_flops = gstate.host_info.p_fpops;
|
2008-12-31 23:07:59 +00:00
|
|
|
|
2009-10-16 00:13:01 +00:00
|
|
|
// use 20% as a rough estimate of GPU efficiency
|
|
|
|
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int i=1; i<coprocs.n_rsc; i++) {
|
|
|
|
rsc_work_fetch[i].init(
|
|
|
|
i, coprocs.coprocs[i].count,
|
|
|
|
coprocs.coprocs[i].count*0.2*coprocs.coprocs[i].peak_flops/cpu_flops
|
2009-09-03 20:31:04 +00:00
|
|
|
);
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-02-20 22:16:03 +00:00
|
|
|
|
2010-02-11 22:19:22 +00:00
|
|
|
// see what resources anon platform projects can use
|
|
|
|
//
|
|
|
|
unsigned int i, j;
|
|
|
|
for (i=0; i<gstate.projects.size(); i++) {
|
|
|
|
PROJECT* p = gstate.projects[i];
|
|
|
|
if (!p->anonymous_platform) continue;
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int k=0; k<coprocs.n_rsc; k++) {
|
|
|
|
p->rsc_pwf[k].anon_skip = true;
|
|
|
|
}
|
2010-02-11 22:19:22 +00:00
|
|
|
for (j=0; j<gstate.app_versions.size(); j++) {
|
|
|
|
APP_VERSION* avp = gstate.app_versions[j];
|
|
|
|
if (avp->project != p) continue;
|
2011-03-25 03:44:09 +00:00
|
|
|
p->rsc_pwf[avp->gpu_usage.rsc_type].anon_skip = false;
|
2010-02-11 22:19:22 +00:00
|
|
|
}
|
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
|
2010-09-24 21:24:02 +00:00
|
|
|
// clear backoff for app's resource
|
|
|
|
//
|
|
|
|
void WORK_FETCH::clear_backoffs(APP_VERSION& av) {
|
2011-03-25 03:44:09 +00:00
|
|
|
av.project->rsc_pwf[av.gpu_usage.rsc_type].clear_backoff();
|
2010-09-24 21:24:02 +00:00
|
|
|
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
////////////////////////
|
|
|
|
|
2007-02-21 16:58:29 +00:00
|
|
|
void CLIENT_STATE::compute_nuploading_results() {
|
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
for (i=0; i<projects.size(); i++) {
|
|
|
|
projects[i]->nuploading_results = 0;
|
2009-07-22 22:00:51 +00:00
|
|
|
projects[i]->too_many_uploading_results = false;
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
for (i=0; i<results.size(); i++) {
|
|
|
|
RESULT* rp = results[i];
|
|
|
|
if (rp->state() == RESULT_FILES_UPLOADING) {
|
|
|
|
rp->project->nuploading_results++;
|
|
|
|
}
|
|
|
|
}
|
2009-07-22 22:00:51 +00:00
|
|
|
int n = gstate.ncpus;
|
2011-03-25 03:44:09 +00:00
|
|
|
for (int j=1; j<coprocs.n_rsc; j++) {
|
|
|
|
if (coprocs.coprocs[j].count > n) {
|
|
|
|
n = coprocs.coprocs[j].count;
|
|
|
|
}
|
2009-08-17 16:50:40 +00:00
|
|
|
}
|
2009-07-22 22:00:51 +00:00
|
|
|
n *= 2;
|
|
|
|
for (i=0; i<projects.size(); i++) {
|
|
|
|
if (projects[i]->nuploading_results > n) {
|
|
|
|
projects[i]->too_many_uploading_results = true;
|
|
|
|
}
|
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
}
|
|
|
|
|
2010-01-06 06:01:23 +00:00
|
|
|
// Returns the estimated total elapsed time of this task.
|
2007-02-21 22:27:35 +00:00
|
|
|
// Compute this as a weighted average of estimates based on
|
2009-07-07 22:58:58 +00:00
|
|
|
// 1) the workunit's flops count (static estimate)
|
|
|
|
// 2) the current elapsed time and fraction done (dynamic estimate)
|
2007-02-21 22:27:35 +00:00
|
|
|
//
|
2010-11-17 20:04:58 +00:00
|
|
|
double ACTIVE_TASK::est_dur() {
|
2010-12-08 16:59:42 +00:00
|
|
|
if (fraction_done >= 1) return elapsed_time;
|
2012-04-25 04:10:29 +00:00
|
|
|
double wu_est = result->estimated_runtime();
|
2007-02-21 22:27:35 +00:00
|
|
|
if (fraction_done <= 0) return wu_est;
|
2010-01-06 06:01:23 +00:00
|
|
|
if (wu_est < elapsed_time) wu_est = elapsed_time;
|
2011-04-18 16:32:57 +00:00
|
|
|
double frac_est = fraction_done_elapsed_time / fraction_done;
|
2007-02-21 22:27:35 +00:00
|
|
|
double fraction_left = 1-fraction_done;
|
2010-04-01 03:32:14 +00:00
|
|
|
double wu_weight = fraction_left * fraction_left * fraction_left;
|
2009-01-03 06:01:17 +00:00
|
|
|
double fd_weight = 1 - wu_weight;
|
2009-12-08 20:39:46 +00:00
|
|
|
double x = fd_weight*frac_est + wu_weight*wu_est;
|
2010-11-05 19:17:07 +00:00
|
|
|
#if 0
|
2010-11-17 20:04:58 +00:00
|
|
|
if (log_flags.rr_simulation) {
|
2011-04-28 21:54:42 +00:00
|
|
|
msg_printf(result->project, MSG_INFO,
|
|
|
|
"[rr_sim] %s frac_est %f = %f/%f",
|
|
|
|
result->name, frac_est, fraction_done_elapsed_time, fraction_done
|
|
|
|
);
|
2009-12-08 20:39:46 +00:00
|
|
|
msg_printf(result->project, MSG_INFO,
|
2010-01-07 21:00:42 +00:00
|
|
|
"[rr_sim] %s dur: %.2f = %.3f*%.2f + %.3f*%.2f",
|
2009-12-08 20:39:46 +00:00
|
|
|
result->name, x, fd_weight, frac_est, wu_weight, wu_est
|
|
|
|
);
|
|
|
|
}
|
|
|
|
#endif
|
2007-05-08 01:55:28 +00:00
|
|
|
return x;
|
2007-02-21 22:27:35 +00:00
|
|
|
}
|
|
|
|
|
2009-02-27 21:44:39 +00:00
|
|
|
// the fraction of time BOINC is processing
|
2008-12-31 23:07:59 +00:00
|
|
|
//
|
|
|
|
double CLIENT_STATE::overall_cpu_frac() {
|
2010-08-23 05:00:22 +00:00
|
|
|
double x = time_stats.on_frac * time_stats.active_frac;
|
|
|
|
if (x < 0.01) x = 0.01;
|
|
|
|
if (x > 1) x = 1;
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
double CLIENT_STATE::overall_gpu_frac() {
|
|
|
|
double x = time_stats.on_frac * time_stats.gpu_active_frac;
|
|
|
|
if (x < 0.01) x = 0.01;
|
2012-06-26 20:30:56 +00:00
|
|
|
if (x > 1) x = 1;
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
double CLIENT_STATE::overall_cpu_and_network_frac() {
|
|
|
|
double x = time_stats.on_frac * time_stats.cpu_and_network_available_frac;
|
|
|
|
if (x < 0.01) x = 0.01;
|
2010-08-23 05:00:22 +00:00
|
|
|
if (x > 1) x = 1;
|
|
|
|
return x;
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// called when benchmarks change
|
|
|
|
//
|
|
|
|
void CLIENT_STATE::scale_duration_correction_factors(double factor) {
|
|
|
|
if (factor <= 0) return;
|
|
|
|
for (unsigned int i=0; i<projects.size(); i++) {
|
|
|
|
PROJECT* p = projects[i];
|
2012-03-23 16:25:19 +00:00
|
|
|
if (p->dont_use_dcf) continue;
|
2008-12-31 23:07:59 +00:00
|
|
|
p->duration_correction_factor *= factor;
|
|
|
|
}
|
2009-02-17 03:16:25 +00:00
|
|
|
if (log_flags.dcf_debug) {
|
2009-01-03 06:01:17 +00:00
|
|
|
msg_printf(NULL, MSG_INFO,
|
2009-02-17 03:16:25 +00:00
|
|
|
"[dcf] scaling all duration correction factors by %f",
|
2008-12-31 23:07:59 +00:00
|
|
|
factor
|
|
|
|
);
|
2009-01-03 06:01:17 +00:00
|
|
|
}
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Choose a new host CPID.
|
|
|
|
// If using account manager, do scheduler RPCs
|
|
|
|
// to all acct-mgr-attached projects to propagate the CPID
|
|
|
|
//
|
|
|
|
void CLIENT_STATE::generate_new_host_cpid() {
|
|
|
|
host_info.generate_host_cpid();
|
|
|
|
for (unsigned int i=0; i<projects.size(); i++) {
|
|
|
|
if (projects[i]->attached_via_acct_mgr) {
|
|
|
|
projects[i]->sched_rpc_pending = RPC_REASON_ACCT_MGR_REQ;
|
|
|
|
projects[i]->set_min_rpc_time(now + 15, "Sending new host CPID");
|
|
|
|
}
|
2007-02-21 22:27:35 +00:00
|
|
|
}
|
|
|
|
}
|
2007-02-21 16:58:29 +00:00
|
|
|
|