2008-11-14 22:08:50 +00:00
|
|
|
// This file is part of BOINC.
|
|
|
|
// http://boinc.berkeley.edu
|
|
|
|
// Copyright (C) 2008 University of California
|
|
|
|
//
|
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
|
|
|
//
|
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
#ifdef _WIN32
|
|
|
|
#include "boinc_win.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef SIM
|
|
|
|
#include "sim.h"
|
|
|
|
#else
|
|
|
|
#include "client_state.h"
|
|
|
|
#endif
|
2008-12-31 23:07:59 +00:00
|
|
|
|
|
|
|
#include "coproc.h"
|
2008-11-14 22:08:50 +00:00
|
|
|
#include "client_msgs.h"
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
// this is here (rather than rr_sim.h) because its inline functions
|
|
|
|
// refer to RESULT
|
|
|
|
//
|
2008-11-14 22:08:50 +00:00
|
|
|
struct RR_SIM_STATUS {
|
|
|
|
std::vector<RESULT*> active;
|
|
|
|
COPROCS coprocs;
|
|
|
|
double active_ncpus;
|
2008-12-31 23:07:59 +00:00
|
|
|
double active_cudas;
|
2008-11-14 22:08:50 +00:00
|
|
|
|
|
|
|
inline bool can_run(RESULT* rp) {
|
|
|
|
return coprocs.sufficient_coprocs(
|
2008-12-08 23:00:23 +00:00
|
|
|
rp->avp->coprocs, log_flags.rr_simulation, "rr_sim"
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-03 23:30:54 +00:00
|
|
|
inline void activate(RESULT* rp, double when) {
|
2008-11-14 22:08:50 +00:00
|
|
|
coprocs.reserve_coprocs(
|
2008-12-08 23:00:23 +00:00
|
|
|
rp->avp->coprocs, rp, log_flags.rr_simulation, "rr_sim"
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
2008-12-03 23:30:54 +00:00
|
|
|
if (log_flags.rr_simulation) {
|
|
|
|
msg_printf(rp->project, MSG_INFO,
|
|
|
|
"[rr_sim] starting at %f: %s", when, rp->name
|
|
|
|
);
|
|
|
|
}
|
2008-11-14 22:08:50 +00:00
|
|
|
active.push_back(rp);
|
|
|
|
active_ncpus += rp->avp->avg_ncpus;
|
2008-12-31 23:07:59 +00:00
|
|
|
active_cudas += rp->avp->ncudas;
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
|
|
|
// remove *rpbest from active set,
|
|
|
|
// and adjust CPU time left for other results
|
|
|
|
//
|
|
|
|
inline void remove_active(RESULT* rpbest) {
|
2008-12-08 23:00:23 +00:00
|
|
|
coprocs.free_coprocs(rpbest->avp->coprocs, rpbest, log_flags.rr_simulation, "rr_sim");
|
2008-11-14 22:08:50 +00:00
|
|
|
vector<RESULT*>::iterator it = active.begin();
|
|
|
|
while (it != active.end()) {
|
|
|
|
RESULT* rp = *it;
|
|
|
|
if (rp == rpbest) {
|
|
|
|
it = active.erase(it);
|
|
|
|
} else {
|
2008-12-02 03:58:32 +00:00
|
|
|
rp->rrsim_flops_left -= rp->rrsim_flops*rpbest->rrsim_finish_delay;
|
2008-11-14 22:08:50 +00:00
|
|
|
|
|
|
|
// can be slightly less than 0 due to roundoff
|
|
|
|
//
|
2008-12-02 03:58:32 +00:00
|
|
|
if (rp->rrsim_flops_left < -1) {
|
2008-11-14 22:08:50 +00:00
|
|
|
msg_printf(rp->project, MSG_INTERNAL_ERROR,
|
2008-12-02 03:58:32 +00:00
|
|
|
"%s: negative FLOPs left %f", rp->name, rp->rrsim_flops_left
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-02 03:58:32 +00:00
|
|
|
if (rp->rrsim_flops_left < 0) {
|
|
|
|
rp->rrsim_flops_left = 0;
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
|
|
|
it++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
active_ncpus -= rpbest->avp->avg_ncpus;
|
2008-12-31 23:07:59 +00:00
|
|
|
active_cudas -= rpbest->avp->ncudas;
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
2008-12-09 20:01:01 +00:00
|
|
|
|
2008-11-14 22:08:50 +00:00
|
|
|
RR_SIM_STATUS() {
|
|
|
|
active_ncpus = 0;
|
|
|
|
}
|
|
|
|
~RR_SIM_STATUS() {
|
|
|
|
coprocs.delete_coprocs();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Add rp to this project's active set and add the CPU and CUDA
// usage of its app version to the project's running totals.
//
void RR_SIM_PROJECT_STATUS::activate(RESULT* rp) {
    active.push_back(rp);

    // the two totals are independent of each other
    active_cudas += rp->avp->ncudas;
    active_ncpus += rp->avp->avg_ncpus;
}
|
|
|
|
|
|
|
|
// Project-level admission test for the simulation.
// A result that uses coprocessors always passes here;
// any other result passes only while the project's
// active CPU usage is below ncpus.
//
bool RR_SIM_PROJECT_STATUS::can_run(RESULT* rp, int ncpus) {
    return rp->uses_coprocs() || (active_ncpus < ncpus);
}
|
|
|
|
// Erase every occurrence of r from this project's active set,
// then subtract the CPU and CUDA usage of its app version
// from the project's totals.
// The totals are decremented whether or not r was found.
//
void RR_SIM_PROJECT_STATUS::remove_active(RESULT* r) {
    for (std::vector<RESULT*>::iterator pos = active.begin(); pos != active.end(); ) {
        if (*pos != r) {
            ++pos;
            continue;
        }
        // erase() returns the iterator following the removed element
        pos = active.erase(pos);
    }
    active_ncpus -= r->avp->avg_ncpus;
    active_cudas -= r->avp->ncudas;
}
|
|
|
|
|
2008-12-09 20:01:01 +00:00
|
|
|
// estimate the rate (FLOPS) that this job will get long-term
|
|
|
|
// with weighted round-robin scheduling
|
2008-11-14 22:08:50 +00:00
|
|
|
//
|
2008-12-31 23:07:59 +00:00
|
|
|
void set_rrsim_flops(RESULT* rp) {
|
2008-12-09 20:01:01 +00:00
|
|
|
// if it's a coproc job, use app version estimate
|
|
|
|
if (rp->uses_coprocs()) {
|
|
|
|
rp->rrsim_flops = rp->avp->flops;
|
|
|
|
return;
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
2008-12-09 20:01:01 +00:00
|
|
|
PROJECT* p = rp->project;
|
2008-11-14 22:08:50 +00:00
|
|
|
|
2008-12-09 20:01:01 +00:00
|
|
|
// first, estimate how many CPU seconds per second this job would get
|
|
|
|
// running with other jobs of this project, ignoring other factors
|
|
|
|
//
|
|
|
|
double x = 1;
|
2008-12-31 23:07:59 +00:00
|
|
|
if (p->rr_sim_status.active_ncpus > gstate.ncpus) {
|
|
|
|
x = gstate.ncpus/p->rr_sim_status.active_ncpus;
|
2008-12-09 20:01:01 +00:00
|
|
|
}
|
|
|
|
double r1 = x*rp->avp->avg_ncpus;
|
|
|
|
|
|
|
|
// if the project's total CPU usage is more than its share, scale
|
|
|
|
//
|
2009-01-03 06:01:17 +00:00
|
|
|
double share_cpus = p->cpu_pwf.runnable_share*gstate.ncpus;
|
2008-12-09 20:01:01 +00:00
|
|
|
double r2 = r1;
|
|
|
|
if (p->rr_sim_status.active_ncpus > share_cpus) {
|
|
|
|
r2 *= (share_cpus / p->rr_sim_status.active_ncpus);
|
|
|
|
}
|
|
|
|
|
|
|
|
// scale by overall CPU availability
|
|
|
|
//
|
2008-12-31 23:07:59 +00:00
|
|
|
double r3 = r2 * gstate.overall_cpu_frac();
|
2008-12-09 20:01:01 +00:00
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
rp->rrsim_flops = r3 * gstate.host_info.p_fpops;
|
2008-11-14 22:08:50 +00:00
|
|
|
if (log_flags.rr_simulation) {
|
2008-12-09 20:01:01 +00:00
|
|
|
msg_printf(p, MSG_INFO,
|
|
|
|
"[rr_sim] set_rrsim_flops: %f (r1 %f r2 %f r3 %f)",
|
|
|
|
rp->rrsim_flops, r1, r2, r3
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void CLIENT_STATE::print_deadline_misses() {
|
|
|
|
unsigned int i;
|
|
|
|
RESULT* rp;
|
|
|
|
PROJECT* p;
|
|
|
|
for (i=0; i<results.size(); i++){
|
|
|
|
rp = results[i];
|
2009-01-03 06:01:17 +00:00
|
|
|
if (rp->rr_sim_misses_deadline) {
|
2008-11-14 22:08:50 +00:00
|
|
|
msg_printf(rp->project, MSG_INFO,
|
2009-01-03 06:01:17 +00:00
|
|
|
"[cpu_sched_debug] Result %s projected to miss deadline.",
|
|
|
|
rp->name
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (i=0; i<projects.size(); i++) {
|
|
|
|
p = projects[i];
|
|
|
|
if (p->rr_sim_status.deadlines_missed) {
|
|
|
|
msg_printf(p, MSG_INFO,
|
|
|
|
"[cpu_sched_debug] Project has %d projected deadline misses",
|
|
|
|
p->rr_sim_status.deadlines_missed
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Do a simulation of the current workload
|
|
|
|
// with weighted round-robin (WRR) scheduling.
|
|
|
|
// Include jobs that are downloading.
|
|
|
|
//
|
2009-01-03 06:01:17 +00:00
|
|
|
// For efficiency, we simulate an approximation of WRR.
|
2008-11-14 22:08:50 +00:00
|
|
|
// We don't model time-slicing.
|
|
|
|
// Instead we use a continuous model where, at a given point,
|
|
|
|
// each project has a set of running jobs that uses at most all CPUs
|
|
|
|
// (and obeys coprocessor limits).
|
|
|
|
// These jobs are assumed to run at a rate proportionate to their avg_ncpus,
|
|
|
|
// and each project gets CPU proportionate to its RRS.
|
|
|
|
//
|
|
|
|
// Outputs are changes to global state:
|
2009-01-03 06:01:17 +00:00
|
|
|
// - deadline misses (per-project count, per-result flag)
|
|
|
|
// Deadline misses are not counted for tasks
|
|
|
|
// that are too large to run in RAM right now.
|
|
|
|
// - resource shortfalls (per-project and total)
|
|
|
|
// - counts of resources idle now
|
2008-11-14 22:08:50 +00:00
|
|
|
//
|
|
|
|
void CLIENT_STATE::rr_simulation() {
|
|
|
|
PROJECT* p, *pbest;
|
|
|
|
RESULT* rp, *rpbest;
|
|
|
|
RR_SIM_STATUS sim_status;
|
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
sim_status.coprocs.clone(coprocs, false);
|
|
|
|
double ar = available_ram();
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
work_fetch.rr_init();
|
|
|
|
|
2008-11-14 22:08:50 +00:00
|
|
|
if (log_flags.rr_simulation) {
|
|
|
|
msg_printf(0, MSG_INFO,
|
2008-12-31 23:07:59 +00:00
|
|
|
"[rr_sim] rr_sim start: now %f work_buf_total %f",
|
|
|
|
now, work_buf_total()
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i=0; i<projects.size(); i++) {
|
|
|
|
p = projects[i];
|
|
|
|
if (p->non_cpu_intensive) continue;
|
|
|
|
p->rr_sim_status.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decide what jobs to include in the simulation,
|
|
|
|
// and pick the ones that are initially running
|
|
|
|
//
|
|
|
|
for (i=0; i<results.size(); i++) {
|
|
|
|
rp = results[i];
|
|
|
|
if (!rp->nearly_runnable()) continue;
|
|
|
|
if (rp->some_download_stalled()) continue;
|
|
|
|
if (rp->project->non_cpu_intensive) continue;
|
2008-12-02 03:58:32 +00:00
|
|
|
rp->rrsim_flops_left = rp->estimated_flops_remaining();
|
|
|
|
if (rp->rrsim_flops_left <= 0) continue;
|
2008-11-14 22:08:50 +00:00
|
|
|
p = rp->project;
|
|
|
|
if (p->rr_sim_status.can_run(rp, gstate.ncpus) && sim_status.can_run(rp)) {
|
2008-12-03 23:30:54 +00:00
|
|
|
sim_status.activate(rp, now);
|
2008-11-14 22:08:50 +00:00
|
|
|
p->rr_sim_status.activate(rp);
|
|
|
|
} else {
|
|
|
|
p->rr_sim_status.add_pending(rp);
|
|
|
|
}
|
|
|
|
rp->rr_sim_misses_deadline = false;
|
2008-12-08 23:00:23 +00:00
|
|
|
if (rp->uses_coprocs()) {
|
2008-12-31 23:07:59 +00:00
|
|
|
p->rr_sim_status.has_cuda_jobs = true;
|
2008-12-08 23:00:23 +00:00
|
|
|
} else {
|
|
|
|
p->rr_sim_status.has_cpu_jobs = true;
|
|
|
|
}
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
// note the number of idle instances
|
|
|
|
//
|
|
|
|
cpu_work_fetch.nidle_now = ncpus - sim_status.active_ncpus;
|
2009-01-08 00:20:04 +00:00
|
|
|
if (cpu_work_fetch.nidle_now < 0) cpu_work_fetch.nidle_now = 0;
|
2008-12-31 23:07:59 +00:00
|
|
|
if (coproc_cuda) {
|
|
|
|
cuda_work_fetch.nidle_now = coproc_cuda->count - coproc_cuda->used;
|
2009-01-08 00:20:04 +00:00
|
|
|
if (cuda_work_fetch.nidle_now < 0) cuda_work_fetch.nidle_now = 0;
|
2008-12-31 23:07:59 +00:00
|
|
|
}
|
|
|
|
|
2009-01-03 06:01:17 +00:00
|
|
|
work_fetch.compute_shares();
|
2008-11-14 22:08:50 +00:00
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
// Simulation loop. Keep going until all work done
|
2008-11-14 22:08:50 +00:00
|
|
|
//
|
2009-01-03 06:01:17 +00:00
|
|
|
double buf_end = now + work_buf_total();
|
2008-11-14 22:08:50 +00:00
|
|
|
double sim_now = now;
|
|
|
|
while (sim_status.active.size()) {
|
|
|
|
|
|
|
|
// compute finish times and see which result finishes first
|
|
|
|
//
|
|
|
|
rpbest = NULL;
|
|
|
|
for (i=0; i<sim_status.active.size(); i++) {
|
|
|
|
rp = sim_status.active[i];
|
2008-12-31 23:07:59 +00:00
|
|
|
set_rrsim_flops(rp);
|
2008-12-02 03:58:32 +00:00
|
|
|
rp->rrsim_finish_delay = rp->rrsim_flops_left/rp->rrsim_flops;
|
2008-11-14 22:08:50 +00:00
|
|
|
if (!rpbest || rp->rrsim_finish_delay < rpbest->rrsim_finish_delay) {
|
|
|
|
rpbest = rp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pbest = rpbest->project;
|
|
|
|
|
|
|
|
if (log_flags.rr_simulation) {
|
|
|
|
msg_printf(pbest, MSG_INFO,
|
|
|
|
"[rr_sim] result %s finishes after %f (%f/%f)",
|
|
|
|
rpbest->name, rpbest->rrsim_finish_delay,
|
2008-12-02 03:58:32 +00:00
|
|
|
rpbest->rrsim_flops_left, rpbest->rrsim_flops
|
2008-11-14 22:08:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
// "rpbest" is first result to finish. Does it miss its deadline?
|
|
|
|
//
|
|
|
|
double diff = sim_now + rpbest->rrsim_finish_delay - ((rpbest->computation_deadline()-now)*CPU_PESSIMISM_FACTOR + now);
|
|
|
|
if (diff > 0) {
|
|
|
|
ACTIVE_TASK* atp = lookup_active_task_by_result(rpbest);
|
|
|
|
if (atp && atp->procinfo.working_set_size_smoothed > ar) {
|
|
|
|
if (log_flags.rr_simulation) {
|
|
|
|
msg_printf(pbest, MSG_INFO,
|
|
|
|
"[rr_sim] result %s misses deadline but too large to run",
|
|
|
|
rpbest->name
|
|
|
|
);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
rpbest->rr_sim_misses_deadline = true;
|
|
|
|
pbest->rr_sim_status.deadlines_missed++;
|
|
|
|
if (log_flags.rr_simulation) {
|
|
|
|
msg_printf(pbest, MSG_INFO,
|
|
|
|
"[rr_sim] result %s misses deadline by %f",
|
|
|
|
rpbest->name, diff
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-31 23:07:59 +00:00
|
|
|
// increment resource shortfalls
|
2008-12-03 23:30:54 +00:00
|
|
|
//
|
|
|
|
if (sim_now < buf_end) {
|
|
|
|
double end_time = sim_now + rpbest->rrsim_finish_delay;
|
|
|
|
if (end_time > buf_end) end_time = buf_end;
|
|
|
|
double d_time = end_time - sim_now;
|
2008-12-31 23:07:59 +00:00
|
|
|
|
|
|
|
if (sim_status.active_ncpus >= ncpus) {
|
|
|
|
work_fetch.estimated_delay = end_time - gstate.now;
|
|
|
|
}
|
|
|
|
|
|
|
|
cpu_work_fetch.accumulate_shortfall(d_time, sim_status.active_ncpus);
|
|
|
|
|
|
|
|
if (coproc_cuda) {
|
|
|
|
cuda_work_fetch.accumulate_shortfall(d_time, sim_status.active_cudas);
|
2008-12-03 23:30:54 +00:00
|
|
|
}
|
|
|
|
}
|
2008-11-14 22:08:50 +00:00
|
|
|
|
|
|
|
sim_status.remove_active(rpbest);
|
|
|
|
pbest->rr_sim_status.remove_active(rpbest);
|
|
|
|
|
|
|
|
// If project has more results, add one or more to active set.
|
2008-11-24 18:57:04 +00:00
|
|
|
// TODO: do this for other projects too, since coproc may have been freed
|
2008-11-14 22:08:50 +00:00
|
|
|
//
|
|
|
|
while (1) {
|
|
|
|
rp = pbest->rr_sim_status.get_pending();
|
|
|
|
if (!rp) break;
|
|
|
|
if (pbest->rr_sim_status.can_run(rp, gstate.ncpus) && sim_status.can_run(rp)) {
|
2008-12-03 23:30:54 +00:00
|
|
|
sim_status.activate(rp, sim_now);
|
2008-11-14 22:08:50 +00:00
|
|
|
pbest->rr_sim_status.activate(rp);
|
|
|
|
} else {
|
|
|
|
pbest->rr_sim_status.add_pending(rp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If all work done for a project, subtract that project's share
|
|
|
|
//
|
|
|
|
if (pbest->rr_sim_status.none_active()) {
|
2008-12-31 23:07:59 +00:00
|
|
|
if (pbest->rr_sim_status.has_cpu_jobs) {
|
2009-01-03 06:01:17 +00:00
|
|
|
cpu_work_fetch.total_runnable_share -= pbest->resource_share;
|
2008-12-09 20:01:01 +00:00
|
|
|
}
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
sim_now += rpbest->rrsim_finish_delay;
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:30:54 +00:00
|
|
|
// if simulation ends before end of buffer, take the tail into account
|
|
|
|
//
|
|
|
|
if (sim_now < buf_end) {
|
|
|
|
double d_time = buf_end - sim_now;
|
2008-12-31 23:07:59 +00:00
|
|
|
cpu_work_fetch.accumulate_shortfall(d_time, 0);
|
|
|
|
if (coproc_cuda) {
|
|
|
|
cuda_work_fetch.accumulate_shortfall(d_time, 0);
|
|
|
|
}
|
2008-11-14 22:08:50 +00:00
|
|
|
}
|
|
|
|
}
|