- client: brute-force attempt at eliminating domino-effect preemption:

if job A is unstarted and EDF,
    and there's a job B later in the list that is started,
    has the same app version,
    and has the same arrival time,
    then move A after B.
- client: remove the "temp_dcf" mechanism,
    which had the same goal but didn't work.
- client: in computing overall debt for a project,
    subtract a term that reflects pending work.
    This should reduce repeated fetches from the same project.
- client simulator: tweaks
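
In formula terms, the overall-debt change in the third bullet (implemented in set_overall_debts() in client/work_fetch.cpp below) works out to roughly the following sketch; the names are simplified stand-ins for the fields used in the diff, and queue_est is the estimated instance-seconds of nearly runnable work queued for a resource:

    // sketch only; see set_overall_debts() below for the actual code
    overall_debt = cpu_LTD - cpu_queue_est/ncpus;
    if (have_cuda) overall_debt += cuda_relative_speed*(cuda_LTD - cuda_queue_est/ncuda);
    if (have_ati)  overall_debt += ati_relative_speed*(ati_LTD - ati_queue_est/nati);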

svn path=/trunk/boinc/; revision=20223
David Anderson 2010-01-21 00:14:56 +00:00
parent 3d61129049
commit b5124fe729
10 changed files with 259 additions and 57 deletions

checkin_notes

@ -562,3 +562,27 @@ David 19 Jan 2010
client/
sim.cpp
rr_sim.cpp
David 19 Jan 2010
- client: brute-force attempt at eliminating domino-effect preemption:
if job A is unstarted and EDF,
and there's a job B that is later in the list,
is started, has the same app version,
and has the same arrival time,
move A after B.
- client: remove the "temp_dcf" mechanism,
which had the same goal but didn't work.
- client: in computing overall debt for a project,
subtract a term that reflects pending work.
This should reduce repeated fetches from the same project.
- client simulator: tweaks
client/
client_types.h
cpu_sched.cpp
rr_sim.cpp
sim.cpp,h
sim_util.cpp
work_fetch.cpp,h
html/user/
language_select.php

client/client_types.h

@ -434,7 +434,7 @@ struct APP_VERSION {
double max_working_set_size;
// max working set of tasks using this app version.
// temp var used in schedule_cpus()
double temp_dcf;
//double temp_dcf;
APP_VERSION(){}
~APP_VERSION(){}

client/cpu_sched.cpp

@ -42,6 +42,7 @@
#include <string>
#include <cstring>
#include <list>
#include "str_util.h"
#include "util.h"
@ -59,6 +60,7 @@
#endif
using std::vector;
using std::list;
#define DEADLINE_CUSHION 0
// try to finish jobs this much in advance of their deadline
@ -817,6 +819,67 @@ static void promote_multi_thread_jobs(vector<RESULT*>& runnable_jobs) {
}
}
// if job A is unstarted and EDF,
// and there's a job B later in the list that is started
// and has the same arrival time and app version,
// move A after B
//
static void demote_unstarted_edf(vector<RESULT*>& runnable_jobs) {
list<RESULT*> x;
RESULT *rp, *rp2;
// transfer to a list
//
unsigned int i;
for (i=0; i<runnable_jobs.size(); i++) {
x.push_back(runnable_jobs[i]);
}
// scan backwards through the list.
// if find a started job, scan backwards from there,
// looking for jobs to demote
//
for (list<RESULT*>::iterator p = x.end(); p != x.begin(); --p) {
if (p == x.end()) continue;     // don't dereference end()
rp = *p;
if (rp->not_started()) continue;
list<RESULT*>::iterator q = p;
--q;
while (1) {
rp2 = *q;
if (rp2->not_started()
&& (rp2->received_time==rp->received_time)
&& (rp2->avp==rp->avp)
&& rp2->edf_scheduled
) {
list<RESULT*>::iterator p2 = p;
p2++;
x.insert(p2, rp2);
if (q == x.begin()) {
x.erase(q);
break;
}
list<RESULT*>::iterator q2 = q;
--q;
x.erase(q2);
} else {
if (q == x.begin()) {
break;
}
--q;
}
}
}
// transfer back to vector
//
runnable_jobs.clear();
for (list<RESULT*>::iterator p = x.begin(); p!= x.end(); ++p) {
runnable_jobs.push_back(*p);
}
}
// return true if r0 is more important to run than r1
//
static inline bool more_important(RESULT* r0, RESULT* r1) {
@ -843,7 +906,7 @@ static inline bool more_important(RESULT* r0, RESULT* r1) {
if (!unfin0 && unfin1) return false;
// favor jobs selected first by schedule_cpus()
// (e.g., because their project has high debt)
// (e.g., because their project has high STD)
//
if (r0->seqno < r1->seqno) return true;
if (r0->seqno > r1->seqno) return false;
@ -1196,6 +1259,15 @@ bool CLIENT_STATE::enforce_schedule() {
//
append_unfinished_time_slice(runnable_jobs);
// If an unstarted EDF job has the same app version and arrival time
// as a started job, move it after that job in the list
// (see demote_unstarted_edf() above).
// This is a (crude) mechanism to avoid the situation
// where there's a set of unstarted EDF jobs,
// each of which runs for a little while, leaves EDF,
// and is preempted by the next.
//
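// Illustration with a hypothetical job list (not from this commit):
// suppose the list is [A, B, C], where B is started and A and C are
// unstarted EDF jobs with B's app version and arrival time.
// Without demotion, A preempts B, runs briefly, leaves EDF,
// and is in turn preempted by C (the domino effect).
// demote_unstarted_edf() reorders the list to [B, A, C], so B keeps running.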
demote_unstarted_edf(runnable_jobs);
// sort to-run list by decreasing importance
//
std::sort(

client/rr_sim.cpp

@ -221,6 +221,7 @@ void CLIENT_STATE::print_deadline_misses() {
}
}
#if 0
// compute a per-app-version "temporary DCF" based on the elapsed time
// and fraction done of running jobs
//
@ -238,6 +239,7 @@ void compute_temp_dcf() {
}
}
}
#endif
void CLIENT_STATE::rr_simulation() {
PROJECT* p, *pbest;
@ -248,7 +250,7 @@ void CLIENT_STATE::rr_simulation() {
double ar = available_ram();
work_fetch.rr_init();
compute_temp_dcf();
//compute_temp_dcf();
if (log_flags.rr_simulation) {
msg_printf(0, MSG_INFO,
@ -338,7 +340,8 @@ void CLIENT_STATE::rr_simulation() {
for (i=0; i<sim_status.active.size(); i++) {
rp = sim_status.active[i];
set_rrsim_flops(rp);
rp->rrsim_finish_delay = rp->avp->temp_dcf*rp->rrsim_flops_left/rp->rrsim_flops;
//rp->rrsim_finish_delay = rp->avp->temp_dcf*rp->rrsim_flops_left/rp->rrsim_flops;
rp->rrsim_finish_delay = rp->rrsim_flops_left/rp->rrsim_flops;
if (!rpbest || rp->rrsim_finish_delay < rpbest->rrsim_finish_delay) {
rpbest = rp;
}
@ -348,10 +351,9 @@ void CLIENT_STATE::rr_simulation() {
if (log_flags.rr_simulation) {
msg_printf(pbest, MSG_INFO,
"[rr_sim] %.2f: %s finishes after %.2f (%f * %.2fG/%.2fG)",
"[rr_sim] %.2f: %s finishes after %.2f (%.2fG/%.2fG)",
sim_now - now,
rpbest->name, rpbest->rrsim_finish_delay,
rpbest->avp->temp_dcf,
rpbest->rrsim_flops_left/1e9, rpbest->rrsim_flops/1e9
);
}

client/sim.cpp

@ -189,10 +189,13 @@ void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
init_ip_results(work_buf_min(), ncpus, ip_results);
}
// simulate trying to do an RPC
// return false if we didn't actually do one
//
bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
char buf[256];
SIM_PROJECT* p = (SIM_PROJECT*) _p;
static double last_time=0;
static double last_time=-1e9;
vector<IP_RESULT> ip_results;
int infeasible_count = 0;
@ -206,11 +209,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
}
last_time = now;
sprintf(buf, "RPC to %s; asking for %f<br>",
p->project_name, cpu_work_fetch.req_secs
sprintf(buf, "RPC to %s; asking for %f/%.2f<br>",
p->project_name, cpu_work_fetch.req_secs, cpu_work_fetch.req_instances
);
html_msg += buf;
msg_printf(0, MSG_INFO, buf);
handle_completed_results();
if (server_uses_workload) {
@ -219,7 +224,8 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
bool sent_something = false;
double work_left = cpu_work_fetch.req_secs;
while (work_left > 0) {
double instances_needed = cpu_work_fetch.req_instances;
while (work_left > 0 || instances_needed>0) {
RESULT* rp = new RESULT;
WORKUNIT* wup = new WORKUNIT;
make_job(p, wup, rp);
@ -246,17 +252,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
);
html_msg += buf;
work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops;
instances_needed -= 1;
}
if (cpu_work_fetch.req_secs > 0 && !sent_something) {
p->backoff();
return false;
} else {
}
p->nrpc_failures = 0;
if (sent_something) {
request_schedule_cpus("simulate_rpc");
request_work_fetch("simulate_rpc");
return true;
}
return true;
}
void SIM_PROJECT::backoff() {
@ -269,23 +276,62 @@ void SIM_PROJECT::backoff() {
bool CLIENT_STATE::scheduler_rpc_poll() {
PROJECT *p;
bool action = false;
static double last_time=0;
static double last_work_fetch_time = 0;
double elapsed_time;
// check only every 5 sec
//
if (now - last_time < SCHEDULER_RPC_POLL_PERIOD) {
msg_printf(NULL, MSG_INFO, "RPC poll: not time %f - %f < %f",
now, last_time, SCHEDULER_RPC_POLL_PERIOD
);
return false;
}
last_time = now;
msg_printf(NULL, MSG_INFO, "RPC poll start");
while (1) {
p = next_project_sched_rpc_pending();
if (p) {
return simulate_rpc(p);
work_fetch.compute_work_request(p);
action = simulate_rpc(p);
break;
}
p = find_project_with_overdue_results();
if (p) {
return simulate_rpc(p);
work_fetch.compute_work_request(p);
action = simulate_rpc(p);
break;
}
// should we check work fetch? Do this at most once/minute
if (must_check_work_fetch) {
last_work_fetch_time = 0;
}
elapsed_time = now - last_work_fetch_time;
if (elapsed_time < WORK_FETCH_PERIOD) {
return false;
}
must_check_work_fetch = false;
last_work_fetch_time = now;
p = work_fetch.choose_project();
if (p) {
return simulate_rpc(p);
action = simulate_rpc(p);
break;
}
msg_printf(NULL, MSG_INFO, "RPC poll: nothing to do");
return false;
break;
}
if (action) {
msg_printf(p, MSG_INFO, "RPC poll: did an RPC");
} else {
msg_printf(0, MSG_INFO, "RPC poll: didn't do an RPC");
}
return action;
}
bool ACTIVE_TASK_SET::poll() {
@ -303,9 +349,9 @@ bool ACTIVE_TASK_SET::poll() {
for (i=0; i<gstate.projects.size(); i++) {
p = (SIM_PROJECT*) gstate.projects[i];
p->idle = true;
sprintf(buf, "%s STD: %f min RPC<br>",
sprintf(buf, "%s STD: %f LTD %f<br>",
p->project_name, p->cpu_pwf.short_term_debt,
time_to_string(p->min_rpc_time)
p->pwf.overall_debt
);
gstate.html_msg += buf;
}
@ -606,11 +652,16 @@ void CLIENT_STATE::html_end(bool show_next) {
void CLIENT_STATE::simulate() {
bool action;
now = 0;
double start = START_TIME;
now = start;
html_start(false);
msg_printf(0, MSG_INFO,
"starting simultion. delta %f duration %f", delta, duration
);
while (1) {
running = host_info.available.sample(now);
while (1) {
msg_printf(0, MSG_INFO, "polling");
action = active_tasks.poll();
if (running) {
action |= handle_finished_apps();
@ -618,9 +669,11 @@ void CLIENT_STATE::simulate() {
action |= enforce_schedule();
action |= scheduler_rpc_poll();
}
msg_printf(0, MSG_INFO, action?"did action":"did no action");
if (!action) break;
}
now += delta;
msg_printf(0, MSG_INFO, "took time step");
for (unsigned int i=0; i<active_tasks.active_tasks.size(); i++) {
ACTIVE_TASK* atp = active_tasks.active_tasks[i];
if (atp->task_state() == PROCESS_EXECUTING) {
@ -628,7 +681,7 @@ void CLIENT_STATE::simulate() {
}
}
html_rec();
if (now > duration) break;
if (now > start + duration) break;
}
html_end(false);
}

client/sim.h

@ -90,7 +90,7 @@ public:
double lambda;
int parse(XML_PARSER&, char* end_tag);
bool sample(double);
void init();
void init(double);
RANDOM_PROCESS();
};
@ -313,6 +313,8 @@ extern bool dual_dcf;
extern bool work_fetch_old;
extern bool gpus_usable;
#define SCHEDULER_RPC_POLL_PERIOD 5.0
#define WORK_FETCH_PERIOD 60
#define CPU_SCHED_ENFORCE_PERIOD 60
@ -325,3 +327,6 @@ extern bool gpus_usable;
#define MAX_STD (86400)
// maximum short-term debt
#define START_TIME 946684800
// Jan 1 2000

client/sim_util.cpp

@ -395,6 +395,10 @@ bool RANDOM_PROCESS::sample(double t) {
value = true;
}
}
msg_printf(0, MSG_INFO,
"value: %d lambda: %f t %f time_left %f",
value, lambda, t, time_left
);
return value;
}
@ -402,7 +406,8 @@ RANDOM_PROCESS::RANDOM_PROCESS() {
frac = 1;
}
void RANDOM_PROCESS::init() {
void RANDOM_PROCESS::init(double st) {
last_time = st;
value = true;
time_left = exponential(lambda);
off_lambda = lambda/frac - lambda;
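
The hunks above show only fragments of RANDOM_PROCESS::sample() and init(). For context on the new init(double st) signature, here is a minimal self-contained sketch, under the assumption (not confirmed by this diff) that RANDOM_PROCESS models on/off availability with exponentially distributed period lengths: "on" periods have mean lambda, "off" periods have mean off_lambda, so the long-run on-fraction is frac.

    // Sketch of an on/off availability process (editorial assumption,
    // not the committed RANDOM_PROCESS code).
    #include <cmath>
    #include <cstdlib>

    struct ON_OFF_PROCESS {
        double lambda;       // mean length of an "on" period
        double frac;         // long-run fraction of time spent "on"
        double off_lambda;   // mean length of an "off" period
        double last_time;    // time of the previous sample
        double time_left;    // time until the next on/off transition
        bool value;          // current state: true = "on"

        // draw from an exponential distribution with the given mean
        static double exponential(double mean) {
            double u = (std::rand() + 1.0)/((double)RAND_MAX + 2.0);
            return -mean*std::log(u);
        }
        void init(double st) {
            last_time = st;
            value = true;
            time_left = exponential(lambda);
            off_lambda = lambda/frac - lambda;
        }
        // advance the process to time t; return whether it is "on" at t
        bool sample(double t) {
            double dt = t - last_time;
            last_time = t;
            while (dt > time_left) {
                dt -= time_left;
                value = !value;
                time_left = exponential(value ? lambda : off_lambda);
            }
            time_left -= dt;
            return value;
        }
    };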
@ -502,23 +507,23 @@ int SIM_HOST::parse(XML_PARSER& xp) {
bool is_tag;
int retval;
p_ncpus = 1;
connection_interval = 0;
p_ncpus = 1;
while(!xp.get(tag, sizeof(tag), is_tag)) {
if (!is_tag) return ERR_XML_PARSE;
if (!strcmp(tag, "/host")) return 0;
else if (xp.parse_double(tag, "p_fpops", p_fpops)) continue;
else if (xp.parse_double(tag, "m_nbytes", m_nbytes)) continue;
else if (xp.parse_double(tag, "connection_interval", connection_interval)) continue;
else if (xp.parse_int(tag, "p_ncpus", p_ncpus)) continue;
else if (xp.parse_double(tag, "connection_interval", connection_interval)) continue;
else if (!strcmp(tag, "available")) {
retval = available.parse(xp, "/available");
if (retval) return retval;
available.init();
available.init(START_TIME);
} else if (!strcmp(tag, "idle")) {
retval = idle.parse(xp, "/idle");
if (retval) return retval;
idle.init();
idle.init(START_TIME);
} else {
printf("unrecognized: %s\n", tag);
return ERR_XML_PARSE;

client/work_fetch.cpp

@ -513,12 +513,13 @@ void RSC_WORK_FETCH::update_long_term_debts() {
double offset;
double delta_limit = secs_this_debt_interval;
if (max_debt > -2*delta_limit) {
if (fabs(max_debt) < 1e-6) max_debt = 0;
offset = max_debt?-max_debt:0; // avoid -0
} else {
offset = 2*delta_limit;
}
if (log_flags.debt_debug) {
msg_printf(0, MSG_INFO, "[debt] %s LTD: adding offset %.2f",
msg_printf(0, MSG_INFO, "[debt] %s LTD: adding offset %f",
rsc_name(rsc_type), offset
);
}
@ -658,15 +659,43 @@ void WORK_FETCH::set_all_requests(PROJECT* p) {
}
}
// Compute an "overall long-term debt" for each project.
// This is a sum of per-resource terms, scaled by the relative speed of the resource.
// The term for a resource is its LTD minus an estimate of queued work.
//
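// Example with hypothetical numbers: on a 4-CPU host, a project with
// CPU LTD 3600 and 8000 instance-seconds of queued CPU work gets
// overall_debt = 3600 - 8000/4 = 1600, so queued work lowers the
// project's standing in the next work-fetch decision.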
void WORK_FETCH::set_overall_debts() {
for (unsigned i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
p->pwf.overall_debt = p->cpu_pwf.long_term_debt;
unsigned int i;
PROJECT* p;
RESULT* rp;
APP_VERSION* avp;
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
p->cpu_pwf.queue_est = 0;
p->cuda_pwf.queue_est = 0;
p->ati_pwf.queue_est = 0;
}
for (i=0; i<gstate.results.size(); i++) {
rp = gstate.results[i];
p = rp->project;
if (!rp->nearly_runnable()) continue;
if (p->non_cpu_intensive) continue;
double dt = rp->estimated_time_remaining(false);
avp = rp->avp;
p->cpu_pwf.queue_est += dt*avp->avg_ncpus;
p->cuda_pwf.queue_est += dt*avp->ncudas;
p->ati_pwf.queue_est += dt*avp->natis;
}
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
p->pwf.overall_debt = p->cpu_pwf.long_term_debt - p->cpu_pwf.queue_est/gstate.ncpus;
if (coproc_cuda) {
p->pwf.overall_debt += cuda_work_fetch.speed*p->cuda_pwf.long_term_debt;
p->pwf.overall_debt += cuda_work_fetch.relative_speed*
(p->cuda_pwf.long_term_debt - p->cuda_pwf.queue_est/coproc_cuda->count);
}
if (coproc_ati) {
p->pwf.overall_debt += ati_work_fetch.speed*p->ati_pwf.long_term_debt;
p->pwf.overall_debt += ati_work_fetch.relative_speed*
(p->ati_pwf.long_term_debt - p->ati_pwf.queue_est/coproc_ati->count);
}
}
}
@ -769,6 +798,10 @@ PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() {
PROJECT* WORK_FETCH::choose_project() {
PROJECT* p = 0;
if (log_flags.work_fetch_debug) {
msg_printf(0, MSG_INFO, "[wfd]: work fetch start");
}
p = non_cpu_intensive_project_needing_work();
if (p) return p;
@ -893,6 +926,7 @@ void WORK_FETCH::compute_shares() {
if (!p->pwf.can_fetch_work) continue;
if (p->cpu_pwf.may_have_work) {
p->cpu_pwf.fetchable_share = p->resource_share/cpu_work_fetch.total_fetchable_share;
msg_printf(p, MSG_INFO, "FS: %f = %f/%f\n", p->cpu_pwf.fetchable_share, p->resource_share, cpu_work_fetch.total_fetchable_share);
}
if (coproc_cuda && p->cuda_pwf.may_have_work) {
p->cuda_pwf.fetchable_share = p->resource_share/cuda_work_fetch.total_fetchable_share;

client/work_fetch.h

@ -45,10 +45,14 @@ struct RSC_PROJECT_WORK_FETCH {
// the following used by debt accounting
double anticipated_debt;
// short-term debt, adjusted by scheduled jobs
double secs_this_debt_interval;
inline void reset_debt_accounting() {
secs_this_debt_interval = 0;
}
double queue_est;
// an estimate of instance-secs of queued work;
// a temp used in computing overall debts
// the following are used by rr_simulation()
//
@ -157,7 +161,7 @@ struct BUSY_TIME_ESTIMATOR {
struct RSC_WORK_FETCH {
int rsc_type;
int ninstances;
double speed; // total FLOPS relative to CPU total FLOPS
double relative_speed; // total FLOPS relative to CPU total FLOPS
// the following used/set by rr_simulation():
//
@ -180,7 +184,7 @@ struct RSC_WORK_FETCH {
void init(int t, int n, double sp) {
rsc_type = t;
ninstances = n;
speed = sp;
relative_speed = sp;
busy_time_estimator.init(n);
}
// the following specify the work request for this resource

html/user/language_select.php

@ -41,24 +41,27 @@ if ($set_lang){
page_head(tra("Language selection"));
echo "<p>",
tra("This web site is available in several languages. ".
"The currently selected language is: %1 (%2).",
tra(
"This web site is available in several languages. The currently selected language is: %1 (%2).",
"<em>".tra("LANG_NAME_INTERNATIONAL")."</em>",
tra("LANG_NAME_NATIVE")),
tra("LANG_NAME_NATIVE")
),
"</p>",
"<p>",
tra("Normally the choice of language ".
"is determined by your browser's language setting, ".
"which is: %1. ".
"You can change this setting using:", "<b>$prefs</b>"),
tra(
"Normally the choice of language is determined by your browser's language setting, which is: %1. You can change this setting using:",
"<b>$prefs</b>"
),
"</p><ul>",
"<li>", tra("Firefox: Tools/Options/General"),
"<li>", tra("Microsoft IE: Tools/Internet Options/Languages"),
"<li>",
tra("Firefox: Tools/Options/General"),
"<li>",
tra("Microsoft IE: Tools/Internet Options/Languages"),
"</ul>",
"<p>",
tra("Or you can select a language by clicking on one of the links. ".
"This will send your browser a cookie; ".
"make sure your browser accepts cookies from our domain."),
tra(
"Or you can select a language by clicking on one of the links. This will send your browser a cookie; make sure your browser accepts cookies from our domain."
),
"</p>"
;