mirror of https://github.com/BOINC/boinc.git
*** empty log message ***
svn path=/trunk/boinc/; revision=6325
parent d74f627d41
commit 289db963ab
@@ -7596,3 +7596,18 @@ David 8 June 2005
     cs_apps.C
     cs_benchmark.C
     net_stats.C
+
+David 9 June 2005
+    - Changed variable names from "work_done" to "wall_cpu_time";
+      it's important to emphasize that debt accounting
+      is done on the basis of how long a process is running,
+      NOT how much CPU time it actually or reportedly got.
+    - don't count non-CPU-intensive apps in calculation of
+      total_wall_cpu_time_this_period
+    - fixed crash if you run core client with -attach_project X
+
+    client/
+        client_state.C,h
+        client_types.C,h
+        cs_apps.C
+        cs_cmdline.C
@@ -115,7 +115,7 @@ CLIENT_STATE::CLIENT_STATE() {
     cpu_earliest_deadline_first = false;

     cpu_sched_last_time = 0;
-    cpu_sched_work_done_this_period = 0;
+    total_wall_cpu_time_this_period = 0;
     must_schedule_cpus = true;
 }
@@ -156,8 +156,8 @@ private:
     // CPU sched state
     //
     double cpu_sched_last_time;
-    double cpu_sched_work_done_this_period;
-        // CPU time since last schedule_cpus()
+    double total_wall_cpu_time_this_period;
+        // "wall CPU time" accumulated since last schedule_cpus()
     bool work_fetch_no_new_work;
     bool cpu_earliest_deadline_first;
@@ -93,7 +93,7 @@ void PROJECT::init() {
     strcpy(code_sign_key, "");
     user_files.clear();
     anticipated_debt = 0;
-    work_done_this_period = 0;
+    wall_cpu_time_this_period = 0;
     next_runnable_result = NULL;
     work_request = 0;
     work_request_urgency = WORK_FETCH_DONT_NEED;
@@ -248,9 +248,9 @@ public:

     double anticipated_debt;
         // expected debt by the end of the preemption period
-    double work_done_this_period;
-        // how much CPU time has been devoted to this
-        // project in the current period (secs)
+    double wall_cpu_time_this_period;
+        // how much "wall CPU time" has been devoted to this
+        // project in the current scheduling period (secs)
     struct RESULT *next_runnable_result;
         // the next result to run for this project
@@ -83,7 +83,6 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) {
     char path[256];
     int retval;
     double size;
-    double task_cpu_time;
     bool had_error = false;

@@ -157,9 +156,9 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) {
         );
     }

-    task_cpu_time = at.current_cpu_time - at.cpu_time_at_last_sched;
-    at.result->project->work_done_this_period += task_cpu_time;
-    cpu_sched_work_done_this_period += task_cpu_time;
+    double wall_cpu_time = now - cpu_sched_last_time;
+    at.result->project->wall_cpu_time_this_period += wall_cpu_time;
+    total_wall_cpu_time_this_period += wall_cpu_time;

     return 0;
 }
@@ -394,18 +393,29 @@ void CLIENT_STATE::adjust_debts() {
     int count_cpu_intensive = 0;
     PROJECT *p;
     double min_short_term_debt=0, share_frac;
-    double elapsed_time = gstate.now - cpu_sched_last_time;
+    double wall_cpu_time = gstate.now - cpu_sched_last_time;

     SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK);

-    // total up work done since last CPU reschedule
+    // Total up total and per-project "wall CPU" since last CPU reschedule.
+    // "Wall CPU" is the wall time during which a task was
+    // runnable (at the OS level).
+    //
+    // We use wall CPU for debt calculation
+    // (instead of reported actual CPU) for two reasons:
+    // 1) the process might have paged a lot, so the actual CPU
+    //    may be a lot less than wall CPU
+    // 2) BOINC relies on apps to report their CPU time.
+    //    Sometimes there are bugs and apps report zero CPU.
+    //    It's safer not to trust them.
     //
     for (i=0; i<active_tasks.active_tasks.size(); i++) {
         ACTIVE_TASK* atp = active_tasks.active_tasks[i];
         if (atp->scheduler_state != CPU_SCHED_SCHEDULED) continue;
-        double task_cpu_time = elapsed_time;
-        atp->result->project->work_done_this_period += task_cpu_time;
-        cpu_sched_work_done_this_period += task_cpu_time;
+        if (atp->result->project->non_cpu_intensive) continue;
+        atp->result->project->wall_cpu_time_this_period += wall_cpu_time;
+        total_wall_cpu_time_this_period += wall_cpu_time;
     }

     // find total resource shares of runnable and potentially runnable projects
@@ -429,8 +439,8 @@ void CLIENT_STATE::adjust_debts() {
         //
         if (p->potentially_runnable()) {
             share_frac = p->resource_share/potentially_runnable_resource_share;
-            p->long_term_debt += share_frac*cpu_sched_work_done_this_period
-                - p->work_done_this_period
+            p->long_term_debt += share_frac*total_wall_cpu_time_this_period
+                - p->wall_cpu_time_this_period
             ;
         }
         total_long_term_debt += p->long_term_debt;
@@ -442,8 +452,8 @@ void CLIENT_STATE::adjust_debts() {
             p->anticipated_debt = 0;
         } else {
             share_frac = p->resource_share/runnable_resource_share;
-            p->short_term_debt += share_frac*cpu_sched_work_done_this_period
-                - p->work_done_this_period
+            p->short_term_debt += share_frac*total_wall_cpu_time_this_period
+                - p->wall_cpu_time_this_period
             ;
             if (first) {
                 first = false;
@@ -536,7 +546,7 @@ bool CLIENT_STATE::schedule_cpus() {
         atp = active_tasks.active_tasks[i];
         atp->next_scheduler_state = CPU_SCHED_PREEMPTED;
     }
-    expected_pay_off = cpu_sched_work_done_this_period / ncpus;
+    expected_pay_off = total_wall_cpu_time_this_period / ncpus;
     for (j=0; j<ncpus; j++) {
         if (cpu_earliest_deadline_first) {
             if (!schedule_earliest_deadline_result(expected_pay_off)) break;
@@ -590,13 +600,13 @@ bool CLIENT_STATE::schedule_cpus() {

     // reset work accounting
     // doing this at the end of schedule_cpus() because
-    // work_done_this_period's can change as apps finish
+    // wall_cpu_time_this_period's can change as apps finish
     //
     for (i=0; i<projects.size(); i++) {
         p = projects[i];
-        p->work_done_this_period = 0;
+        p->wall_cpu_time_this_period = 0;
     }
-    cpu_sched_work_done_this_period = 0;
+    total_wall_cpu_time_this_period = 0;

     set_client_state_dirty("schedule_cpus");
     return true;
@@ -146,7 +146,7 @@ void CLIENT_STATE::parse_cmdline(int argc, char** argv) {
         } else if (ARG(run_cpu_benchmarks)) {
             run_cpu_benchmarks = true;
         } else if (ARG(attach_project)) {
-            if (i > argc-2) {
+            if (i >= argc-2) {
                 show_options = true;
             } else {
                 strcpy(attach_project_url, argv[++i]);
@@ -0,0 +1,254 @@
<?php
require_once("docutil.php");
page_head("Client scheduling");
echo "

This document describes two related parts of the BOINC core client
(version 4.36 and later):
<p>
<b>CPU scheduling policy</b>:
Of the set of results that are runnable (see below),
which ones to execute?
(On a machine with N CPUs, BOINC will try to execute N results at once).
<p>
<b>Work-fetch policy</b>:
When should the core client ask a project for more work,
which project should it ask,
and how much work should it ask for?
<p>
The goals of the CPU scheduler and work-fetch policies are
(in descending priority):
<ul>
<li> Results should be completed and reported by their deadline
(results reported after their deadline
may not have any value to the project and may not be granted credit).
<li> Project resource shares should be honored over the long term.
<li> If a computer is attached to multiple projects,
execution should rotate among projects on a frequent basis.
</ul>
The policies are designed to accommodate all scenarios,
including those with computers that are slow or are attached
to a large number of projects.
<p>
In previous versions of BOINC,
the core client attempted to maintain at least one result
for each attached project,
and would do weighted round-robin CPU scheduling among all projects.
In some scenarios (any combination of slow computer,
lots of projects, and tight deadlines) a computer could
miss the deadlines of all its results.
The new policies solve this problem as follows:
<ul>
<li>
Work fetch is limited to ensure that deadlines can be met.
A computer attached to 10 projects might
have work for only a few (perhaps only one) at a given time.
<li>
If deadlines are threatened,
the CPU scheduling policy switches to a mode
(earliest deadline first) that optimizes the likelihood
of meeting deadlines, at the expense of variety.
</ul>

<h2>Concepts and terms</h2>
<h3>Wall CPU time</h3>
A result's <b>wall CPU time</b> is the amount of wall-clock time
its process has been runnable at the OS level.
The actual CPU time may be much less than this,
e.g. if the process does a lot of paging,
or if other (non-BOINC) processing jobs run at the same time.
<p>
BOINC uses wall CPU time as the measure of how much resource
has been given to each project.
Why not use actual CPU time instead?
<ul>
<li> Wall CPU time is more fair in the case of paging apps.
<li> The measurement of actual CPU time depends on apps to
report it correctly.
Sometimes apps have bugs that cause them to always report zero.
This screws up the scheduler.
</ul>
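To make the distinction concrete, here is a small illustrative C++ sketch (not part of the committed file; the struct and the numbers are invented for the example). A scheduled task is charged the wall-clock length of the period whether it reports 12.5 CPU seconds or zero.

    // Illustrative only: the wall CPU charge ignores the app's own CPU report.
    #include <cstdio>

    struct TaskSample {
        double reported_cpu;   // CPU seconds the app claims it used
        bool scheduled;        // was it runnable (scheduled) at the OS level?
    };

    // A scheduled task is charged the full wall-clock length of the period.
    double wall_cpu_charge(const TaskSample& t, double period_len) {
        return t.scheduled ? period_len : 0;
    }

    int main() {
        double period = 60;                      // a 60-second scheduling period
        TaskSample paging_app = {12.5, true};    // paged a lot, little real CPU
        TaskSample buggy_app  = {0.0,  true};    // reports zero CPU
        printf("paging app: charged %.0f s (reported %.1f s)\n",
               wall_cpu_charge(paging_app, period), paging_app.reported_cpu);
        printf("buggy app:  charged %.0f s (reported %.1f s)\n",
               wall_cpu_charge(buggy_app, period), buggy_app.reported_cpu);
    }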
<h3>Result states</h3>
A result is <b>runnable</b> if
<ul>
<li> Neither it nor its project is suspended, and
<li> its files have been downloaded, and
<li> it hasn't finished computing.
</ul>
A result is <b>runnable soon</b> if
<ul>
<li> Neither it nor its project is suspended, and
<li> it hasn't finished computing.
</ul>
<h3>Project states</h3>
A project is <b>runnable</b> if
<ul>
<li> It's not suspended, and
<li> it has at least one runnable result.
</ul>

A project is <b>downloading</b> if
<ul>
<li> It's not suspended, and
<li> it has at least one result whose files are being downloaded.
</ul>

A project is <b>contactable</b> if
<ul>
<li> It's not suspended, and
<li> its master file has already been fetched, and
<li> it's not deferred (i.e. its minimum RPC time is in the past), and
<li> its no-new-work flag is not set.
</ul>

A project is <b>potentially runnable</b> if
<ul>
<li> it's either runnable, downloading, or contactable.
</ul>
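The result and project states above are simple boolean predicates. A hypothetical, self-contained C++ sketch follows (field names and struct layout are invented for illustration; only potentially_runnable() corresponds to a function that the cs_apps.C diff above actually calls):

    // Illustrative sketch of the state predicates defined above.
    struct ResultInfo {
        bool suspended;
        bool files_downloaded;
        bool computation_done;
    };

    struct ProjectInfo {
        bool suspended;
        bool master_fetched;
        bool deferred;                 // minimum RPC time is in the future
        bool no_new_work;
        bool has_runnable_result;
        bool has_downloading_result;

        bool runnable() const { return !suspended && has_runnable_result; }
        bool downloading() const { return !suspended && has_downloading_result; }
        bool contactable() const {
            return !suspended && master_fetched && !deferred && !no_new_work;
        }
        bool potentially_runnable() const {
            return runnable() || downloading() || contactable();
        }
    };

    // A result is runnable if neither it nor its project is suspended,
    // its files are downloaded, and it hasn't finished computing.
    bool result_runnable(const ResultInfo& r, const ProjectInfo& p) {
        return !r.suspended && !p.suspended
            && r.files_downloaded && !r.computation_done;
    }

    // "Runnable soon" drops only the files-downloaded requirement.
    bool result_runnable_soon(const ResultInfo& r, const ProjectInfo& p) {
        return !r.suspended && !p.suspended && !r.computation_done;
    }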
<h3>Debt</h3>
Intuitively, a project's 'debt' is how much work is owed to it,
relative to other projects.
BOINC uses two types of debt;
each is defined relative to a set S of projects.
In each case, the debt is recalculated periodically as follows:
<ul>
<li> A = the wall CPU time used by projects in S during this period
<li> R = sum of resource shares of projects in S
<li> For each project P in S:
<ul>
<li> F = P.resource_share / R (i.e., P's fractional resource share)
<li> W = A*F (i.e., how much wall CPU time P should have gotten)
<li> P.debt += W - P.wall_cpu_time (i.e. what P should have gotten
minus what it got).
</ul>
<li> P.debt is normalized (e.g. so that the mean or minimum is zero).
</ul>

<b>Short-term debt</b> is used by the CPU scheduler.
It is adjusted over the set of runnable projects.
It is normalized so that the minimum short-term debt is zero,
and the maximum short-term debt is no greater than 86400 (i.e. one day).

<p>
<b>Long-term debt</b> is used by the work-fetch policy.
It is adjusted over the set of potentially runnable projects.
It is normalized so that the average long-term debt is zero.
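Written out as code, the periodic update above might look like the following self-contained C++ sketch (not the client's actual implementation; the normalization shown shifts debts so the minimum is zero, which is one of the options the text mentions, and the 86400-second cap applies to the short-term case):

    // Illustrative sketch of the periodic debt update defined above.
    #include <algorithm>
    #include <vector>

    struct Proj {
        double resource_share;
        double wall_cpu_time;   // wall CPU this project used this period
        double debt;
    };

    // S = the runnable projects for short-term debt,
    //     the potentially runnable projects for long-term debt.
    void update_debts(std::vector<Proj>& S, double debt_cap = 86400) {
        if (S.empty()) return;
        double A = 0, R = 0;
        for (const Proj& p : S) {
            A += p.wall_cpu_time;    // wall CPU used by S during this period
            R += p.resource_share;   // total resource share of S
        }
        if (R <= 0) return;
        for (Proj& p : S) {
            double F = p.resource_share / R;   // fractional resource share
            double W = A * F;                  // wall CPU P should have gotten
            p.debt += W - p.wall_cpu_time;     // what it should have gotten
                                               // minus what it got
        }
        // Normalize so the minimum debt is zero, then cap (short-term case).
        double lo = S[0].debt;
        for (const Proj& p : S) lo = std::min(lo, p.debt);
        for (Proj& p : S) p.debt = std::min(p.debt - lo, debt_cap);
    }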
<h2>The CPU scheduling policy</h2>
<p>
The CPU scheduler has two modes, <b>normal</b> and <b>panic</b>.
In normal mode, the CPU scheduler runs the project(s)
with the greatest short-term debt.
Specifically:
<ol>
<li> Set the 'anticipated debt' of each project to its short-term debt.
<li> Find the project P with the greatest anticipated debt,
select one of P's runnable results
(picking one that is already running, if possible)
and schedule that result.
<li> Decrement P's anticipated debt by the 'expected payoff'
(the total wall CPU in the last period divided by #CPUs).
<li> Repeat steps 2 and 3 for additional CPUs.
</ol>
Over the long term, this results in a round-robin policy,
weighted by resource shares.

<p>
In panic mode, the CPU scheduler
schedules the runnable results with the earliest deadlines.
This allows the client to meet deadlines that would otherwise be missed.

<p>
The CPU scheduler runs when a result is completed,
when the end of the user-specified scheduling period is reached,
when new results become runnable,
or when the user performs a UI interaction
(e.g. suspending or resuming a project or result).
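As a sketch of the normal-mode loop (steps 1-4 above), under simplified bookkeeping: 'expected_payoff' corresponds to total_wall_cpu_time_this_period / ncpus in the cs_apps.C diff earlier in this commit, but the structs here are invented for illustration.

    // Illustrative sketch of normal-mode CPU scheduling.
    #include <vector>

    struct SchedProj {
        double short_term_debt;
        double anticipated_debt;
        int runnable_results;    // how many of its results are runnable
    };

    void schedule_normal(std::vector<SchedProj>& projects, int ncpus,
                         double expected_payoff) {
        // 1) anticipated debt starts out equal to short-term debt
        for (SchedProj& p : projects) p.anticipated_debt = p.short_term_debt;

        // 4) repeat steps 2 and 3 once per CPU
        for (int cpu = 0; cpu < ncpus; cpu++) {
            // 2) find the project with the greatest anticipated debt
            //    that still has a runnable result, and schedule one of them
            SchedProj* best = nullptr;
            for (SchedProj& p : projects) {
                if (p.runnable_results == 0) continue;
                if (!best || p.anticipated_debt > best->anticipated_debt) {
                    best = &p;
                }
            }
            if (!best) break;
            best->runnable_results--;    // one of its results gets this CPU
            // 3) decrement its anticipated debt by the expected payoff
            //    (total wall CPU of the last period divided by #CPUs)
            best->anticipated_debt -= expected_payoff;
        }
    }

Because the payoff is subtracted each time a project is picked, a project with a large resource share (and hence large recurring debt) is picked more often, which is where the weighted round-robin behavior comes from.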
<h2>The work-fetch policy</h2>

<p>
X is the estimated wall time by which the number of
runnable results will fall below #CPUs.
<p>
min_queue is the user's network-connection period general preference.
<p>
work_fetch_OK is a flag set by the mode selection algorithm (see below).
<p>
The work-fetch policy maintains an 'overall urgency':
<ul>
<li>
<b>NEED_IMMEDIATELY</b>:
there is at least one idle CPU
<li>
<b>NEED</b>:
X < min_queue
<li>
<b>OK</b>:
X > min_queue, and work_fetch_OK is true
<li>
<b>DONT_NEED</b>:
work_fetch_OK is false
</ul>

<p>
In addition, the work-fetch policy maintains a per-project work-fetch mode:
<p>
R(P) = fractional resource share of P
<p>
X(P) = estimated wall time when the number of runnable results for P
will fall below #CPUs*R(P)
<ul>
<li>
<b>NEED_IMMEDIATELY</b>:
no results of P are runnable soon.
<li>
<b>NEED</b>:
X(P) < min_queue * R(P)
<li>
<b>OK</b>:
X(P) > min_queue * R(P),
and P is not suspended or deferred or no-new-work
<li>
<b>DONT_NEED</b>:
P is suspended or deferred or no-new-work
</ul>
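A possible encoding of the overall urgency as a C++ function (illustrative only; the text does not specify precedence when several conditions hold, so the ordering below is an assumption, and only WORK_FETCH_DONT_NEED is a constant that appears in the diff above; the other names are chosen to match):

    // Illustrative sketch of the overall work-fetch urgency defined above.
    enum WorkFetchUrgency {
        WORK_FETCH_DONT_NEED,          // work_fetch_OK is false
        WORK_FETCH_OK,                 // X > min_queue and work_fetch_OK
        WORK_FETCH_NEED,               // X < min_queue
        WORK_FETCH_NEED_IMMEDIATELY    // at least one idle CPU
    };

    WorkFetchUrgency overall_urgency(
        int idle_cpus,        // CPUs with no runnable result
        double X,             // est. wall time until runnable results < #CPUs
        double min_queue,     // user's network-connection period preference
        bool work_fetch_OK    // set by the mode selection algorithm
    ) {
        // Precedence here is an assumption; the text lists the cases
        // without ordering them.
        if (idle_cpus > 0) return WORK_FETCH_NEED_IMMEDIATELY;
        if (!work_fetch_OK) return WORK_FETCH_DONT_NEED;
        if (X < min_queue) return WORK_FETCH_NEED;
        return WORK_FETCH_OK;
    }

The per-project mode is analogous, with X(P) and min_queue * R(P) in place of X and min_queue.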
<p>
<h2>Mode selection</h2>
<p>
Sort the work units by deadline, earliest first.
If at any point in this list the sum of the remaining
processing time is greater than 0.8 * up_frac * time to deadline,
the CPU queue is overloaded.
This triggers both a halt to work requests and a switch of the
CPU scheduler to earliest deadline first.

<p>
For each work unit, compute the fraction of its time to deadline
represented by its remaining processing time, and sum these fractions.
If this sum is greater than 0.8 * up_frac, the CPU queue is fully loaded.
This triggers no work fetch.

";
page_tail();
?>
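The two load checks in the Mode selection section, as a self-contained C++ sketch (illustrative; up_frac is taken to be the fraction of time the host is available for computation, and deadlines are assumed to lie in the future):

    // Illustrative sketch of the mode-selection checks described above.
    #include <algorithm>
    #include <vector>

    struct Workunit {
        double remaining_cpu_time;   // estimated processing time left (secs)
        double time_to_deadline;     // wall seconds until its deadline (> 0)
    };

    // Overloaded: walking the deadline-sorted list, the accumulated remaining
    // work exceeds 0.8 * up_frac * time-to-deadline at some point.
    // This stops work requests and switches the CPU scheduler to
    // earliest deadline first.
    bool queue_overloaded(std::vector<Workunit> wus, double up_frac) {
        std::sort(wus.begin(), wus.end(),
            [](const Workunit& a, const Workunit& b) {
                return a.time_to_deadline < b.time_to_deadline;
            });
        double work_so_far = 0;
        for (const Workunit& wu : wus) {
            work_so_far += wu.remaining_cpu_time;
            if (work_so_far > 0.8 * up_frac * wu.time_to_deadline) return true;
        }
        return false;
    }

    // Fully loaded: the summed fractions of remaining work over
    // time-to-deadline exceed 0.8 * up_frac. This stops work fetch only.
    bool queue_fully_loaded(const std::vector<Workunit>& wus, double up_frac) {
        double frac_sum = 0;
        for (const Workunit& wu : wus) {
            frac_sum += wu.remaining_cpu_time / wu.time_to_deadline;
        }
        return frac_sum > 0.8 * up_frac;
    }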