mirror of https://github.com/BOINC/boinc.git
*** empty log message ***
svn path=/trunk/boinc/; revision=6325
parent d74f627d41
commit 289db963ab
@@ -7596,3 +7596,18 @@ David 8 June 2005
     cs_apps.C
     cs_benchmark.C
     net_stats.C
+
+David 9 June 2005
+    - Changed variable names from "work_done" to "wall_cpu_time";
+      it's important to emphasize that debt accounting
+      is done on the basis of how long a process is running,
+      NOT how much CPU time it actually or reportedly got.
+    - don't count non-CPU-intensive apps in calculation of
+      total_wall_cpu_time_this_period
+    - fixed crash if you run core client with -attach_project X
+
+    client/
+        client_state.C,h
+        client_types.C,h
+        cs_apps.C
+        cs_cmdline.C
@@ -115,7 +115,7 @@ CLIENT_STATE::CLIENT_STATE() {
     cpu_earliest_deadline_first = false;

     cpu_sched_last_time = 0;
-    cpu_sched_work_done_this_period = 0;
+    total_wall_cpu_time_this_period = 0;
     must_schedule_cpus = true;
 }
@@ -156,8 +156,8 @@ private:
     // CPU sched state
     //
     double cpu_sched_last_time;
-    double cpu_sched_work_done_this_period;
-        // CPU time since last schedule_cpus()
+    double total_wall_cpu_time_this_period;
+        // "wall CPU time" accumulated since last schedule_cpus()
     bool work_fetch_no_new_work;
     bool cpu_earliest_deadline_first;
@@ -93,7 +93,7 @@ void PROJECT::init() {
     strcpy(code_sign_key, "");
     user_files.clear();
     anticipated_debt = 0;
-    work_done_this_period = 0;
+    wall_cpu_time_this_period = 0;
     next_runnable_result = NULL;
     work_request = 0;
     work_request_urgency = WORK_FETCH_DONT_NEED;
@@ -248,9 +248,9 @@ public:

     double anticipated_debt;
         // expected debt by the end of the preemption period
-    double work_done_this_period;
-        // how much CPU time has been devoted to this
-        // project in the current period (secs)
+    double wall_cpu_time_this_period;
+        // how much "wall CPU time" has been devoted to this
+        // project in the current scheduling period (secs)
     struct RESULT *next_runnable_result;
         // the next result to run for this project
@@ -83,7 +83,6 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) {
     char path[256];
     int retval;
     double size;
-    double task_cpu_time;
     bool had_error = false;

@@ -157,9 +156,9 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) {
         );
     }

-    task_cpu_time = at.current_cpu_time - at.cpu_time_at_last_sched;
-    at.result->project->work_done_this_period += task_cpu_time;
-    cpu_sched_work_done_this_period += task_cpu_time;
+    double wall_cpu_time = now - cpu_sched_last_time;
+    at.result->project->wall_cpu_time_this_period += wall_cpu_time;
+    total_wall_cpu_time_this_period += wall_cpu_time;

     return 0;
 }
@@ -394,18 +393,29 @@ void CLIENT_STATE::adjust_debts() {
     int count_cpu_intensive = 0;
     PROJECT *p;
     double min_short_term_debt=0, share_frac;
-    double elapsed_time = gstate.now - cpu_sched_last_time;
+    double wall_cpu_time = gstate.now - cpu_sched_last_time;

     SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK);

-    // total up work done since last CPU reschedule
+    // Total up total and per-project "wall CPU" since last CPU reschedule.
+    // "Wall CPU" is the wall time during which a task was
+    // runnable (at the OS level).
+    //
+    // We use wall CPU for debt calculation
+    // (instead of reported actual CPU) for two reasons:
+    // 1) the process might have paged a lot, so the actual CPU
+    //    may be a lot less than wall CPU
+    // 2) BOINC relies on apps to report their CPU time.
+    //    Sometimes there are bugs and apps report zero CPU.
+    //    It's safer not to trust them.
     //
     for (i=0; i<active_tasks.active_tasks.size(); i++) {
         ACTIVE_TASK* atp = active_tasks.active_tasks[i];
         if (atp->scheduler_state != CPU_SCHED_SCHEDULED) continue;
-        double task_cpu_time = elapsed_time;
-        atp->result->project->work_done_this_period += task_cpu_time;
-        cpu_sched_work_done_this_period += task_cpu_time;
+        if (atp->result->project->non_cpu_intensive) continue;
+        atp->result->project->wall_cpu_time_this_period += wall_cpu_time;
+        total_wall_cpu_time_this_period += wall_cpu_time;
     }

     // find total resource shares of runnable and potentially runnable projects
@@ -429,8 +439,8 @@ void CLIENT_STATE::adjust_debts() {
         //
         if (p->potentially_runnable()) {
             share_frac = p->resource_share/potentially_runnable_resource_share;
-            p->long_term_debt += share_frac*cpu_sched_work_done_this_period
-                - p->work_done_this_period
+            p->long_term_debt += share_frac*total_wall_cpu_time_this_period
+                - p->wall_cpu_time_this_period
             ;
         }
         total_long_term_debt += p->long_term_debt;
@@ -442,8 +452,8 @@ void CLIENT_STATE::adjust_debts() {
             p->anticipated_debt = 0;
         } else {
             share_frac = p->resource_share/runnable_resource_share;
-            p->short_term_debt += share_frac*cpu_sched_work_done_this_period
-                - p->work_done_this_period
+            p->short_term_debt += share_frac*total_wall_cpu_time_this_period
+                - p->wall_cpu_time_this_period
             ;
             if (first) {
                 first = false;
@@ -536,7 +546,7 @@ bool CLIENT_STATE::schedule_cpus() {
         atp = active_tasks.active_tasks[i];
         atp->next_scheduler_state = CPU_SCHED_PREEMPTED;
     }
-    expected_pay_off = cpu_sched_work_done_this_period / ncpus;
+    expected_pay_off = total_wall_cpu_time_this_period / ncpus;
     for (j=0; j<ncpus; j++) {
         if (cpu_earliest_deadline_first) {
             if (!schedule_earliest_deadline_result(expected_pay_off)) break;
@@ -590,13 +600,13 @@ bool CLIENT_STATE::schedule_cpus() {

     // reset work accounting
     // doing this at the end of schedule_cpus() because
-    // work_done_this_period's can change as apps finish
+    // wall_cpu_time_this_period's can change as apps finish
     //
     for (i=0; i<projects.size(); i++) {
         p = projects[i];
-        p->work_done_this_period = 0;
+        p->wall_cpu_time_this_period = 0;
     }
-    cpu_sched_work_done_this_period = 0;
+    total_wall_cpu_time_this_period = 0;

     set_client_state_dirty("schedule_cpus");
     return true;
@@ -146,7 +146,7 @@ void CLIENT_STATE::parse_cmdline(int argc, char** argv) {
         } else if (ARG(run_cpu_benchmarks)) {
             run_cpu_benchmarks = true;
         } else if (ARG(attach_project)) {
-            if (i > argc-2) {
+            if (i >= argc-2) {
                 show_options = true;
             } else {
                 strcpy(attach_project_url, argv[++i]);
@@ -0,0 +1,254 @@
<?php
require_once("docutil.php");
page_head("Client scheduling");
echo "

This document describes two related parts of the BOINC core client
(version 4.36 and later):
<p>
<b>CPU scheduling policy</b>:
Of the set of results that are runnable (see below),
which ones to execute?
(On a machine with N CPUs, BOINC will try to execute N results at once).
<p>
<b>Work-fetch policy</b>:
When should the core client ask a project for more work,
which project should it ask,
and how much work should it ask for?
<p>
The goals of the CPU scheduler and work-fetch policies are
(in descending priority):
<ul>
<li> Results should be completed and reported by their deadline
(results reported after their deadline
may not have any value to the project and may not be granted credit).
<li> Project resource shares should be honored over the long term.
<li> If a computer is attached to multiple projects,
execution should rotate among projects on a frequent basis.
</ul>
The policies are designed to accommodate all scenarios,
including those with computers that are slow or are attached
to a large number of projects.
<p>
In previous versions of BOINC,
the core client attempted to maintain at least one result
for each attached project,
and would do weighted round-robin CPU scheduling among all projects.
In some scenarios (any combination of slow computer,
lots of projects, and tight deadlines) a computer could
miss the deadlines of all its results.
The new policies solve this problem as follows:
<ul>
<li>
Work fetch is limited to ensure that deadlines can be met.
A computer attached to 10 projects might
have work for only a few (perhaps only one) at a given time.
<li>
If deadlines are threatened,
the CPU scheduling policy switches to a mode
(earliest deadline first) that optimizes the likelihood
of meeting deadlines, at the expense of variety.
</ul>

<h2>Concepts and terms</h2>
<h3>Wall CPU time</h3>
A result's <b>wall CPU time</b> is the amount of wall-clock time
its process has been runnable at the OS level.
The actual CPU time may be much less than this,
e.g. if the process does a lot of paging,
or if other (non-BOINC) processing jobs run at the same time.
<p>
BOINC uses wall CPU time as the measure of how much resource
has been given to each project.
Why not use actual CPU time instead?
<ul>
<li> Wall CPU time is more fair in the case of paging apps.
<li> The measurement of actual CPU time depends on apps to
report it correctly.
Sometimes apps have bugs that cause them to always report zero.
This screws up the scheduler.
</ul>
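To make the distinction concrete, here is a small illustrative C++ sketch (not part of the committed file; the struct and the numbers are invented for the example). A scheduled task is charged the wall-clock length of the period whether it reports 12.5 CPU seconds or zero.

    // Illustrative only: the wall CPU charge ignores the app's own CPU report.
    #include <cstdio>

    struct TaskSample {
        double reported_cpu;   // CPU seconds the app claims it used
        bool scheduled;        // was it runnable (scheduled) at the OS level?
    };

    // A scheduled task is charged the full wall-clock length of the period.
    double wall_cpu_charge(const TaskSample& t, double period_len) {
        return t.scheduled ? period_len : 0;
    }

    int main() {
        double period = 60;                      // a 60-second scheduling period
        TaskSample paging_app = {12.5, true};    // paged a lot, little real CPU
        TaskSample buggy_app  = {0.0,  true};    // reports zero CPU
        printf("paging app: charged %.0f s (reported %.1f s)\n",
               wall_cpu_charge(paging_app, period), paging_app.reported_cpu);
        printf("buggy app:  charged %.0f s (reported %.1f s)\n",
               wall_cpu_charge(buggy_app, period), buggy_app.reported_cpu);
    }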
<h3>Result states</h3>
A result is <b>runnable</b> if
<ul>
<li> Neither it nor its project is suspended, and
<li> its files have been downloaded, and
<li> it hasn't finished computing.
</ul>
A result is <b>runnable soon</b> if
<ul>
<li> Neither it nor its project is suspended, and
<li> it hasn't finished computing.
</ul>
<h3>Project states</h3>
A project is <b>runnable</b> if
<ul>
<li> It's not suspended, and
<li> it has at least one runnable result.
</ul>

A project is <b>downloading</b> if
<ul>
<li> It's not suspended, and
<li> it has at least one result whose files are being downloaded.
</ul>

A project is <b>contactable</b> if
<ul>
<li> It's not suspended, and
<li> its master file has already been fetched, and
<li> it's not deferred (i.e. its minimum RPC time is in the past), and
<li> its no-new-work flag is not set.
</ul>

A project is <b>potentially runnable</b> if
<ul>
<li> it's either runnable, downloading, or contactable.
</ul>
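The result and project states above are simple boolean predicates. A hypothetical, self-contained C++ sketch follows (field names and struct layout are invented for illustration; only potentially_runnable() corresponds to a function that the cs_apps.C diff above actually calls):

    // Illustrative sketch of the state predicates defined above.
    struct ResultInfo {
        bool suspended;
        bool files_downloaded;
        bool computation_done;
    };

    struct ProjectInfo {
        bool suspended;
        bool master_fetched;
        bool deferred;                 // minimum RPC time is in the future
        bool no_new_work;
        bool has_runnable_result;
        bool has_downloading_result;

        bool runnable() const { return !suspended && has_runnable_result; }
        bool downloading() const { return !suspended && has_downloading_result; }
        bool contactable() const {
            return !suspended && master_fetched && !deferred && !no_new_work;
        }
        bool potentially_runnable() const {
            return runnable() || downloading() || contactable();
        }
    };

    // A result is runnable if neither it nor its project is suspended,
    // its files are downloaded, and it hasn't finished computing.
    bool result_runnable(const ResultInfo& r, const ProjectInfo& p) {
        return !r.suspended && !p.suspended
            && r.files_downloaded && !r.computation_done;
    }

    // "Runnable soon" drops only the files-downloaded requirement.
    bool result_runnable_soon(const ResultInfo& r, const ProjectInfo& p) {
        return !r.suspended && !p.suspended && !r.computation_done;
    }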
<h3>Debt</h3>
Intuitively, a project's 'debt' is how much work is owed to it,
relative to other projects.
BOINC uses two types of debt;
each is defined relative to a set S of projects.
In each case, the debt is recalculated periodically as follows:
<ul>
<li> A = the wall CPU time used by projects in S during this period
<li> R = sum of resource shares of projects in S
<li> For each project P in S:
<ul>
<li> F = P.resource_share / R (i.e., P's fractional resource share)
<li> W = A*F (i.e., how much wall CPU time P should have gotten)
<li> P.debt += W - P.wall_cpu_time (i.e. what P should have gotten
minus what it got).
</ul>
<li> P.debt is normalized (e.g. so that the mean or minimum is zero).
</ul>

<b>Short-term debt</b> is used by the CPU scheduler.
It is adjusted over the set of runnable projects.
It is normalized so that the minimum short-term debt is zero,
and the maximum short-term debt is no greater than 86400 (i.e. one day).

<p>
<b>Long-term debt</b> is used by the work-fetch policy.
It is adjusted over the set of potentially runnable projects.
It is normalized so that the average long-term debt is zero.
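Written out as code, the periodic update above might look like the following self-contained C++ sketch (not the client's actual implementation; the normalization shown shifts debts so the minimum is zero, which is one of the options the text mentions, and the 86400-second cap applies to the short-term case):

    // Illustrative sketch of the periodic debt update defined above.
    #include <algorithm>
    #include <vector>

    struct Proj {
        double resource_share;
        double wall_cpu_time;   // wall CPU this project used this period
        double debt;
    };

    // S = the runnable projects for short-term debt,
    //     the potentially runnable projects for long-term debt.
    void update_debts(std::vector<Proj>& S, double debt_cap = 86400) {
        if (S.empty()) return;
        double A = 0, R = 0;
        for (const Proj& p : S) {
            A += p.wall_cpu_time;    // wall CPU used by S during this period
            R += p.resource_share;   // total resource share of S
        }
        if (R <= 0) return;
        for (Proj& p : S) {
            double F = p.resource_share / R;   // fractional resource share
            double W = A * F;                  // wall CPU P should have gotten
            p.debt += W - p.wall_cpu_time;     // what it should have gotten
                                               // minus what it got
        }
        // Normalize so the minimum debt is zero, then cap (short-term case).
        double lo = S[0].debt;
        for (const Proj& p : S) lo = std::min(lo, p.debt);
        for (Proj& p : S) p.debt = std::min(p.debt - lo, debt_cap);
    }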
<h2>The CPU scheduling policy</h2>
<p>
The CPU scheduler has two modes, <b>normal</b> and <b>panic</b>.
In normal mode, the CPU scheduler runs the project(s)
with the greatest short-term debt.
Specifically:
<ol>
<li> Set the 'anticipated debt' of each project to its short-term debt.
<li> Find the project P with the greatest anticipated debt,
select one of P's runnable results
(picking one that is already running, if possible)
and schedule that result.
<li> Decrement P's anticipated debt by the 'expected payoff'
(the total wall CPU in the last period divided by #CPUs).
<li> Repeat steps 2 and 3 for additional CPUs.
</ol>
Over the long term, this results in a round-robin policy,
weighted by resource shares.

<p>
In panic mode, the CPU scheduler
schedules the runnable results with the earliest deadlines.
This allows the client to meet deadlines that would otherwise be missed.

<p>
The CPU scheduler runs when a result is completed,
when the end of the user-specified scheduling period is reached,
when new results become runnable,
or when the user performs a UI interaction
(e.g. suspending or resuming a project or result).
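As a sketch of the normal-mode loop (steps 1-4 above), under simplified bookkeeping: 'expected_payoff' corresponds to total_wall_cpu_time_this_period / ncpus in the cs_apps.C diff earlier in this commit, but the structs here are invented for illustration.

    // Illustrative sketch of normal-mode CPU scheduling.
    #include <vector>

    struct SchedProj {
        double short_term_debt;
        double anticipated_debt;
        int runnable_results;    // how many of its results are runnable
    };

    void schedule_normal(std::vector<SchedProj>& projects, int ncpus,
                         double expected_payoff) {
        // 1) anticipated debt starts out equal to short-term debt
        for (SchedProj& p : projects) p.anticipated_debt = p.short_term_debt;

        // 4) repeat steps 2 and 3 once per CPU
        for (int cpu = 0; cpu < ncpus; cpu++) {
            // 2) find the project with the greatest anticipated debt
            //    that still has a runnable result, and schedule one of them
            SchedProj* best = nullptr;
            for (SchedProj& p : projects) {
                if (p.runnable_results == 0) continue;
                if (!best || p.anticipated_debt > best->anticipated_debt) {
                    best = &p;
                }
            }
            if (!best) break;
            best->runnable_results--;    // one of its results gets this CPU
            // 3) decrement its anticipated debt by the expected payoff
            //    (total wall CPU of the last period divided by #CPUs)
            best->anticipated_debt -= expected_payoff;
        }
    }

Because the payoff is subtracted each time a project is picked, a project with a large resource share (and hence large recurring debt) is picked more often, which is where the weighted round-robin behavior comes from.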
<h2>The work-fetch policy</h2>

<p>
X is the estimated wall time by which the number of
runnable results will fall below #CPUs.
<p>
min_queue is the user's network-connection period general preference.
<p>
work_fetch_OK is a flag set by the mode selection algorithm (see below).
<p>
The work-fetch policy maintains an 'overall urgency':
<ul>
<li>
<b>NEED_IMMEDIATELY</b>:
there is at least one idle CPU
<li>
<b>NEED</b>:
X < min_queue
<li>
<b>OK</b>:
X > min_queue, and work_fetch_OK is true
<li>
<b>DONT_NEED</b>:
work_fetch_OK is false
</ul>

<p>
In addition, the work-fetch policy maintains a per-project work-fetch mode:
<p>
R(P) = fractional resource share of P
<p>
X(P) = estimated wall time when the number of runnable results for P
will fall below #CPUs*R(P)
<ul>
<li>
<b>NEED_IMMEDIATELY</b>:
no results of P are runnable soon.
<li>
<b>NEED</b>:
X(P) < min_queue * R(P)
<li>
<b>OK</b>:
X(P) > min_queue * R(P),
and P is not suspended or deferred or no-new-work
<li>
<b>DONT_NEED</b>:
P is suspended or deferred or no-new-work
</ul>
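A possible encoding of the overall urgency as a C++ function (illustrative only; the text does not specify precedence when several conditions hold, so the ordering below is an assumption, and only WORK_FETCH_DONT_NEED is a constant that appears in the diff above; the other names are chosen to match):

    // Illustrative sketch of the overall work-fetch urgency defined above.
    enum WorkFetchUrgency {
        WORK_FETCH_DONT_NEED,          // work_fetch_OK is false
        WORK_FETCH_OK,                 // X > min_queue and work_fetch_OK
        WORK_FETCH_NEED,               // X < min_queue
        WORK_FETCH_NEED_IMMEDIATELY    // at least one idle CPU
    };

    WorkFetchUrgency overall_urgency(
        int idle_cpus,        // CPUs with no runnable result
        double X,             // est. wall time until runnable results < #CPUs
        double min_queue,     // user's network-connection period preference
        bool work_fetch_OK    // set by the mode selection algorithm
    ) {
        // Precedence here is an assumption; the text lists the cases
        // without ordering them.
        if (idle_cpus > 0) return WORK_FETCH_NEED_IMMEDIATELY;
        if (!work_fetch_OK) return WORK_FETCH_DONT_NEED;
        if (X < min_queue) return WORK_FETCH_NEED;
        return WORK_FETCH_OK;
    }

The per-project mode is analogous, with X(P) and min_queue * R(P) in place of X and min_queue.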
<p>
<h2>Mode selection</h2>
<p>
Sort the work units by deadline, earliest first.
If at any point in this list the sum of the remaining
processing time is greater than 0.8 * up_frac * time to deadline,
the CPU queue is overloaded.
This triggers both a halt to work requests and a switch of the
CPU scheduler to earliest deadline first.

<p>
For each work unit, compute the fraction of its time to deadline
represented by its remaining processing time, and sum these fractions.
If this sum is greater than 0.8 * up_frac, the CPU queue is fully loaded.
This triggers no work fetch.

";
page_tail();
?>
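The two load checks in the Mode selection section, as a self-contained C++ sketch (illustrative; up_frac is taken to be the fraction of time the host is available for computation, and deadlines are assumed to lie in the future):

    // Illustrative sketch of the mode-selection checks described above.
    #include <algorithm>
    #include <vector>

    struct Workunit {
        double remaining_cpu_time;   // estimated processing time left (secs)
        double time_to_deadline;     // wall seconds until its deadline (> 0)
    };

    // Overloaded: walking the deadline-sorted list, the accumulated remaining
    // work exceeds 0.8 * up_frac * time-to-deadline at some point.
    // This stops work requests and switches the CPU scheduler to
    // earliest deadline first.
    bool queue_overloaded(std::vector<Workunit> wus, double up_frac) {
        std::sort(wus.begin(), wus.end(),
            [](const Workunit& a, const Workunit& b) {
                return a.time_to_deadline < b.time_to_deadline;
            });
        double work_so_far = 0;
        for (const Workunit& wu : wus) {
            work_so_far += wu.remaining_cpu_time;
            if (work_so_far > 0.8 * up_frac * wu.time_to_deadline) return true;
        }
        return false;
    }

    // Fully loaded: the summed fractions of remaining work over
    // time-to-deadline exceed 0.8 * up_frac. This stops work fetch only.
    bool queue_fully_loaded(const std::vector<Workunit>& wus, double up_frac) {
        double frac_sum = 0;
        for (const Workunit& wu : wus) {
            frac_sum += wu.remaining_cpu_time / wu.time_to_deadline;
        }
        return frac_sum > 0.8 * up_frac;
    }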