From 289db963aba21839779e4251ecf607b543a656dc Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 9 Jun 2005 21:37:34 +0000 Subject: [PATCH] *** empty log message *** svn path=/trunk/boinc/; revision=6325 --- checkin_notes | 15 +++ client/client_state.C | 2 +- client/client_state.h | 4 +- client/client_types.C | 2 +- client/client_types.h | 6 +- client/cs_apps.C | 44 +++++--- client/cs_cmdline.C | 2 +- doc/sched.php | 254 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 304 insertions(+), 25 deletions(-) create mode 100644 doc/sched.php diff --git a/checkin_notes b/checkin_notes index 8d7550bfd4..bb0a4d6d17 100755 --- a/checkin_notes +++ b/checkin_notes @@ -7596,3 +7596,18 @@ David 8 June 2005 cs_apps.C cs_benchmark.C net_stats.C + +David 9 June 2005 + - Changed variable names from "work_done" to "wall_cpu_time"; + it's important to emphasize that debt accounting + is done on the basis of how long a process is running, + NOT how much CPU time it actually or reportedly got. 
+ - don't count non-CPU-intensive apps in calculation of + total_wall_cpu_time_this_period + - fixed crash if you run core client with -attach_project X + + client/ + client_state.C,h + client_types.C,h + cs_apps.C + cs_cmdline.C diff --git a/client/client_state.C b/client/client_state.C index 33ae6eeb05..20e2c4e1a5 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -115,7 +115,7 @@ CLIENT_STATE::CLIENT_STATE() { cpu_earliest_deadline_first = false; cpu_sched_last_time = 0; - cpu_sched_work_done_this_period = 0; + total_wall_cpu_time_this_period = 0; must_schedule_cpus = true; } diff --git a/client/client_state.h b/client/client_state.h index 02367a7187..1bb185040f 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -156,8 +156,8 @@ private: // CPU sched state // double cpu_sched_last_time; - double cpu_sched_work_done_this_period; - // CPU time since last schedule_cpus() + double total_wall_cpu_time_this_period; + // "wall CPU time" accumulated since last schedule_cpus() bool work_fetch_no_new_work; bool cpu_earliest_deadline_first; diff --git a/client/client_types.C b/client/client_types.C index 168c19207d..531a4857fa 100644 --- a/client/client_types.C +++ b/client/client_types.C @@ -93,7 +93,7 @@ void PROJECT::init() { strcpy(code_sign_key, ""); user_files.clear(); anticipated_debt = 0; - work_done_this_period = 0; + wall_cpu_time_this_period = 0; next_runnable_result = NULL; work_request = 0; work_request_urgency = WORK_FETCH_DONT_NEED; diff --git a/client/client_types.h b/client/client_types.h index b171f67321..c091053f8f 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -248,9 +248,9 @@ public: double anticipated_debt; // expected debt by the end of the preemption period - double work_done_this_period; - // how much CPU time has been devoted to this - // project in the current period (secs) + double wall_cpu_time_this_period; + // how much "wall CPU time" has been devoted to this + // project in the current 
scheduling period (secs) struct RESULT *next_runnable_result; // the next result to run for this project diff --git a/client/cs_apps.C b/client/cs_apps.C index 0b56bdea9d..7f663a9fe4 100644 --- a/client/cs_apps.C +++ b/client/cs_apps.C @@ -83,7 +83,6 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) { char path[256]; int retval; double size; - double task_cpu_time; bool had_error = false; @@ -157,9 +156,9 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) { ); } - task_cpu_time = at.current_cpu_time - at.cpu_time_at_last_sched; - at.result->project->work_done_this_period += task_cpu_time; - cpu_sched_work_done_this_period += task_cpu_time; + double wall_cpu_time = now - cpu_sched_last_time; + at.result->project->wall_cpu_time_this_period += wall_cpu_time; + total_wall_cpu_time_this_period += wall_cpu_time; return 0; } @@ -394,18 +393,29 @@ void CLIENT_STATE::adjust_debts() { int count_cpu_intensive = 0; PROJECT *p; double min_short_term_debt=0, share_frac; - double elapsed_time = gstate.now - cpu_sched_last_time; + double wall_cpu_time = gstate.now - cpu_sched_last_time; SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK); - // total up work done since last CPU reschedule + // Total up total and per-project "wall CPU" since last CPU reschedule. + // "Wall CPU" is the wall time during which a task was + // runnable (at the OS level). + // + // We use wall CPU for debt calculation + // (instead of reported actual CPU) for two reasons: + // 1) the process might have paged a lot, so the actual CPU + // may be a lot less than wall CPU + // 2) BOINC relies on apps to report their CPU time. + // Sometimes there are bugs and apps report zero CPU. + // It's safer not to trust them. 
// for (i=0; ischeduler_state != CPU_SCHED_SCHEDULED) continue; - double task_cpu_time = elapsed_time; - atp->result->project->work_done_this_period += task_cpu_time; - cpu_sched_work_done_this_period += task_cpu_time; + if (atp->result->project->non_cpu_intensive) continue; + + atp->result->project->wall_cpu_time_this_period += wall_cpu_time; + total_wall_cpu_time_this_period += wall_cpu_time; } // find total resource shares of runnable and potentially runnable projects @@ -429,8 +439,8 @@ void CLIENT_STATE::adjust_debts() { // if (p->potentially_runnable()) { share_frac = p->resource_share/potentially_runnable_resource_share; - p->long_term_debt += share_frac*cpu_sched_work_done_this_period - - p->work_done_this_period + p->long_term_debt += share_frac*total_wall_cpu_time_this_period + - p->wall_cpu_time_this_period ; } total_long_term_debt += p->long_term_debt; @@ -442,8 +452,8 @@ void CLIENT_STATE::adjust_debts() { p->anticipated_debt = 0; } else { share_frac = p->resource_share/runnable_resource_share; - p->short_term_debt += share_frac*cpu_sched_work_done_this_period - - p->work_done_this_period + p->short_term_debt += share_frac*total_wall_cpu_time_this_period + - p->wall_cpu_time_this_period ; if (first) { first = false; @@ -536,7 +546,7 @@ bool CLIENT_STATE::schedule_cpus() { atp = active_tasks.active_tasks[i]; atp->next_scheduler_state = CPU_SCHED_PREEMPTED; } - expected_pay_off = cpu_sched_work_done_this_period / ncpus; + expected_pay_off = total_wall_cpu_time_this_period / ncpus; for (j=0; jwork_done_this_period = 0; + p->wall_cpu_time_this_period = 0; } - cpu_sched_work_done_this_period = 0; + total_wall_cpu_time_this_period = 0; set_client_state_dirty("schedule_cpus"); return true; diff --git a/client/cs_cmdline.C b/client/cs_cmdline.C index f026a01a19..57e06d93bf 100644 --- a/client/cs_cmdline.C +++ b/client/cs_cmdline.C @@ -146,7 +146,7 @@ void CLIENT_STATE::parse_cmdline(int argc, char** argv) { } else if (ARG(run_cpu_benchmarks)) { 
run_cpu_benchmarks = true; } else if (ARG(attach_project)) { - if (i > argc-2) { + if (i >= argc-2) { show_options = true; } else { strcpy(attach_project_url, argv[++i]); diff --git a/doc/sched.php b/doc/sched.php new file mode 100644 index 0000000000..e30ecaf4f6 --- /dev/null +++ b/doc/sched.php @@ -0,0 +1,254 @@ + +CPU scheduling policy: +Of the set of results that are runnable (see below), +which ones to execute? +(On a machine with N CPUs, BOINC will try to execute N results at once). + +

+Work-fetch policy: +When should the core client ask a project for more work, +which project should it ask, +and how much work should it ask for? + +

+The goals of the CPU scheduler and work-fetch policies are +(in descending priority): +

    +
  • Results should be completed and reported by their deadline +(results reported after their deadline +may not have any value to the project and may not be granted credit). +
  • Project resource shares should be honored over the long term; +
  • If a computer is attached to multiple projects, + execution should rotate among projects on a frequent basis. +
+The policies are designed to accommodate all scenarios, +including those with computers that are slow or are attached +to a large number of projects. + +

+In previous versions of BOINC, +the core client attempted to maintain at least one result +for each attached project, +and would do weighted round-robin CPU scheduling among all projects. +In some scenarios (any combination of slow computer, +lots of projects, and tight deadlines) a computer could +miss the deadlines of all its results. +The new policies solve this problem as follows: +

    +
  • +Work fetch is limited to ensure that deadlines can be met. +A computer attached to 10 projects might +have work for only a few (perhaps only one) at a given time. +
  • +If deadlines are threatened, +the CPU scheduling policy switches to a mode +(earliest deadline first) that optimizes the likelihood +of meeting deadlines, at the expense of variety. +
+ + + +

Concepts and terms

+ +

Wall CPU time

+A result's wall CPU time is the amount of wall-clock time +its process has been runnable at the OS level. +The actual CPU time may be much less than this, +e.g. if the process does a lot of paging, +or if other (non-BOINC) processing jobs run at the same time. +

+BOINC uses wall CPU time as the measure of how much resource +has been given to each project. +Why not use actual CPU time instead? +

    +
  • Wall CPU time is more fair in the case of paging apps. +
  • The measurement of actual CPU time depends on apps to +report it correctly. +Sometimes apps have bugs that cause them to always report zero. +This screws up the scheduler. +
+ + + +

Result states

+A result is runnable if +
    +
  • Neither it nor its project is suspended, and +
  • its files have been downloaded, and +
  • it hasn't finished computing +
+A result is runnable soon if +
    +
  • Neither it nor its project is suspended, and +
  • it hasn't finished computing +
+ + +

Project states

+A project is runnable if +
    +
  • It's not suspended, and +
  • it has at least one runnable result +
+ +A project is downloading if +
    +
  • It's not suspended, and +
  • it has at least one result whose files are being downloaded +
+ +A project is contactable if +
    +
  • It's not suspended, and +
  • its master file has already been fetched, and +
  • it's not deferred (i.e. its minimum RPC time is in the past), and +
  • its no-new-work flag is not set +
+ +A project is potentially runnable if +
    +
  • It's either runnable, downloading, or contactable. +
+ +

Debt

+Intuitively, a project's 'debt' is how much work is owed to it, +relative to other projects. +BOINC uses two types of debt; +each is defined relative to a set S of projects. +In each case, the debt is recalculated periodically as follows: +
    +
  • A = the wall CPU time used by projects in S during this period +
  • R = sum of resource shares of projects in S +
  • For each project P in S: +
      +
    • F = P.resource_share / R (i.e., P's fractional resource share) +
    • W = A*F (i.e., how much wall CPU time P should have gotten) +
    • P.debt += W - P.wall_cpu_time (i.e. what P should have gotten + minus what it got). +
    +
  • P.debt is normalized (e.g. so that the mean or minimum is zero). +
+ + +Short-term debt is used by the CPU scheduler. +It is adjusted over the set of runnable projects. +It is normalized so that minimum short-term debt is zero, +and maximum short-term debt is no greater than 86400 (i.e. one day). + +

+Long-term debt is used by the work-fetch policy. +It is adjusted over the set of potentially runnable projects. +It is normalized so that average long-term debt is zero. + + +

The CPU scheduling policy

+

+The CPU scheduler has two modes, normal and panic. +In normal mode, the CPU scheduler runs the project(s) +with the greatest short-term debt. +Specifically: +

    +
  1. Set the 'anticipated debt' of each project to its short-term debt +
  2. Find the project P with the greatest anticipated debt, + select one of P's runnable results + (picking one that is already running, if possible) + and schedule that result. +
  3. Decrement P's anticipated debt by the 'expected payoff' + (the total wall CPU in the last period divided by #CPUs). +
  4. Repeat steps 2 and 3 for additional CPUs +
+Over the long term, this results in a round-robin policy, +weighted by resource shares. + +

+In panic mode, the CPU scheduler +schedules the runnable results with the earliest deadlines. +This allows the client to meet deadlines that would otherwise be missed. + + +

+The CPU scheduler runs when a result is completed, +when the end of the user-specified scheduling period is reached, +when new results become runnable, +or when the user performs a UI interaction +(e.g. suspending or resuming a project or result). + + +

The work-fetch policy

+ +

+X is the estimated wall time by which the number of +runnable results will fall below #CPUs. +

+min_queue is the user's network-connection period general preference. +

+work_fetch_OK is a flag set by the mode selection algorithm (see below). +

+The work-fetch policy maintains an 'overall urgency': +

    +
  • +NEED_IMMEDIATELY: + there is at least one idle CPU +
  • +NEED: + X < min_queue +
  • +OK: + X > min_queue, work_fetch_OK is true +
  • +DONT_NEED: + work_fetch_OK is false +
+ +

+In addition, the work-fetch policy maintains a per-project work-fetch mode: +

+R(P) = fractional resource share of P +

+X(P) = estimated wall time when number of runnable results for P +will fall below #CPUs*R(P) +

    +
  • +NEED_IMMEDIATELY: + no results of P are runnable soon. +
  • +NEED: + X(P) < min_queue * R(P) +
  • +OK: + X(P) > min_queue * R(P), + and P is not suspended or deferred or no-new-work +
  • +DONT_NEED: + P is suspended or deferred or no-new-work +
+ +

+ +

Mode selection

+

+Sort the work units by deadline, earliest first. +If at any point in this list, the sum of the remaining +processing time is greater than 0.8 * up_frac * time to deadline, +the CPU queue is overloaded. +This both suppresses work requests and switches the CPU scheduler +into earliest-deadline-first mode. + +

+For each work unit, compute the fraction of its time to deadline +that its remaining processing time represents, and sum these fractions. +If the sum is greater than 0.8 * up_frac, the CPU queue is fully loaded. +This suppresses work fetch. + + + +"; +page_tail(); +?>