From 57f2aec66d41476876708e4357ec25ef1ab236b9 Mon Sep 17 00:00:00 2001
From: David Anderson The BOINC client result computation scheduling aims to achieve the
-following goals: CPU scheduling aims to achieve the following goals
+(decreasing priority): The motivation for the second goal stems from the potential
+Goals and motivation
-
+
+
+CPU scheduling
+
+
-
-
-
This
-is clearly desirable.
A user
-participating in multiple projects can get bored seeing his computer
-work only on one project for a long time.
The user specifies the resource shares and thus
-expects them to be honored.
A project's resource share represents how much computing resources -(CPU time, network bandwith, storage space) a user wants to allocate -to the project relative to the resources allocated to all of the other -projects in which he is participating. The client should respect this -allocation to be faithful to the user. In the case of CPU time, the -result computation scheduling should achieve the expected time shares -over a reasonable time period.
- -At the same time, the scheduler RPC policy needs to complement the -result scheduling. We have the following goals for this policy:
- -We address the goals using result preemption. After a given time -period, the client decides on a new set of projects for which results -will be computed in the next time period. This decision will consider -the projects' resource shares by tracking the debt owed to a project. -The debt to a project accrues according to the project's resource -share, and is paid off when CPU time is devoted to the project.
- -A consequence of result preemption is that projects can have -multiple active result computations at a given time. For example, -consider a two processor system participating in two projects, A and -B, with resource shares 75% and 25%, respectively. Ideally, one -processor will run a result computation for A, while the other -processor will switch between running result computations for A and B. -Thus, A will have two active result computations. This consequence -implies a desirable property of the result preemption scheme: that -the number of active result computations for a project be -minimized. For example, it's better to have one result from +
We will attempt to minimize the number of active result + + +
+A result is 'active' if there is a slot directory for it. +A consequence of result preemption is that there can +be more active results than CPUs. + + +
+The notion of 'debt' is used to respect the resource share allocation +for each project. +The debt to a project represents the amount of work +(in CPU time) we owe it. +Debt is decreased when CPU time is devoted to a project. +We increase the debt to a project according to the +total amount of work done in a time period scaled by the project's +resource share. + +
+For example, consider a system participating in two projects, A and B, +with resource shares 75% and 25%, respectively. +Suppose in some time period, the system devotes 25 minutes of CPU time to project A +and 15 minutes of CPU time to project B. +We decrease the debt to A by 25 minutes and increase it by 30 minutes (75% of 25 + 15). +So the debt increases overall. +This makes sense because we expected to devote a +larger percentage of the system resources to project A than it +actually got. + +
+The choice of projects for which to start result computations +can simply follow the debt ordering of the projects. +The algorithm computes the 'anticipated debt' to a project +(the debt we expect to owe after the time period expires) +as it chooses result computations to run. + +
+This algorithm is run: +
+We will attempt to minimize the number of active result computations for a project by dynamically choosing results to compute -from a global pool. When we allocate CPU time to project, we will -choose results to compute intelligently: choose already running tasks -first, then preempted tasks, and only choose to start a new result -computation in the last resort. This will not guarantee the above -property, but we hope it will be close to achieving it.
- -The algorithm requires that a time period length be defined (e.g. -one hour). The result preemption algorithm is run at the beginning of -each period. It proceeds as follows:
+from a global pool. +When we allocate CPU time to a project, +we will choose already running tasks first, +then preempted tasks, and only choose to start a new result +computation as a last resort. +This will not guarantee the above +property, but we hope it will be close to achieving it.Because result computations may finish before the time period -expires, we need to account for such a gap in a project's debt -payment. So, we need to also keep track of the amount of work done -during the current time period for each project as results finish. -This accounting should be reset for each time period.
- -Finally, the starting of new result computations in the middle of a -time period needs to use this accounting instead of the expected -future debts that were estimated at the beginning of the time period. -Otherwise, it will be similar to the decision of choosing which tasks -to run at the beginning of a time period.
-We'll initialize total_work_done_this_period to -num_cpus * period_length.
--preempt_apps(): // called after every period_length +data structures: +ACTIVE_TASK: + double cpu_at_last_schedule_point + double current_cpu_time + scheduler_state: + PREEMPTED + RUNNING + next_scheduler_state // temp +PROJECT: + double work_done_this_period // temp + double debt + double anticipated_debt // temp + bool has_runnable_result -// finish accounting -foreach T in running_tasks: - T.project.work_done_this_period += T.work_done_this_period - total_work_done_this_period += T.work_done_this_period +schedule_cpus(): + +foreach project P + P.work_done_this_period = 0 + +total_work_done_this_period = 0 +foreach task T that is RUNNING: + x = current_cpu_time - T.cpu_at_last_schedule_point + T.project.work_done_this_period += x + total_work_done_this_period += x -// pay off and accrue debts foreach P in projects: P.debt += P.resource_share * total_work_done_this_period - P.work_done_this_period -// make preemption decisions expected_pay_off = total_work_done_this_period / num_cpus + foreach P in projects: - P.expected_future_debt = P.debt -to_preempt.addAll(running_tasks) // assume we'll preempt everything at first -to_run = () + P.anticipated_debt = P.debt + +foreach task T + T.next_scheduler_state = PREEMPTED + do num_cpus times: - found = false - do projects.size times: - // choose the project with the largest expected future debt - P = argmax { P.expected_future_debt } over all P in projects - if (some T in to_preempt is for P): - // P has a task that ran last period, so just don't preempt it - to_preempt.remove(T) - T.expected_pay_off = expected_pay_off - found = true - break - if (some T in preempted_tasks is for P): - // P has a task that was preempted - preempted_tasks.remove(T) - to_run.add(T) - T.expected_pay_off = expected_pay_off - found = true - break - if (some R in results is for P, not active, and ready to run): - T = new ACTIVE_TASK for R - to_run.add(T) - T.expected_pay_off = expected_pay_off - found = true - break - remove P from 
consideration in the argmax - if found: - P.expected_future_debt -= expected_pay_off - else: + // choose the project with the largest anticipated debt + P = argmax { P.anticipated_debt } over all P in projects with runnable result + if none: break -suspend tasks in to_preempt (reset T.expected_pay_off for each T in to_preempt) -run or unsuspend tasks in to_run (and put in running_tasks) + if (some T in P is RUNNING): + T.next_scheduler_state = RUNNING + P.anticipated_debt -= expected_pay_off + continue + if (some T in P is PREEMPTED): + T.next_scheduler_state = RUNNING + P.anticipated_debt -= expected_pay_off + continue + if (some R in results is for P, not active, and ready to run): + T = new ACTIVE_TASK for R + T.next_scheduler_state = RUNNING + P.anticipated_debt -= expected_pay_off -// reset accounting -foreach P in projects: - P.work_done_this_period = 0 -total_work_done_this_period = 0 - ----------- - -start_apps(): // called at each iteration of the BOINC main loop - -foreach P in projects: - // expected_future_debt should account for any tasks that finished - // and for tasks that are still running - P.expected_future_debt = P.debt - P.work_done_this_period -foreach T in running_tasks: - T.project.expected_future_debt -= T.expected_pay_off - -to_run = () -while running_tasks.size < num_cpus: - do projects.size times: - // choose the project with the largest expected future debt - P = argmax { P.expected_future_debt } over all P in projects - if (some T in preempted_tasks is for P): - // P has a task that was preempted - preempted_tasks.remove(T) - to_run.add(T) - T.expected_pay_off = fraction_of_period_left * expected_pay_off - found = true - break - if (some R in results is for P, not active, and ready to run): - T = new ACTIVE_TASK for R - to_run.add(T) - T.expected_pay_off = fraction_of_period_left * expected_pay_off - found = true - break - remove P from consideration in the argmax - if found: - P.expected_future_debt -= fraction_of_period_left * 
expected_pay_off - else: - break -run or unsuspend tasks in to_run - ----------- - -handle_finished_apps(): // called at each iteration of the BOINC main loop - -foreach T in running_tasks: - if T finished: - // do some accounting - T.project.work_done_this_period += T.work_done_this_period - total_work_done_this_period += T.work_done_this_period - do other clean up stuff +foreach task T + if scheduler_state == PREEMPTED and next_scheduler_state == RUNNING + unsuspend or run + if scheduler_state == RUNNING and next_scheduler_state == PREEMPTED + suspend (or kill) +foreach task T + T.cpu_at_last_schedule_point = current_cpu_time-
The notion of debt is used to respect the resource share allocation -for each project. The debt to a project represents the amount of work -(in CPU time) we owe to a project. Debt is paid off when CPU time is -devoted to a project. We accrue the debt to a project according to the -total amount of work done in a time period scaled by the project's -resource share.
++The work fetch policy has the following goal: -
For example, consider a system participating in two projects, A and -B, with resource shares 75% and 25%, respectively. Suppose in some -time period, the system devotes 25 minutes of CPU time to project A -and 15 minutes of CPU time to project B. We decrease the debt to A by -20 minutes and accrue it by 30 minutes (75% of 25 + 15). So the debt -increases overall. This makes sense because we expected to devote a -larger percentage of the system resources to project A than it -actually got.
+The choosing of projects for which to start result computations at -the beginning of each time period can simply follow the debt ordering -of the projects. The algorithm computes the expected future debt to a -project (the debt we expect to owe after the time period expires) as -it chooses result computations to run.
+expected future debt = debt - expected pay off * number of -tasks to run this period+
+The CPU scheduler needs a minimum number of results from a project +in order to respect the project's resource share. +We effectively have too little work when the number of results for a +project is less than this minimum number. -
However, choosing projects to run in the middle of a time period is -a little different. The preemption algorithm expected each of the -tasks it started to last for the entire time period. However, when a -task finishes in the middle of a time period, the expected future debt -to the respective project is an overestimate. We thus change the -expected future debt to reflect what has taken place: it is the debt -owed to the project at the beginning of the time period, minus the -amount of work that has already been done this time period, and minus -the amount of work we expect to complete by the end of the time -period. When projects have results chosen to run, we decrease the -expected future debt by the amount of work we expect to be done for -the project in the remainder of the time period.
++min_results(P) = ceil(ncpus * P.resource_share) +-
expected future debt = debt - (work completed + expected -pay off of tasks already running this period + expected pay off * -fraction of period left * number of new tasks for this -period)+
+The client can estimate the amount of time that will elapse until we +have too little work for a project. +When this length of time is less than T, it is time to get more work. -
The client should get more work when either of the following are -true:
++This algorithm determines if a project needs more work. If a project +does need work, then the amount of work it needs is computed. +It is called whenever the client can make a scheduler RPC. +
Ignoring the second case can cause long running result computations -to monopolize the CPU, even with result preemption. For example, -suppose a project has work units that finish on the order of months. -Then, when work_buf_min is on the order of days, the client will never -think it is out of work. However, projects with shorter result -computations may run out of work. So, even with preemption, we cannot -have minimum variety.
++The mechanism for actually getting work checks if a project has a +non-zero work request and if so, makes the scheduler RPC call to +request the work. -
The second case (running out of work for one project) is addressed -by capping the amount of work counted for a project. We cap it by the -total amount of work that can be done in min_work_buf_secs, scaled by -the project's resource share. Thus, the client will get more work when -any one project has too little work.
- -The case of having fewer results than CPUs is addressed by -\"packing\" results into CPU \"bins\".
+-need_to_get_work(): +data structures: +PROJECT: + double work_request_days - num_cpus_busy = 0 - total_work_secs = 0 - work_secs_for_one_cpu = 0 - foreach P in projects: - P.avail_work_secs = 0 +check_work_needed(Project P): - sort results in order of decreasing estimated_cpu_time +if num_results(P) < min_results(P): + P.work_request_days = 2T + return NEED_WORK_IMMEDIATELY - // pack results into CPU bins - foreach R in results: - result_work_secs = estimated_cpu_time(R) - work_secs_for_one_cpu += result_work_secs - R.project.avail_work_secs += result_work_secs - if work_secs_for_one_cpu >= min_work_buf_secs - work_secs_for_one_cpu = 0 - num_cpus_busy += 1 +top_results = top (min_results(P) - 1) results of P by expected +completion time + +work_remaining = 0 +foreach result R of P that is not in top_results: + work_remaining += R.expected_completion_time +work_remaining *= P.resource_share * active_frac / ncpus + +if work_remaining < T: + P.work_request_days = 2T - work_remaining / seconds_per_day + return NEED_WORK +else: + P.work_request_days = 0 + return DONT_NEED_WORK - // count total amount of work, but cap amount any one project contributes - // to this total - foreach P in projects: - total_work_secs += min { P.avail_work_secs, - P.resource_share * min_work_buf_secs * num_cpus } - return (num_cpus_busy < num_cpus) - || (total_work_secs < min_work_secs * num_cpus)-
XXX it will be useful to know what caused this predicate to return -true, so maybe it should be split into separate predicates.
- -XXX also need to factor in if we are able to currently contact a -project (according to min_rpc_time).
"; page_tail(); ?> diff --git a/doc/credit.php b/doc/credit.php index 2b92fdc954..9a6862655c 100644 --- a/doc/credit.php +++ b/doc/credit.php @@ -97,6 +97,8 @@ void update_average( avg_time = now; } +#define CREDIT_HALF_LIFE (SECONDS_IN_DAY*7) + "),"-BOINC projects can export data describing teams, users and hosts. -This data is exported in downloadable XML files, -and can be summarized and represented as Web pages. -Some examples are listed at +BOINC projects can export statistics data +describing teams, users and hosts. +This data can be imported and used to produce +web sites that show statistics and leaderboards +for one or more BOINC projects. +Examples of such sites are listed at http://setiboinc.ssl.berkeley.edu/ap/stats.php.
-The data is presented in several 'views': -teams ordered by credit, teams ordered by ID, etc. -Each view is available in two forms: -
+ If zero or absent, + the listing is written to a single file." +); +list_item("detail", + "If present, records are 'detailed': + user records include a list of hosts, + and team records include a list of users." +); +list_end(); +echo"
-The entries in a given file are in either 'summary' or 'detail' form. -For example, the summary of a team gives its ID, name, and credit, -while the detailed form also contains a list of its members. -
-The files are as follows: - -
-tables.xml -
-For each entity type (team, user, and host) this gives -
", +"); +list_item("core_versions.xml", +"A list of versions of the core client in the project's database" ); -list_item("team_expavg_credit.xml". htmlspecialchars("-Other files: -"; -list_start(); -list_heading("File name", "Contents"); -list_item( -"team_total_credit.xml-"), -" 1046220857 127 -1000 -100 14 -1000 -100 681 -1000 -100 setiathome @@ -70,41 +102,32 @@ htmlspecialchars("...
team_total_credit_N.xml", -"Team summaries, ordered by decreasing total credit. -The first file is the complete list; -the remaining files (for N = 0, 1, ...) is the list -in limited-size chunks." +"). +"
-The format of the various XML elements is as follows: +The format of the various XML elements +in the output files is as follows. +Notes: +
Team summary
@@ -115,6 +138,7 @@ htmlspecialchars("
-Note: <cpid> ('cross-project identifier') -is a unique identifier across multiple projects. -Accounts with the same email address on different projects -will have the same cross-project identifier -(as long as at least one computer is attached to both accounts). -
User detail
", htmlspecialchars(" @@ -179,6 +201,7 @@ htmlspecialchars("Eric Heien 4897.904591 9820.631754 +1087542007.701900 United States 1046220857 [ 14 ] @@ -187,6 +210,7 @@ htmlspecialchars("27 0.000000 0.000000 +1087542007.701900 Darwin @@ -196,6 +220,7 @@ htmlspecialchars("266 0.000000 0.000000 +1087542007.701900 GenuineIntel Intel(R) Linux @@ -211,6 +236,7 @@ htmlspecialchars("266 0.000000 0.000000 +1087542007.701900 GenuineIntel Intel(R) Linux @@ -226,6 +252,7 @@ htmlspecialchars("3 0.000000 0.000000 +1087542007.701900 GenuineIntel Pentium Windows XP diff --git a/doc/white.css b/doc/white.css index 11230e8c7d..6736d524a1 100644 --- a/doc/white.css +++ b/doc/white.css @@ -25,7 +25,7 @@ a:active { body , table , input , select { font-family: "Trebuchet MS", Verdana, Arial, Sans Serif; -#font-size: small; + font-size: small; } body { diff --git a/lib/error_numbers.h b/lib/error_numbers.h index 1350875572..f890ce3116 100755 --- a/lib/error_numbers.h +++ b/lib/error_numbers.h @@ -131,3 +131,4 @@ #define ERR_BIND -180 #define ERR_LISTEN -181 #define ERR_TIMEOUT -182 +#define ERR_PROJECT_DOWN -183 diff --git a/sched/assimilator.C b/sched/assimilator.C index bdcc95f566..72a54c2742 100644 --- a/sched/assimilator.C +++ b/sched/assimilator.C @@ -50,7 +50,6 @@ bool do_pass(APP& app) { DB_RESULT canonical_result, result; bool did_something = false; char buf[256]; - int retval; check_stop_daemons(); diff --git a/sched/handle_request.C b/sched/handle_request.C index 038870c8aa..1cc720b541 100644 --- a/sched/handle_request.C +++ b/sched/handle_request.C @@ -547,7 +547,7 @@ inline static const char* get_remote_addr() { void handle_trickle_ups(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { unsigned int i; DB_RESULT result; - DB_TRICKLE_UP tup; + DB_MSG_FROM_HOST mfh; int retval; char buf[256]; @@ -577,15 +577,18 @@ void handle_trickle_ups(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { ); continue; } - tup.clear(); - tup.create_time = time(0); - tup.send_time = td.send_time; - 
tup.resultid = result.id; - tup.appid = result.appid; - tup.hostid = reply.host.id; - tup.handled = false; - safe_strcpy(tup.xml, td.trickle_text.c_str()); - retval = tup.insert(); + mfh.clear(); + mfh.create_time = time(0); + mfh.send_time = td.send_time; + mfh.variety = result.appid; + mfh.hostid = reply.host.id; + mfh.handled = false; + sprintf(buf, "%s \n", td.result_name); + string foobar; + foobar = buf; + foobar += td.trickle_text; + safe_strcpy(mfh.xml, foobar.c_str()); + retval = mfh.insert(); if (retval) { log_messages.printf(SCHED_MSG_LOG::CRITICAL, "[HOST#%d] trickle insert failed: %d\n", @@ -596,14 +599,14 @@ void handle_trickle_ups(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { } void handle_trickle_downs(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { - DB_TRICKLE_DOWN td; + DB_MSG_TO_HOST mth; char buf[256]; sprintf(buf, "where hostid = %d", reply.host.id); - while (!td.enumerate(buf)) { - reply.trickle_downs.push_back(td); - td.handled = true; - td.update(); + while (!mth.enumerate(buf)) { + reply.msgs_to_host.push_back(mth); + mth.handled = true; + mth.update(); } } diff --git a/sched/main.C b/sched/main.C index 0e52efefc0..54a693aa8c 100644 --- a/sched/main.C +++ b/sched/main.C @@ -74,6 +74,7 @@ void send_message(char* msg, int delay) { "\n" " \n", msg, delay ); diff --git a/sched/server_types.C b/sched/server_types.C index b176c51d51..3b3e286f34 100644 --- a/sched/server_types.C +++ b/sched/server_types.C @@ -185,7 +185,6 @@ SCHEDULER_REPLY::~SCHEDULER_REPLY() { int SCHEDULER_REPLY::write(FILE* fout) { unsigned int i, j; string u1, u2, t1, t2; - int retval; fprintf(fout, "%s \n" "%d \n" + "\n" " \n" @@ -317,25 +316,9 @@ int SCHEDULER_REPLY::write(FILE* fout) { if (send_trickle_up_ack) { fputs(" \n", fout); } - for (i=0; i \n" - " %s \n" - "%d \n" - "\n" - "%s\n" - " \n" - "\n", - result.name, - td.create_time, - td.xml - ); + for (i=0; i\n"); diff --git a/sched/server_types.h b/sched/server_types.h index 90ea7b0030..b3cf0ba221 100644 --- 
a/sched/server_types.h +++ b/sched/server_types.h @@ -104,7 +104,7 @@ struct SCHEDULER_REPLY { vector wus; vector results; vector result_acks; - vector trickle_downs; + vector msgs_to_host; char code_sign_key[4096]; char code_sign_key_signature[4096]; bool send_trickle_up_ack; diff --git a/sched/trickle_handler.C b/sched/trickle_handler.C index 0cd79dcba9..74d2eb9615 100644 --- a/sched/trickle_handler.C +++ b/sched/trickle_handler.C @@ -44,23 +44,23 @@ using namespace std; SCHED_CONFIG config; char app_name[256]; -extern int handle_trickle(TRICKLE_UP&); +extern int handle_trickle(MSG_FROM_HOST&); -int handle_trickle(TRICKLE_UP& tup) { +int handle_trickle(MSG_FROM_HOST& mfh) { int retval; printf( - "got trickle-up \n%s\nfor result %d\n", - tup.xml, tup.resultid + "got trickle-up \n%s\n\n", + mfh.xml ); - DB_TRICKLE_DOWN tdown; - tdown.clear(); - tdown.create_time = time(0); - tdown.resultid = tup.resultid; - tdown.hostid = tup.hostid; - tdown.handled = false; - strcpy(tdown.xml, tup.xml); - retval = tdown.insert(); + DB_MSG_TO_HOST mth; + mth.clear(); + mth.create_time = time(0); + mth.hostid = mfh.hostid; + mth.variety = mfh.variety; + mth.handled = false; + strcpy(mth.xml, mfh.xml); + retval = mth.insert(); if (retval) { printf("insert failed %d\n", retval); } @@ -71,17 +71,17 @@ int handle_trickle(TRICKLE_UP& tup) { // return true if there were any // bool do_trickle_scan(APP& app) { - DB_TRICKLE_UP tup; + DB_MSG_FROM_HOST mfh; char buf[256]; bool found=false; int retval; - sprintf(buf, "where appid=%d and handled=0", app.id); - while (!tup.enumerate(buf)) { - retval = handle_trickle(tup); + sprintf(buf, "where variety=%d and handled=0", app.id); + while (!mfh.enumerate(buf)) { + retval = handle_trickle(mfh); if (!retval) { - tup.handled = true; - tup.update(); + mfh.handled = true; + mfh.update(); } found = true; }