acct mgr fix

svn path=/trunk/boinc/; revision=9587
This commit is contained in:
David Anderson 2006-03-02 07:17:18 +00:00
parent 776e976c6d
commit 5d07c6378e
12 changed files with 230 additions and 97 deletions

View File

@ -2478,3 +2478,12 @@ Rom 1 Mar 2006
Rom 1 Mar 2006 (HEAD)
- Tag for 5.3.23 release, all platforms
boinc_core_release_5_3_23
David 1 Mar 2006
- Core client: don't lose <send_gui_rpc_info/> tag from acct mgr URL file
when making an acct mgr RPC
- Core client: make RAM size measurement work on HPUX > 4GB
client/
acct_mgr.C
hostinfo_unix.C

View File

@ -95,10 +95,10 @@ int ACCT_MGR_OP::do_rpc(
gstate.core_client_release,
run_mode_name[gstate.user_run_request]
);
if (strlen(ami.previous_host_cpid)) {
if (strlen(gstate.acct_mgr_info.previous_host_cpid)) {
fprintf(f,
" <previous_host_cpid>%s</previous_host_cpid>\n",
ami.previous_host_cpid
gstate.acct_mgr_info.previous_host_cpid
);
}
if (gstate.acct_mgr_info.send_gui_rpc_info) {
@ -284,7 +284,8 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
}
if (sig_ok) {
gstate.acct_mgr_info = ami;
strcpy(gstate.acct_mgr_info.acct_mgr_name, ami.acct_mgr_name);
strcpy(gstate.acct_mgr_info.signing_key, ami.signing_key);
// attach to new projects
//
@ -318,7 +319,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
}
}
strcpy(ami.previous_host_cpid, gstate.host_info.host_cpid);
strcpy(gstate.acct_mgr_info.previous_host_cpid, gstate.host_info.host_cpid);
if (repeat_sec) {
gstate.acct_mgr_info.next_rpc_time = gstate.now + repeat_sec;
} else {
@ -340,7 +341,7 @@ int ACCT_MGR_INFO::write_info() {
acct_mgr_name,
acct_mgr_url
);
if (send_gui_rpc_info) fprintf(p," <send_gui_rpc_info/>\n");
if (send_gui_rpc_info) fprintf(p," <send_gui_rpc_info/>\n");
if (strlen(signing_key)) {
fprintf(p,
" <signing_key>\n%s</signing_key>\n",
@ -405,9 +406,7 @@ int ACCT_MGR_INFO::init() {
if (match_tag(buf, "</acct_mgr>")) break;
else if (parse_str(buf, "<name>", acct_mgr_name, 256)) continue;
else if (parse_str(buf, "<url>", acct_mgr_url, 256)) continue;
else if (match_tag(buf, "<send_gui_rpc_info/>")) {
send_gui_rpc_info = true;
}
else if (parse_bool(buf, "send_gui_rpc_info", send_gui_rpc_info)) continue;
else if (match_tag(buf, "<signing_key>")) {
retval = copy_element_contents(
p,

View File

@ -467,7 +467,7 @@ int HOST_INFO::get_host_info() {
#elif defined(_HPUX_SOURCE)
struct pst_static pst;
pstat_getstatic(&pst, sizeof(pst), (size_t)1, 0);
m_nbytes = (double)pst.physical_memory * (double)pst.page_size;
m_nbytes = (long double)pst.physical_memory * (long double)pst.page_size;
#elif defined(__osf__)
// Tru64 UNIX.
// 2005-12-26 SMS.

View File

@ -1,6 +1,14 @@
<?
$project_news = array(
array("February 24, 2006",
"A significant step has been made towards integrating
BOINC and Grid software:
beginning with Condor version 6.7.17,
Condor can be configured to run BOINC when it has no other work to perform.
This capability is described in <a href=http://www.cs.wisc.edu/condor/manual/v6.7/3_13Setting_Up.html#SECTION004138000000000000000>the Condor manual</a>.
"
),
array("February 22, 2006",
"BOINC announces its
<a href=mac_intel.php>support for the Macintosh/Intel platform</a>.

View File

@ -83,8 +83,8 @@ Each area is managed by an 'owner'.
</tr>
<tr>
<td>Wiki-based documentation</td>
<td>Christopher Malton and Rytis Slatkevicius</td>
<td>Paul Buck</td>
<td><br></td>
</tr>
<tr>
<td>Windows installer and screensaver</td>

View File

@ -19,7 +19,7 @@ Help for SETI@home is available on the
<a href=http://setiathome.berkeley.edu/forum_help_desk.php>SETI@home message boards</a>
and help for BOINC is available at the
<a href=dev/>BOINC message boards</a> or the
<a href=http://boinc-doc.net/boinc-wiki/>BOINC Wiki</a>.
<a href=http://boinc-wiki.ath.cx/>BOINC Wiki</a>.
</font></b>
";

View File

@ -30,7 +30,7 @@ resources.
<td valign=top>
<table width=100% border=0 cellspacing=0 cellpadding=8>
<tr><td bgcolor=$light_blue>
<h2>&nbsp;Donate computing power</h2>
<h2>&nbsp;Participate</h2>
</td></tr>
<tr><td>
<p>
@ -63,18 +63,18 @@ resources.
</ul>
<p>
You can participate in more than one project,
and you control the fraction of your computing power
that goes to each project.
and you control how much of your computing power
goes to each project.
If you participate in several projects,
your computer will be kept busy even when one project has no work.
<p>
To participate:
<ul>
<ol>
<li> <a href=projects.php>Select projects</a>
<li> <a href=download.php>Download</a> and run BOINC software
<li> Enter the project's URL (visit the project's web site,
and copy the URL from your browser's address field).
</ul>
</ol>
<center>
<a href=participate.php><b>More info</a>
| <a href=download.php><b>Download</a>
@ -86,12 +86,20 @@ resources.
</td></tr>
<tr><td bgcolor=$light_blue><h2>Create a volunteer computing project</h2></td></tr>
<tr><td>
Scientists with computationally-intensive tasks
may be able to use BOINC.
If you are a scientist with a computationally-intensive task,
you may be able to use BOINC.
A BOINC project with a single Linux server
can provide computing power equivalent
to a cluster with tens of thousands of CPUs.
<p><a href=create_project.php>More info</a>
<p>
If you lack the resources (manpower, server capacity,
or network bandwidth) to operate a BOINC project directly,
organizations such as World Community Grid may be able
to assist you.
Please <a href=contact.php>contact us</a> for information.
<center>
<p><b><a href=create_project.php>More info</a></b>
</center>
</td></tr>
<tr><td bgcolor=$light_blue><h2>Software</h2></td></tr>
<tr><td>

View File

@ -9,6 +9,7 @@ function show_link($language, $name, $url) {
function language($lang, $sites) {
echo "<tr><td bgcolor=eeeeee valign=top>$lang</td><td>\n";
shuffle($sites);
foreach ($sites as $s) {
echo "$s<br>\n";
}
@ -27,7 +28,7 @@ echo "
User-editable
sites with information and documentation about BOINC.
<ul>
<li> <a href=http://boinc-doc.net/boinc-wiki/>The Unofficial BOINC Wiki</a> (in English)
<li> <a href=http://boinc-wiki.ath.cx/>The Unofficial BOINC Wiki</a> (in English)
<li> <a href=http://www.seti-argentina.com.ar/instrucciones-boinc-manager>BOINC Argentina</a> (in Spanish)
<li> <a href=http://faq.boinc.de/>Deutsche BOINC FAQ</a> (in German)
<li> <a href=http://www.boincfrance.org/wakka.php?wiki=BienVenue>BOINCFrance.org</a> (in French)
@ -77,6 +78,7 @@ language("Dutch", array(
site("http://www.boinc.be", "www.boinc.be"),
));
language("English", array(
site("http://sirans-boincnews.com/", "Siran's BOINC Projects News Site"),
site("http://www.ukboincteam.org.uk/", "UK BOINC Team"),
site("http://symbion.madnezz.com/", "Symbion"),
site("http://scotlandsseti.blogspot.com/", "Megacruncher's Blog"),

View File

@ -27,6 +27,7 @@ echo "
<br>Run BOINC on uncommon platforms,
or inspect the source code before you run it.
<li> <a href=acct_mgrs.php>Account managers</a>
<li> <a href=http://www.cs.wisc.edu/condor/manual/v6.7/3_13Setting_Up.html#SECTION004138000000000000000>BOINC as a Condor backfill job</a>
</ul>
<p>
<h3>Web sites</h3>

View File

@ -16,7 +16,7 @@ have no control over the creation of BOINC-based projects,
and in general do not endorse them.
The BOINC web site lists some, but not all, projects.
A more complete list is on the
<a href=http://boinc-doc.net/boinc-wiki/index.php?title=Catalog_of_BOINC_Powered_Projects>BOINC Wiki</a>.
<a href=http://boinc-wiki.ath.cx/index.php?title=Catalog_of_BOINC_Powered_Projects>BOINC Wiki</a>.
<p>
When you participate in a project,

View File

@ -80,43 +80,54 @@ This screws up the scheduler.
<h3>Result states</h3>
A result is <b>runnable</b> if
R is <b>runnable</b> if
<ul>
<li> Neither it nor its project is suspended, and
<li> its files have been downloaded, and
<li> it hasn't finished computing
<li> Neither R nor R.project is suspended, and
<li> R's files have been downloaded, and
<li> R hasn't finished computing
</ul>
A result is <b>runnable soon</b> if
<ul>
<li> Neither it nor its project is suspended, and
<li> it hasn't finished computing
</ul>
<h3>Project states</h3>
A project is <b>runnable</b> if
P is <b>runnable</b> if
<ul>
<li> it has at least one runnable result
<li> P has at least one runnable result
(this implies that P is not suspended).
</ul>
A project is <b>downloading</b> if
P is <b>downloading</b> if
<ul>
<li> It's not suspended, and
<li> it has at least one result whose files are being downloaded
<li> P is not suspended, and
<li> P has at least one result whose files are being downloaded
and none of the downloads is deferred.
</ul>
A project is <b>contactable</b> if
P is <b>fetchable</b>
(i.e. the work-fetch policy allows work to be fetched from it) if
<ul>
<li> It's not suspended, and
<li> its master file has already been fetched, and
<li> it's not deferred (i.e. its minimum RPC time is in the past), and
<li> it's no-new-work flag is not set
<li> P is not suspended, and
<li> P is not deferred (i.e. its minimum RPC time is in the past), and
<li> P's no-new-work flag is not set, and
<li> P is not overworked (see definition below), and
<li> a fetch of P's master file is not pending
</ul>
A project is <b>potentially runnable</b> if
P is <b>latency-limited</b> if
<ul>
<li> It's either runnable, downloading, or contactable.
<li> The client's last scheduler RPC to P returned
a 'no work because of deadlines' flag, and
<li> the RPC reply's delay request has not yet elapsed.
</ul>
This means that P has work available,
but didn't send any because the work's deadlines couldn't be met
given the existing work queue.
<p>
P is <b>potentially runnable</b> if
<ul>
<li> P is either runnable, downloading, fetchable, overworked,
or latency-limited.
</ul>
This means that, to the best of the client's knowledge,
it could do work for P if it wanted to.
<h3>Debt</h3>
Intuitively, a project's 'debt' is how much work is owed to it,
@ -134,20 +145,21 @@ In each case, the debt is recalculated periodically as follows:
<li> P.debt += W - P.wall_cpu_time (i.e. what P should have gotten
minus what it got).
</ul>
<li> P.debt is normalized (e.g. so that the mean or minimum is zero).
<li> P.debt is normalized so that the mean or minimum is zero.
</ul>
<b>Short-term debt</b> is used by the CPU scheduler.
It is adjusted over the set of runnable projects.
It is normalized so that minimum short-term debt is zero,
and maximum short-term debt is no greater than 86400 (i.e. one day).
and maximum short-term debt is no greater than 86,400 (i.e. one day).
<p>
<b>Long-term debt</b> is used by the work-fetch policy.
It is defined for all projects,
and adjusted over the set of potentially runnable projects.
It is normalized so that average long-term debt is zero.
It is normalized so that average long-term debt,
over all projects, is zero.
<h2>CPU scheduling policy</h2>
@ -157,8 +169,9 @@ for results that are in danger of missing their deadline,
and weighted round-robin among other projects if additional CPUs exist.
This allows the client to meet deadlines that would otherwise be missed,
while honoring resource shares over the long term.
The scheduler uses the following data, which are obtained
by a simulation of round-robin scheduling applied to the current work queue:
The scheduler uses the following data,
which are obtained by doing a simulation of round-robin scheduling
applied to the current work queue:
<ul>
<li> deadline_missed(R): whether result R would miss
its deadline with round-robin scheduling.
@ -168,23 +181,21 @@ be missed with round-robin scheduling.
</ul>
The scheduling policy is:
<ol>
<li> Find the project P for which deadlines_missed(P)>0,
and whose earliest deadline is earliest.
<li> Schedule deadline_missed(P)
results of P in order of increasing deadline,
with preference to those already running.
<li> If there are more CPUs, and other projects for which
deadlines_missed(P)>0, go to 1.
<li> If all CPUs are scheduled, stop;
otherwise continue to the next step, considering
only projects with deadlines_missed==0.
<li> Find the result R (not scheduled yet)
for which deadline_missed(R)
and whose deadline is earliest.
Tiebreaker: preference to result already running.
<li> If such an R exists, schedule R
<li> If there are more CPUs, and unscheduled results R for which
deadline_missed(R), go to 1.
<li> If all CPUs are scheduled, stop.
<li> Set the 'anticipated debt' of each project to its short-term debt
<li> Find the project P with the greatest anticipated debt,
select one of P's runnable results
(picking one that is already running, if possible)
and schedule that result.
<li> Decrement P's anticipated debt by the 'expected payoff'
(the total wall CPU in the last period divided by NCPUS).
(the scheduling period divided by NCPUS).
<li> Repeat steps 6 and 7 for additional CPUs
</ol>
@ -196,20 +207,50 @@ or when the user performs a UI interaction
(e.g. suspending or resuming a project or result).
<p>
The CPU scheduler produces a list of results to run,
but they are not necessarily run immediately;
the enforcement of the schedule is done asynchronously.
A currently
The CPU scheduler decides what result should run,
but it doesn't enforce this decision
(by preempting, resuming and starting applications).
This enforcement is done by a separate function,
which runs periodically, and is also called by
the CPU scheduler at its conclusion.
The following rules apply to application preemption:
<ul>
<li> If the 'leave in memory' preference is not set,
an application scheduled for preemption is allowed to run for
up to sched_interval/2 additional seconds, or until it checkpoints.
<li>
The above does not apply to an application being preempted
to run a result R for which deadline_missed(R).
<li> If an application has never checkpointed,
it is always left in memory on preemption.
</ul>
<h2>Work-fetch policy</h2>
<p>
When a result runs in EDF mode,
its project may get more than its share of CPU time.
The work-fetch policy is responsible for
ensuring that this doesn't happen repeatedly.
It does this by suppressing work fetch for the project.
<p>
A project P is <b>overworked</b> if
<ul>
<li> P.long_term_debt < -sched_period
</ul>
<p>
This condition occurs if P's results run in EDF mode
(and in extreme cases, when a project with large negative LTD
is detached).
<p>
The work-fetch policy uses the functions
<pre>
prrs(project P)
frs(project P)
</pre>
<blockquote>
P's fractional resource share among potentially runnable projects.
P's fractional resource share among fetchable projects.
</blockquote>
<pre>
@ -219,41 +260,43 @@ min_results(project P)
The minimum number of runnable results needed to
maintain P's resource share on this machine: namely,
<br>
ceil(ncpus*prrs(P))
ceil(ncpus*frs(P))
</blockquote>
<pre>
time_until_work_done(project P)
</pre>
<blockquote>
The estimated wall time until the number of
uncompleted results for this project will reach min_results(P)-1,
assuming round-robin scheduling among
the current potentially runnable projects.
assuming round-robin scheduling among currently fetchable projects.
</blockquote>
<pre>
time_until_free_cpu()
</pre>
<blockquote>
The estimated wall time until there is a free CPU,
assuming round-robin scheduling among currently fetchable projects.
</blockquote>
<p>
The work-fetch policy function is called every 5 seconds
The work-fetch policy function is called every few minutes
(or as needed) by the scheduler RPC polling function.
It sets the variable <b>work_request_size(P)</b> for each project P,
which is the number of seconds of work to request
if we do a scheduler RPC to P.
This is computed as follows:
<pre>
if P is suspended, deferred, overworked, or no-new-work
return 0
if time_until_work_done(P) > min_queue or CPU scheduler scheduled all CPUs by EDF
if time_until_free_cpu() < min_queue
return 1
else
return 0
y = estimated wall time of P's queued work
return max(1, (min_queue*ncpus*frs(P)) - y)
</pre>
It sets the following variable for each project P:
<p>
<b>work_request_size(P)</b>:
the number of seconds of work to request if we do a scheduler RPC to P.
This is
<ul>
<li>
0 if P is suspended, deferred, or no-new-work
<li>
0 if time_until_work_done(P) > min_queue
<li>
0 if CPU scheduler scheduled all CPUs by EDF
<li>
0 if P.long_term_debt < -(CPU sched interval)
(project has been using more than its share of CPU;
e.g. repeatedly doing short-deadline results).
<li>
otherwise:
(min_queue*ncpus*prrs(P)) - (estimated wall time of queued work)
</ul>
<p>
The scheduler RPC mechanism may select a project to contact
@ -267,16 +310,66 @@ P.long_term_debt - time_until_work_done(P) is greatest
</pre>
and gets work from that project.
<hr>
Scheduler request must include:
whether sporadic network connection (period);
whether in EDF mode.
Resource fractions?
<h2>Scheduler work-send policy</h2>
<p>
Scheduler reply should include (it no work sent)
if reason is because we're overloaded.
NOTE: the following has not been implemented,
and is independent of the above policies.
<p>
If last response was 'no work', don't accumulate LTD.
The scheduler should avoid sending results whose
deadlines are likely to be missed,
or which are likely to cause existing results to miss their deadlines.
This will be accomplished as follows:
<ul>
<li>
Scheduler requests include the connection period,
a list of queued results (with estimated time remaining and deadline),
and project resource fractions.
<li>
The scheduler won't send results whose deadlines are less than
now + min_queue.
<li>
The scheduler does an EDF simulation of the initial workload
to see what results meet their deadline.
For each result R being considered for sending,
the scheduler does an EDF simulation.
If R meets its deadline,
all results that missed their deadlines do so by
no more than they did previously,
and all results that originally met their deadline still do,
R is sent.
<li>
If the scheduler has work but doesn't send any because of deadline misses,
it returns a 'no work because of deadlines' flag.
If the last RPC to a project returned this flag,
it is marked as latency-limited and accumulates LTD.
</ul>
<hr>
<h2>Describing scenarios</h2>
<p>
We encourage the use of the following notation for
describing scheduling scenarios
(times are given in hours):
<p>
P(C, D, R)
<p>
This describes a project with
<ul>
<li> C = CPU time per task
<li> D = delay bound
<li> R = fractional resource share
</ul>
A scenario is described by a list of projects,
plus the following optional parameters:
<ul>
<li> NCPUS: number of CPUs (default 1)
<li> min_queue
</ul>
Hence a typical scenario description is:
<pre>
(P1(1000, 2000, .5), P2(1, 10, .5), NCPUS=4)
</pre>
";
page_tail();
?>

View File

@ -129,7 +129,7 @@ everyone else should connect to that process.
(log out and in, and check the Message tab).
<li> Verify that users other than
admin and installing users
can run the Manager and connect to the running core client.
can't run the Manager and connect to the running core client.
</ul>
<h3>Tests for modem-connected computers</h3>
<ul>
@ -211,6 +211,19 @@ run screensaver
<li> Try to connect to core client
with bad password, from host not on list, etc.
</ul>
<h3>Account manager functions</h3>
<ul>
<li> Attach to and detach from an account manager
such as GridRepublic or BOINCStats.
<li> Verify that the core client periodically synchronizes with
the account manager.
<li> Verify that the core client won't let you detach from
a project that you attached via the account manager.
<li> NOTE: If you find bugs in the account manager itself,
report them directly to the account manager admins,
not to BOINC Alpha.
</ul>
";
?>