*** empty log message ***

svn path=/trunk/boinc/; revision=9445
David Anderson 2006-02-11 03:00:37 +00:00
parent 6bc732f191
commit f433b73f6b
9 changed files with 257 additions and 129 deletions

View File

@ -1708,3 +1708,12 @@ Charlie 8 Feb 2006
mac_build/
boinc.xcodeproj/
project.pbxproj
David 10 Feb 2006
- cosmetic
client/
client_msgs.h
client_types.h
cpu_sched.C
cs_scheduler.C

View File

@ -91,6 +91,7 @@ public:
DEBUG_POLL, // show what polls are responding
DEBUG_GUIRPC,
DEBUG_SCHED_CPU,
DEBUG_WORK_FETCH,
DEBUG_SCRSAVE
};
CLIENT_MSG_LOG(): MSG_LOG(stdout) {}

View File

@ -254,6 +254,7 @@ public:
std::vector<RESULT*>active;
std::vector<RESULT*>pending;
double rrsim_proc_rate;
// fraction of each CPU this project will get in RR simulation
void set_rrsim_proc_rate(double per_cpu_proc_rate, double rrs);
// "debt" is how much CPU time we owe this project relative to others
@ -284,7 +285,7 @@ public:
// X seconds of wall-clock time to complete,
// taking into account
// 1) other projects and resource share;
// 2) on_frac and active_frac
// 2) on_frac, active_frac, and cpu_efficiency
// see doc/work_req.php
int work_request_urgency;

View File

@ -422,34 +422,6 @@ void CLIENT_STATE::enforce_schedule() {
}
}
// set the project's rrsim_proc_rate:
// the fraction of each CPU that it will get in round-robin mode
//
void PROJECT::set_rrsim_proc_rate(double per_cpu_proc_rate, double rrs) {
int nactive = (int)active.size();
if (nactive == 0) return;
double x;
if (rrs) {
x = resource_share/rrs;
} else {
x = 1; // TODO - fix
}
// if this project has fewer active results than CPUs,
// scale up its share to reflect this
//
if (nactive < gstate.ncpus) {
x *= ((double)gstate.ncpus)/nactive;
}
// But its rate on a given CPU can't exceed the CPU speed
//
if (x>1) {
x = 1;
}
rrsim_proc_rate = x*per_cpu_proc_rate*CPU_PESSIMISM_FACTOR;
}
// return true if we don't have enough runnable tasks to keep all CPUs busy
//
bool CLIENT_STATE::no_work_for_a_cpu() {
@ -465,7 +437,39 @@ bool CLIENT_STATE::no_work_for_a_cpu() {
return ncpus > count;
}
// return true if round-robin scheduling will miss a deadline
// Set the project's rrsim_proc_rate:
// the fraction of each CPU that it will get in round-robin mode.
// Precondition: the project's "active" array is populated
//
void PROJECT::set_rrsim_proc_rate(double per_cpu_proc_rate, double rrs) {
int nactive = (int)active.size();
if (nactive == 0) return;
double x;
if (rrs) {
x = resource_share/rrs;
} else {
x = 1; // pathological case; maybe should be 1/# runnable projects
}
// if this project has fewer active results than CPUs,
// scale up its share to reflect this
//
if (nactive < gstate.ncpus) {
x *= ((double)gstate.ncpus)/nactive;
}
// But its rate on a given CPU can't exceed 1
//
if (x>1) {
x = 1;
}
rrsim_proc_rate = x*per_cpu_proc_rate*CPU_PESSIMISM_FACTOR;
}
// return true if round-robin scheduling will miss a deadline.
// per_cpu_proc_rate is the expected number of CPU seconds per wall second
// on each CPU; rrs is the resource share of runnable projects
//
bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
PROJECT* p, *pbest;
@ -486,7 +490,6 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
p->pending.clear();
}
for (i=0; i<results.size(); i++) {
rp = results[i];
if (rp->aborted_via_gui) continue;
@ -596,8 +599,10 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) {
//
void CLIENT_STATE::set_scheduler_mode() {
bool use_earliest_deadline_first = false;
double total_proc_rate = avg_proc_rate();
double per_cpu_proc_rate = total_proc_rate/ncpus;
double per_cpu_proc_rate = avg_proc_rate()/ncpus;
// how many CPU seconds per wall second we get on each CPU,
// taking into account on_frac, active_frac, and cpu_efficiency
double rrs = runnable_resource_share();
if (rr_misses_deadline(per_cpu_proc_rate, rrs)) {
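For orientation, here is a simplified, standalone sketch of the kind of round-robin simulation that rr_misses_deadline() drives with these rates. The SimResult type and the flat result list are stand-ins invented for this example; the actual client also maintains per-project pending queues and recomputes rates as the simulation proceeds.
// Simplified, standalone sketch of the kind of simulation rr_misses_deadline()
// performs (hypothetical types; not the client's actual implementation).
#include <vector>
#include <cstdio>

struct SimResult {
    double cpu_remaining;   // estimated CPU seconds left
    double deadline;        // report deadline, seconds from now
    double proc_rate;       // owning project's rrsim_proc_rate (CPU sec per wall sec)
};

// true if some result would finish after its deadline when each result
// progresses at its project's simulated per-CPU rate
bool rr_sim_misses_deadline_sketch(std::vector<SimResult> results) {
    double now = 0;
    while (!results.empty()) {
        // find the result that finishes soonest
        size_t best = 0;
        double dt = results[0].cpu_remaining / results[0].proc_rate;
        for (size_t i = 1; i < results.size(); i++) {
            double t = results[i].cpu_remaining / results[i].proc_rate;
            if (t < dt) { dt = t; best = i; }
        }
        now += dt;
        if (now > results[best].deadline) return true;    // deadline miss
        for (size_t i = 0; i < results.size(); i++) {      // advance the others
            if (i != best) results[i].cpu_remaining -= dt * results[i].proc_rate;
        }
        results.erase(results.begin() + best);
    }
    return false;
}

int main() {
    // two results from a project that gets half a CPU in the simulation
    std::vector<SimResult> r = {
        {3600, 4000, 0.5},    // 1h of CPU left, due in ~1.1h of wall time: misses
        {3600, 90000, 0.5},
    };
    printf("misses deadline: %d\n", (int)rr_sim_misses_deadline_sketch(r));
}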

View File

@ -147,7 +147,7 @@ PROJECT* CLIENT_STATE::next_project_trickle_up_pending() {
// Return the best project to fetch work from, NULL if none
//
// Basically, pick the one with largest long term debt - amount of current work
// Pick the one with largest (long term debt - amount of current work)
//
// PRECONDITIONS:
// - work_request_urgency and work_request set for all projects
@ -160,7 +160,7 @@ PROJECT* CLIENT_STATE::next_project_need_work() {
unsigned int i;
double prrs = potentially_runnable_resource_share();
for (i=0; i<projects.size(); ++i) {
for (i=0; i<projects.size(); i++) {
p = projects[i];
if (p->work_request_urgency == WORK_FETCH_DONT_NEED) continue;
if (p->work_request == 0) continue;
@ -502,11 +502,11 @@ double CLIENT_STATE::time_until_work_done(
}
}
// Compute:
// - work_request and work_request_urgency for all projects.
// Top-level function for work fetch policy.
// Outputs:
// - overall_work_fetch_urgency
//
// Only set non-zero work requests for projects that are contactable
// - for each contactable project:
// - work_request and work_request_urgency
//
int CLIENT_STATE::compute_work_requests() {
unsigned int i;
@ -517,7 +517,7 @@ int CLIENT_STATE::compute_work_requests() {
SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_SCHED_CPU);
overall_work_fetch_urgency = WORK_FETCH_DONT_NEED;
for (i = 0; i < projects.size(); ++i) {
for (i=0; i< projects.size(); i++) {
projects[i]->work_request_urgency = WORK_FETCH_DONT_NEED;
projects[i]->work_request = 0;
}
@ -556,7 +556,7 @@ int CLIENT_STATE::compute_work_requests() {
// Then estimate how long it's going to be until we have fewer
// than this # of results remaining.
//
for (i=0; i<projects.size(); ++i) {
for (i=0; i<projects.size(); i++) {
PROJECT *p = projects[i];
p->work_request = 0;
@ -1034,14 +1034,17 @@ int CLIENT_STATE::handle_scheduler_reply(
}
bool CLIENT_STATE::should_get_work() {
// if there are fewer runnable results then CPUS, we need more work.
// if there are fewer runnable results than CPUS, we need more work.
//
if (no_work_for_a_cpu()) return true;
double tot_cpu_time_remaining = 0;
for (unsigned int i = 0; i < results.size();++i) {
for (unsigned int i=0; i<results.size(); i++) {
tot_cpu_time_remaining += results[i]->estimated_cpu_time_remaining();
}
// ????? shouldn't we scale by ncpus? by avg_proc_rate()??
//
if (tot_cpu_time_remaining < global_prefs.work_buf_min_days*SECONDS_PER_DAY) {
return true;
}
@ -1096,7 +1099,7 @@ void CLIENT_STATE::set_work_fetch_mode() {
double CLIENT_STATE::work_needed_secs() {
double total_work = 0;
for( unsigned int i = 0; i < results.size(); ++i) {
for(unsigned int i=0; i<results.size(); i++) {
if (results[i]->project->non_cpu_intensive) continue;
total_work += results[i]->estimated_cpu_time_remaining();
}
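The "????? shouldn't we scale" question left in should_get_work() above can be made concrete with a small hypothetical comparison. Nothing below is client code; avg_proc_rate is assumed to mean total CPU seconds per wall second summed over all CPUs, as in cpu_sched.C.
// Hypothetical illustration of the scaling question in should_get_work():
// the same queue looks "big enough" unscaled but too small once converted
// from CPU seconds to wall time on this host.
#include <cstdio>

int main() {
    const double SECONDS_PER_DAY = 86400;
    double tot_cpu_time_remaining = 3 * SECONDS_PER_DAY;  // 3 CPU-days queued
    double work_buf_min_days = 1.0;                       // user preference
    double avg_proc_rate = 4 * 0.9;   // 4 CPUs, each giving 0.9 CPU sec per wall sec

    // current check: 3 CPU-days vs. a 1-day buffer -> no work needed
    bool unscaled = tot_cpu_time_remaining < work_buf_min_days * SECONDS_PER_DAY;

    // scaled check: 3 CPU-days drain in ~0.83 wall-days here -> work needed
    bool scaled = tot_cpu_time_remaining / avg_proc_rate
                  < work_buf_min_days * SECONDS_PER_DAY;

    printf("need work: unscaled=%d scaled=%d\n", (int)unscaled, (int)scaled);
}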

View File

@ -1,6 +1,14 @@
<?
$project_news = array(
array("February 8, 2006",
"The <a href=http://szdg.lpds.sztaki.hu/szdg/>SZTAKI Desktop Grid</a>
is now in production.
The project, based at the MTA-SZTAKI Laboratory of Parallel
and Distributed Systems in Budapest, Hungary, is attempting
to find all the generalized binary number systems
(bases are matrices and digits are vectors) up to dimension 11."
),
array("February 6, 2006",
"The <a href=http://boinc.bio.wzw.tum.de/boincsimap/>Similarity
Matrix of Proteins (SIMAP)</a> project is now in production.

View File

@ -37,29 +37,26 @@ resources.
BOINC lets you donate computing power
to scientific research projects such as:
<ul>
<li>
<a href=http://climateprediction.net>Climateprediction.net</a>:
study climate change
<li>
<a href=http://einstein.phys.uwm.edu/>Einstein@home</a>:
search for gravitational signals emitted by pulsars
<li>
<a href=http://lhcathome.cern.ch/>LHC@home</a>:
improve the design of the CERN LHC particle accelerator
<li>
<a href=http://predictor.scripps.edu>Predictor@home</a>:
investigate protein-related diseases
<li>
<a href=http://boinc.bakerlab.org/rosetta/>Rosetta@home<a>:
help researchers develop cures for human diseases
<li><a href=http://setiathome.berkeley.edu/>SETI@home</a>:
Look for radio evidence of extraterrestrial life
<li><a href=http://boinc.bio.wzw.tum.de/boincsimap/>SIMAP</a>:
<li> <a href=http://climateprediction.net>Climateprediction.net</a>:
study climate change
<li> <a href=http://einstein.phys.uwm.edu/>Einstein@home</a>:
search for gravitational signals emitted by pulsars
<li> <a href=http://lhcathome.cern.ch/>LHC@home</a>:
improve the design of the CERN LHC particle accelerator
<li> <a href=http://predictor.scripps.edu>Predictor@home</a>:
investigate protein-related diseases
<li> <a href=http://boinc.bakerlab.org/rosetta/>Rosetta@home</a>:
help researchers develop cures for human diseases
<li> <a href=http://setiathome.berkeley.edu/>SETI@home</a>:
look for radio evidence of extraterrestrial life
<li> <a href=http://boinc.bio.wzw.tum.de/boincsimap/>SIMAP</a>:
calculate protein similarity data for use by many biological
research projects.
<li> <a href=http://szdg.lpds.sztaki.hu/szdg/>SZTAKI Desktop Grid</a>:
search for generalized binary number systems.
<li> <a href=http://www.worldcommunitygrid.org/>World Community Grid</a>:
advance our knowledge of human disease. (Requires 5.2.1 or greater)
<li><a href=http://www.cellcomputing.net/>Cell Computing</a>
<li> <a href=http://www.cellcomputing.net/>Cell Computing</a>
biomedical research
(Japanese; requires nonstandard client software)
</ul>

View File

@ -186,76 +186,127 @@ when the end of the user-specified scheduling period is reached,
when new results become runnable,
or when the user performs a UI interaction
(e.g. suspending or resuming a project or result).
<h2>The work-fetch policy</h2>
<p>
X is the estimated wall time by which the number of
runnable results will fall below NCPUS.
<p>
min_queue is the user's network-connection period general preference.
<p>
work_fetch_OK is a flag set by the mode selection algorithm (see below).
<p>
The work-fetch policy maintains an 'overall urgency':
It does the following:
<ul>
<li>
<b>NEED_IMMEDIATELY</b>:
there is at least one idle CPU
<li>
<b>NEED</b>:
X &lt; min_queue
<li>
<b>OK</b>:
X &gt; min_queue, work_fetch_OK is true
<li>
<b>DONT_NEED</b>:
work_fetch_OK is false
</ul>
<p>
In addition, the work-fetch policy maintains a per-project work-fetch mode:
<p>
R(P) = fractional resource share of P
<p>
X(P) = estimated wall time when number of runnable results for P
will fall below NCPUS*R(P)
<ul>
<li>
<b>NEED_IMMEDIATELY</b>:
no results of P are runnable soon.
<li>
<b>NEED</b>:
X(P) &lt; min_queue * R(P)
<li>
<b>OK</b>:
X(P) > min_queue * R(P),
and P is not suspended or deferred or no-new-work
<li>
<b>DONT_NEED</b>:
P is suspended or deferred or no-new-work
</ul>
<p>
<h2>Mode selection</h2>
<p>
Work_fetch_OK is set to false if either
<ul>
<li> The sum of all RTFs is > 0.8
<li> The CRTF of any result is > 0.8
</ul>
EDF mode is used if either
<ul>
<li> The CRTF of any result is > 0.8
<li> The deadline of any result is earlier than one day from now
<li> The deadline of any result is less than
2 * min_queue from now.
<li> Do a simulation of round-robin scheduling
applied to the current work queue.
<li> If all results meet their deadlines,
use round-robin; otherwise, use EDF.
</ul>
<h2>Work-fetch policy</h2>
<p>
The work-fetch policy is defined in terms of a basic function
<pre>
time_until_work_done(project, N, subset_resource_share)
// estimate wall time until the number of uncompleted results
// for this project will reach N,
// given the total resource share for a set of competing projects
</pre>
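<p>
As an illustration only (not the actual client code), one simple way to compute
such an estimate is sketched below; it assumes the project's uncompleted results
are processed one at a time, in queue order, at the project's fractional share
of the host's total processing rate:
<pre>
// Illustrative sketch only -- not the client's implementation.
double time_until_work_done_sketch(
    const double* cpu_remaining,    // remaining CPU seconds of each uncompleted result
    int num_uncompleted,
    int n,                          // stop when this many results remain
    double resource_share,          // this project's resource share
    double subset_resource_share,   // total share of the competing set of projects
    double total_proc_rate          // CPU seconds per wall second, summed over CPUs
) {
    int must_finish = num_uncompleted - n;
    if (must_finish <= 0) return 0;
    double rate = total_proc_rate * resource_share / subset_resource_share;
    double cpu_secs = 0;
    for (int i = 0; i < must_finish; i++) {
        cpu_secs += cpu_remaining[i];
    }
    return cpu_secs / rate;
}
</pre>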
<p>
The work-fetch policy function is called every 5 seconds
(or as needed) by the scheduler RPC polling function.
It sets the following variables:
<ul>
<li> <b>global urgency</b>: one of
<ul>
<li><b>DONT_NEED</b>: CPU scheduler is in EDF mode,
or fetching additional work would make it so.
<li><b>OK</b>: we have enough work, but it's OK to get more
<li><b>NEED</b>: a CPU will be idle within min_queue
<li><b>NEED_IMMEDIATELY</b>: a CPU is idle.
</ul>
<li> For each project P
<br>
N = ncpus*(relative resource share)
<br>
prrs = potentially runnable resource share
<br>
X = time_until_work_done(P, N-1, prrs)
<ul>
<li><b>project urgency</b>
<ul>
<li><b>DONT_NEED</b>: P is suspended or deferred or no-new-work
<li><b>OK</b>: X > min_queue
<li><b>NEED</b>: X > 0
<li><b>NEED_IMMEDIATELY</b>: X == 0
</ul>
<li> <b>work request size</b>
(the number of seconds of work to request,
if we do a scheduler RPC to this project).
</ul>
</ul>
<p>
The scheduler RPC mechanism may select a project to contact
because of a user request, an outstanding trickle-up message,
or a result that is overdue for reporting.
If it does so, it will also request work from that project.
<p>
Otherwise, the RPC mechanism calls the following function and
gets work from that project, if any.
<pre>
next_project_need_work()
if global_urgency == DONT_NEED return null
Pbest = null;
for each project P
if P.urgency != DONT_NEED and P.work_request_size > 0
if P.urgency == OK and global_urgency == OK
continue
P.score = P.long_term_debt - time_until_work_done(P, 0, prrs)
if Pbest
if P.score > Pbest.score
Pbest = P
else
Pbest = P
return Pbest
</pre>
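<p>
The same selection logic as a compilable C++ sketch; ProjectInfo and its fields
are hypothetical stand-ins for the client's data structures, with
time_until_work_done(P, 0, prrs) assumed to be precomputed into time_until_done:
<pre>
// Hypothetical rendering of the pseudocode above, not the client's actual API.
enum { DONT_NEED, OK, NEED, NEED_IMMEDIATELY };

struct ProjectInfo {
    int urgency;
    double work_request_size;
    double long_term_debt;
    double time_until_done;   // time_until_work_done(P, 0, prrs), precomputed
    double score;
};

ProjectInfo* next_project_need_work_sketch(
    ProjectInfo* projects, int nprojects, int global_urgency
) {
    if (global_urgency == DONT_NEED) return 0;
    ProjectInfo* pbest = 0;
    for (int i = 0; i < nprojects; i++) {
        ProjectInfo* p = projects + i;
        if (p->urgency == DONT_NEED) continue;
        if (p->work_request_size <= 0) continue;
        if (p->urgency == OK && global_urgency == OK) continue;
        p->score = p->long_term_debt - p->time_until_done;
        if (!pbest || p->score > pbest->score) pbest = p;
    }
    return pbest;
}
</pre>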
<p>
The work-fetch policy function is as follows:
<pre>
// compute global urgency
x = delay until number of runnable results will be < ncpus
if x == 0
global_urgency = NEED_IMMEDIATELY
else
if CPU scheduling mode is EDF
global_urgency = DONT_NEED
else
P = project with greatest long-term debt
suppose we got work from P
if round-robin would then miss a deadline
global_urgency = DONT_NEED
else
if x &lt; min_queue
global_urgency = NEED
else
global_urgency = OK
// compute per-project urgencies and work request sizes
if global_urgency != DONT_NEED
for each project P
N = ncpus * (fractional potentially runnable resource_share)
(i.e. number of results we need on hand to
keep enough CPUs busy to maintain resource share)
x = time until # of runnable results for P will fall below N
if x == 0
P.urgency = NEED_IMMEDIATELY
else if x < min_queue
P.urgency = NEED
P.work_request_size = min_queue - x
else if global_urgency > OK
P.urgency = OK
P.work_request_size = 1
else
P.urgency = DONT_NEED
</pre>
";
page_tail();

doc/server_status.php Normal file
View File

@ -0,0 +1,53 @@
<?php
require_once("docutil.php");
page_head("Server status XML export");
echo "
BOINC-based projects offer the following XML export
at <code>URL/server_status.php</code>.
These are generally updated every 10 minutes or so -
do not poll more often than that.
These can be used to make web sites showing
the server status of multiple BOINC projects.
<p>
";
echo html_text("
<server_status>
<update_time>1128535206</update_time>
<daemon_status>
<daemon>
<host>jocelyn</host>
<command>BOINC database</command>
<status>running</status>
</daemon>
<daemon>
<host>castelli</host>
<command>master science database</command>
<status>running</status>
</daemon>
<daemon>
<host>klaatu</host>
<command>data-driven web pages</command>
<status>disabled</status>
</daemon>
<daemon>
<host>galileo</host>
<command>feeder</command>
<status>not running</status>
</daemon>
</daemon_status>
<database_file_states>
<results_ready_to_send>614830</results_ready_to_send>
<results_in_progress>1208959</results_in_progress>
<workunits_waiting_for_validation>8</workunits_waiting_for_validation>
<workunits_waiting_for_assimilation>2</workunits_waiting_for_assimilation>
<workunits_waiting_for_deletion>4</workunits_waiting_for_deletion>
<results_waiting_for_deletion>15</results_waiting_for_deletion>
<transitioner_backlog_hours>0.00083333334</transitioner_backlog_hours>
</database_file_states>
</server_status>");
page_tail();
?>
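For someone building such a multi-project status page, here is a minimal sketch of pulling fields out of this export. The parsing below is naive string matching, chosen only to keep the example self-contained; a real consumer should fetch URL/server_status.php over HTTP and use a proper XML parser (both are omitted here).
// Naive illustration of extracting fields from the server_status.php export.
#include <cstdio>
#include <string>

// return the text between <tag> and </tag>, or "" if absent
std::string xml_element(const std::string& xml, const std::string& tag) {
    std::string open = "<" + tag + ">", close = "</" + tag + ">";
    size_t a = xml.find(open);
    if (a == std::string::npos) return "";
    a += open.size();
    size_t b = xml.find(close, a);
    if (b == std::string::npos) return "";
    return xml.substr(a, b - a);
}

int main() {
    // in practice this string would be the body fetched from URL/server_status.php
    std::string xml =
        "<server_status>"
        "<update_time>1128535206</update_time>"
        "<database_file_states>"
        "<results_ready_to_send>614830</results_ready_to_send>"
        "</database_file_states>"
        "</server_status>";
    printf("updated at %s, %s results ready to send\n",
        xml_element(xml, "update_time").c_str(),
        xml_element(xml, "results_ready_to_send").c_str());
    return 0;
}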