From f433b73f6be55ecbdb2a6707fd4a8e25efa20ad2 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Sat, 11 Feb 2006 03:00:37 +0000 Subject: [PATCH] *** empty log message *** svn path=/trunk/boinc/; revision=9445 --- checkin_notes | 9 +++ client/client_msgs.h | 1 + client/client_types.h | 3 +- client/cpu_sched.C | 69 ++++++++-------- client/cs_scheduler.C | 25 +++--- doc/boinc_news.inc | 8 ++ doc/index.php | 35 ++++---- doc/sched.php | 183 +++++++++++++++++++++++++++--------------- doc/server_status.php | 53 ++++++++++++ 9 files changed, 257 insertions(+), 129 deletions(-) create mode 100644 doc/server_status.php diff --git a/checkin_notes b/checkin_notes index 27310f7afa..6822ec4463 100755 --- a/checkin_notes +++ b/checkin_notes @@ -1708,3 +1708,12 @@ Charlie 8 Feb 2006 mac_build/ boinc.xcodeproj/ project.pbxproj + +David 10 Feb 2005 + - cosmetic + + client/ + client_msgs.h + client_types.h + cpu_sched.C + cs_scheduler.C diff --git a/client/client_msgs.h b/client/client_msgs.h index 11e98a8d62..d761e7901f 100644 --- a/client/client_msgs.h +++ b/client/client_msgs.h @@ -91,6 +91,7 @@ public: DEBUG_POLL, // show what polls are responding DEBUG_GUIRPC, DEBUG_SCHED_CPU, + DEBUG_WORK_FETCH, DEBUG_SCRSAVE }; CLIENT_MSG_LOG(): MSG_LOG(stdout) {} diff --git a/client/client_types.h b/client/client_types.h index 502830fa18..25641e2b90 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -254,6 +254,7 @@ public: std::vectoractive; std::vectorpending; double rrsim_proc_rate; + // fraction of each CPU this project will get in RR simulation void set_rrsim_proc_rate(double per_cpu_proc_rate, double rrs); // "debt" is how much CPU time we owe this project relative to others @@ -284,7 +285,7 @@ public: // X seconds of wall-clock time to complete, // taking into account // 1) other projects and resource share; - // 2) on_frac and active_frac + // 2) on_frac, active_frac, and cpu_effiency // see doc/work_req.php int work_request_urgency; diff --git a/client/cpu_sched.C b/client/cpu_sched.C index 1eb0903b09..8234dfa795 100644 --- a/client/cpu_sched.C +++ b/client/cpu_sched.C @@ -422,34 +422,6 @@ void CLIENT_STATE::enforce_schedule() { } } -// set the project's rrsim_proc_rate: -// the fraction of each CPU that it will get in round-robin mode -// -void PROJECT::set_rrsim_proc_rate(double per_cpu_proc_rate, double rrs) { - int nactive = (int)active.size(); - if (nactive == 0) return; - double x; - if (rrs) { - x = resource_share/rrs; - } else { - x = 1; // TODO - fix - } - - // if this project has fewer active results than CPUs, - // scale up its share to reflect this - // - if (nactive < gstate.ncpus) { - x *= ((double)gstate.ncpus)/nactive; - } - - // But its rate on a given CPU can't exceed the CPU speed - // - if (x>1) { - x = 1; - } - rrsim_proc_rate = x*per_cpu_proc_rate*CPU_PESSIMISM_FACTOR; -} - // return true if we don't have enough runnable tasks to keep all CPUs busy // bool CLIENT_STATE::no_work_for_a_cpu() { @@ -465,7 +437,39 @@ bool CLIENT_STATE::no_work_for_a_cpu() { return ncpus > count; } -// return true if round-robin scheduling will miss a deadline +// Set the project's rrsim_proc_rate: +// the fraction of each CPU that it will get in round-robin mode. 
+// Precondition: the project's "active" array is populated +// +void PROJECT::set_rrsim_proc_rate(double per_cpu_proc_rate, double rrs) { + int nactive = (int)active.size(); + if (nactive == 0) return; + double x; + + if (rrs) { + x = resource_share/rrs; + } else { + x = 1; // pathological case; maybe should be 1/# runnable projects + } + + // if this project has fewer active results than CPUs, + // scale up its share to reflect this + // + if (nactive < gstate.ncpus) { + x *= ((double)gstate.ncpus)/nactive; + } + + // But its rate on a given CPU can't exceed 1 + // + if (x>1) { + x = 1; + } + rrsim_proc_rate = x*per_cpu_proc_rate*CPU_PESSIMISM_FACTOR; +} + +// return true if round-robin scheduling will miss a deadline. +// per_cpu_proc_rate is the expected number of CPU seconds per wall second +// on each CPU; rrs is the resource share of runnable projects // bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { PROJECT* p, *pbest; @@ -486,7 +490,6 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { p->pending.clear(); } - for (i=0; iaborted_via_gui) continue; @@ -596,8 +599,10 @@ bool CLIENT_STATE::rr_misses_deadline(double per_cpu_proc_rate, double rrs) { // void CLIENT_STATE::set_scheduler_mode() { bool use_earliest_deadline_first = false; - double total_proc_rate = avg_proc_rate(); - double per_cpu_proc_rate = total_proc_rate/ncpus; + double per_cpu_proc_rate = avg_proc_rate()/ncpus; + // how many CPU seconds per wall second we get on each CPU, + // taking into account on_frac, active_frac, and cpu_efficiency + double rrs = runnable_resource_share(); if (rr_misses_deadline(per_cpu_proc_rate, rrs)) { diff --git a/client/cs_scheduler.C b/client/cs_scheduler.C index 201e6aae5e..84d1fc62de 100644 --- a/client/cs_scheduler.C +++ b/client/cs_scheduler.C @@ -147,7 +147,7 @@ PROJECT* CLIENT_STATE::next_project_trickle_up_pending() { // Return the best project to fetch work from, NULL if none // -// Basically, pick the one with largest long term debt - amount of current work +// Pick the one with largest (long term debt - amount of current work) // // PRECONDITIONS: // - work_request_urgency and work_request set for all projects @@ -160,7 +160,7 @@ PROJECT* CLIENT_STATE::next_project_need_work() { unsigned int i; double prrs = potentially_runnable_resource_share(); - for (i=0; iwork_request_urgency == WORK_FETCH_DONT_NEED) continue; if (p->work_request == 0) continue; @@ -502,11 +502,11 @@ double CLIENT_STATE::time_until_work_done( } } -// Compute: -// - work_request and work_request_urgency for all projects. +// Top-level function for work fetch policy. +// Outputs: // - overall_work_fetch_urgency -// -// Only set non-zero work requests for projects that are contactable +// - for each contactable project: +// - work_request and work_request_urgency // int CLIENT_STATE::compute_work_requests() { unsigned int i; @@ -517,7 +517,7 @@ int CLIENT_STATE::compute_work_requests() { SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_SCHED_CPU); overall_work_fetch_urgency = WORK_FETCH_DONT_NEED; - for (i = 0; i < projects.size(); ++i) { + for (i=0; i< projects.size(); i++) { projects[i]->work_request_urgency = WORK_FETCH_DONT_NEED; projects[i]->work_request = 0; } @@ -556,7 +556,7 @@ int CLIENT_STATE::compute_work_requests() { // Then estimate how long it's going to be until we have fewer // than this # of results remaining. 
// - for (i=0; iwork_request = 0; @@ -1034,14 +1034,17 @@ int CLIENT_STATE::handle_scheduler_reply( } bool CLIENT_STATE::should_get_work() { - // if there are fewer runnable results then CPUS, we need more work. + // if there are fewer runnable results than CPUS, we need more work. // if (no_work_for_a_cpu()) return true; double tot_cpu_time_remaining = 0; - for (unsigned int i = 0; i < results.size();++i) { + for (unsigned int i=0; iestimated_cpu_time_remaining(); } + + // ????? shouldn't we scale by ncpus? by avg_proc_rate()?? + // if (tot_cpu_time_remaining < global_prefs.work_buf_min_days*SECONDS_PER_DAY) { return true; } @@ -1096,7 +1099,7 @@ void CLIENT_STATE::set_work_fetch_mode() { double CLIENT_STATE::work_needed_secs() { double total_work = 0; - for( unsigned int i = 0; i < results.size(); ++i) { + for(unsigned int i=0; iproject->non_cpu_intensive) continue; total_work += results[i]->estimated_cpu_time_remaining(); } diff --git a/doc/boinc_news.inc b/doc/boinc_news.inc index 2b14865096..78a18a7eea 100644 --- a/doc/boinc_news.inc +++ b/doc/boinc_news.inc @@ -1,6 +1,14 @@ SZTAKI Desktop Grid + is now in production. + The project, based at the MTA-SZTAKI Laboratory of Parallel + and Distributed Systems in Budapest, Hungary, is attempting + to find all the generalized binary number systems + (bases are matrices and digits are vectors) up to dimension 11." +), array("February 6, 2006", "The Similarity Matrix of Proteins (SIMAP) project is now in production. diff --git a/doc/index.php b/doc/index.php index a14da0ed7b..5ef476450c 100644 --- a/doc/index.php +++ b/doc/index.php @@ -37,29 +37,26 @@ resources. BOINC lets you donate computing power to scientific research projects such as: diff --git a/doc/sched.php b/doc/sched.php index ecc28f2470..01787f0d27 100644 --- a/doc/sched.php +++ b/doc/sched.php @@ -186,76 +186,127 @@ when the end of the user-specified scheduling period is reached, when new results become runnable, or when the user performs a UI interaction (e.g. suspending or resuming a project or result). - - -

The work-fetch policy
-
-X is the estimated wall time by which the number of
-runnable results will fall below NCPUS.
-
-min_queue is the user's network-connection period general preference.
-
-work_fetch_OK is a flag set by the mode selection algorithm (see below).
-
-The work-fetch policy maintains an 'overall urgency':
+It does the following:
-
-  • NEED_IMMEDIATELY: there is at least one idle CPU
-  • NEED: X < min_queue
-  • OK: X > min_queue, work_fetch_OK is true
-  • DONT_NEED: work_fetch_OK is false
-
-In addition, the work-fetch policy maintains a per-project work-fetch mode:
-
-R(P) = fractional resource share of P
-
-X(P) = estimated wall time when number of runnable results for P
-will fall below NCPUS*R(P)
-
-  • NEED_IMMEDIATELY: no results of P are runnable soon.
-  • NEED: X(P) < min_queue * R(P)
-  • OK: X(P) > min_queue * R(P),
-    and P is not suspended or deferred or no-new-work
-  • DONT_NEED: P is suspended or deferred or no-new-work
-
Mode selection
-
-Work_fetch_OK is set to false if either
-  • The sum of all RTFs is > 0.8
-  • The CRTF of any result is > 0.8
-
-EDF mode is used if either
-  • The CRTF of any result is > 0.8
-  • The deadline of any result is earlier than one day from now
-  • The deadline of any result is less than 2 * min_queue from now.
+
+  • Do a simulation of round-robin scheduling
+    applied to the current work queue.
+  • If all results meet their deadlines,
+    use round-robin; otherwise, use EDF.
+
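The client-side hook for this decision is CLIENT_STATE::set_scheduler_mode() in
cpu_sched.C, partially visible in the hunk above. The following is a minimal
sketch of how those pieces fit together; the assignment inside the if, and
whatever later consumes use_earliest_deadline_first, are not shown in this
patch and are assumptions here.

    // Sketch assembled from the cpu_sched.C hunk above; the final assignment
    // and the consumer of use_earliest_deadline_first are assumptions.
    void CLIENT_STATE::set_scheduler_mode() {
        bool use_earliest_deadline_first = false;
        double per_cpu_proc_rate = avg_proc_rate()/ncpus;
            // CPU seconds per wall second we get on each CPU,
            // taking into account on_frac, active_frac, and cpu_efficiency
        double rrs = runnable_resource_share();
            // total resource share of projects with runnable results

        // Simulate round-robin on the current work queue;
        // switch to EDF only if some result would miss its deadline.
        if (rr_misses_deadline(per_cpu_proc_rate, rrs)) {
            use_earliest_deadline_first = true;   // assumption: not shown in the hunk
        }
        // ... the CPU scheduler then runs in the selected mode
    }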

Work-fetch policy
+
+The work-fetch policy is defined in terms of a basic function:
+
+time_until_work_done(project, N, subset_resource_share)
+    // estimate wall time until the number of uncompleted results
+    // for this project will reach N,
+    // given the total resource share for a set of competing projects
+
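The patch adds a concrete time_until_work_done() to cs_scheduler.C, but its
body is not part of this hunk. Purely as an illustration of the kind of
estimate described above, here is a self-contained sketch; every name in it is
local to the sketch, and it makes two simplifying assumptions: the project's
shortest results finish first, and the project receives
resource_share/subset_resource_share of the total processing rate.

    #include <algorithm>
    #include <vector>

    // Illustrative sketch only, not the cs_scheduler.C implementation:
    // estimated wall time until this project has at most n uncompleted results.
    double time_until_work_done_sketch(
        std::vector<double> remaining_cpu_secs,  // per-result CPU time remaining
        int n,                                   // target number of uncompleted results
        double resource_share,                   // this project's resource share
        double subset_resource_share,            // total share of the competing set
        double total_proc_rate                   // CPU seconds per wall second, all CPUs
    ) {
        if (n < 0) n = 0;
        if ((int)remaining_cpu_secs.size() <= n) return 0;

        // CPU time that must be delivered before the count drops to n,
        // assuming the shortest results finish first
        std::sort(remaining_cpu_secs.begin(), remaining_cpu_secs.end());
        double cpu_secs = 0;
        for (size_t i = 0; i + n < remaining_cpu_secs.size(); i++) {
            cpu_secs += remaining_cpu_secs[i];
        }

        // the project's share of the total processing rate
        double frac = subset_resource_share > 0 ? resource_share/subset_resource_share : 1;
        double rate = total_proc_rate * frac;
        return rate > 0 ? cpu_secs/rate : 0;
    }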

+The work-fetch policy function is called every 5 seconds
+(or as needed) by the scheduler RPC polling function.
+It sets the following variables:
+

+  • global urgency: one of
+      • DONT_NEED: CPU scheduler is in EDF mode,
+        or fetching additional work would make it so.
+      • OK: we have enough work, but it's OK to get more
+      • NEED: a CPU will be idle within min_queue
+      • NEED_IMMEDIATELY: a CPU is idle.
+  • For each project P, where
+        N = ncpus*(relative resource share)
+        prrs = potentially runnable resource share
+        X = time_until_work_done(P, N-1, prrs)
+      • project urgency
+          • DONT_NEED: P is suspended or deferred or no-new-work
+          • OK: X > min_queue
+          • NEED: 0 < X < min_queue
+          • NEED_IMMEDIATELY: X == 0
+      • work request size
+        (the number of seconds of work to request,
+        if we do a scheduler RPC to this project).
+ +
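For comparisons such as "global_urgency > OK" in the pseudocode below to make
sense, the urgency levels are ordered constants. A minimal sketch;
WORK_FETCH_DONT_NEED appears in the cs_scheduler.C changes above, while the
other names and the exact numeric values are assumptions that follow the same
pattern.

    // Sketch of the urgency ordering assumed by the policy below.
    // WORK_FETCH_DONT_NEED is visible in cs_scheduler.C above; the remaining
    // names and values are assumptions for illustration.
    enum work_fetch_urgency {
        WORK_FETCH_DONT_NEED = 0,        // don't request work from this project
        WORK_FETCH_OK = 1,               // enough work on hand, but more is acceptable
        WORK_FETCH_NEED = 2,             // a CPU will be idle within min_queue
        WORK_FETCH_NEED_IMMEDIATELY = 3  // a CPU is idle right now
    };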

+The scheduler RPC mechanism may select a project to contact
+because of a user request, an outstanding trickle-up message,
+or a result that is overdue for reporting.
+If it does so, it will also request work from that project.
+
+Otherwise, the RPC mechanism calls the following function and
+gets work from that project, if any:
+
+next_project_need_work()
+    if global_urgency == DONT_NEED return null
+    Pbest = null
+    for each project P
+        if P.urgency == DONT_NEED or P.work_request_size == 0
+            continue
+        if P.urgency == OK and global_urgency == OK
+            continue
+        P.score = P.long_term_debt - time_until_work_done(P, 0, prrs)
+        if Pbest
+            if P.score > Pbest.score
+                Pbest = P
+        else
+            Pbest = P
+    return Pbest
+

+The work-fetch policy function is as follows:
+
+// compute global urgency
+
+x = delay until number of runnable results will be < ncpus
+if x == 0
+    global_urgency = NEED_IMMEDIATELY
+else
+    if CPU scheduling mode is EDF
+        global_urgency = DONT_NEED
+    else
+        P = project with greatest long-term debt
+        suppose we got work from P
+        if round-robin would then miss a deadline
+            global_urgency = DONT_NEED
+        else
+            if x < min_queue
+                global_urgency = NEED
+            else
+                global_urgency = OK
+    
+// compute per-project urgencies and work request sizes
+
+if global_urgency != DONT_NEED
+    for each project P
+        N = ncpus*(fractional potentially runnable resource_share)
+            (i.e. number of results we need on hand to
+            keep enough CPUs busy to maintain resource share)
+        x = time until # of runnable results for P will fall below N
+        if x == 0
+            P.urgency = NEED_IMMEDIATELY
+        else if x < min_queue
+            P.urgency = NEED
+            P.work_request_size = min_queue - x
+        else if global_urgency > OK
+            P.urgency = OK
+            P.work_request_size = 1
+        else
+            P.urgency = DONT_NEED
+
"; page_tail(); diff --git a/doc/server_status.php b/doc/server_status.php new file mode 100644 index 0000000000..032766a84a --- /dev/null +++ b/doc/server_status.php @@ -0,0 +1,53 @@ +URL/server_status.php. +These are generally updated every 10 minutes or so - +do not poll more often than that. +These can be used to make web sites showing +the server status of multiple BOINC projects. +

+"; +echo html_text(" + + 1128535206 + + + jocelyn + BOINC database + running + + + castelli + master science database + running + + + klaatu + data-driven web pages + disabled + + + galileo + feeder + not running + + + + 614830 + 1208959 + 8 + 2 + 4 + 15 + 0.00083333334 + +"); + +page_tail(); + +?>
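As a sanity check on the "status web site" use case above: fetching one of
these pages and pulling a single value out of it takes very little code. A
minimal C++ sketch using libcurl (assumed available); the project URL and the
element name results_ready_to_send are placeholders, so check a live project's
output for the actual element names.

    #include <curl/curl.h>
    #include <iostream>
    #include <string>

    // Append fetched data to a std::string (libcurl write callback).
    static size_t append_body(char* data, size_t size, size_t nmemb, void* userp) {
        static_cast<std::string*>(userp)->append(data, size*nmemb);
        return size*nmemb;
    }

    // Naive extraction of the text between <tag> and </tag>;
    // adequate for this flat format, but a real site should use an XML parser.
    static std::string xml_value(const std::string& doc, const std::string& tag) {
        std::string open = "<" + tag + ">", close = "</" + tag + ">";
        size_t a = doc.find(open);
        if (a == std::string::npos) return "";
        a += open.size();
        size_t b = doc.find(close, a);
        return (b == std::string::npos) ? "" : doc.substr(a, b - a);
    }

    int main() {
        std::string body;
        curl_global_init(CURL_GLOBAL_DEFAULT);
        CURL* curl = curl_easy_init();
        if (!curl) return 1;
        // placeholder URL; use the project's real URL
        curl_easy_setopt(curl, CURLOPT_URL, "http://a-boinc-project.example/server_status.php");
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, append_body);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &body);
        if (curl_easy_perform(curl) == CURLE_OK) {
            // placeholder element name; check the project's actual output
            std::cout << "results ready to send: "
                      << xml_value(body, "results_ready_to_send") << "\n";
        }
        curl_easy_cleanup(curl);
        curl_global_cleanup();
        return 0;
    }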