From e79d3ea4c8c84c248d9ace744dd7a5277a3462fd Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Wed, 22 Aug 2012 04:02:52 +0000
Subject: [PATCH] - client: change the way project disk share is computed.    
 - Allow projects to report "desired disk usage" (DDU).         If the client
 learns that a project wants disk space,         it can shrink the allocation
 to other projects.     - Base share computation on DDU rather than disk
 usage.     - Introduce the notion of "disk resource share".         This is
 defined (somewhat arbitrarily) as resource share         plus 1/10 of the
 largest resource share.         This is intended to ensure that even
 zero-share projects         get enough disk space to store app versions and
 data files;         otherwise they wouldn't be able to compute. - server: use
 host.d_boinc_max (which wasn't being used)     to store d_project_share
 reported by client. - volunteer storage: change the way hosts are allocated
 to chunks.     Allow hosts to store several chunks of the same file, if
 needed

svn path=/trunk/boinc/; revision=26052
---
 checkin_notes           | 30 +++++++++++++
 client/client_state.h   |  1 +
 client/cs_prefs.cpp     | 68 +++++++++++++++++++++--------
 client/project.cpp      |  5 +++
 client/project.h        | 10 ++++-
 client/scheduler_op.cpp |  2 +
 db/boinc_db_types.h     | 10 ++---
 sched/sched_types.cpp   |  2 +-
 vda/sched_vda.cpp       |  6 +--
 vda/vda_lib.h           | 24 ++++++++---
 vda/vda_lib2.cpp        | 94 +++++++++++++++++++++++++++++++++++------
 11 files changed, 204 insertions(+), 48 deletions(-)

diff --git a/checkin_notes b/checkin_notes
index 3c44205b98..f64db9762a 100644
--- a/checkin_notes
+++ b/checkin_notes
@@ -5647,3 +5647,33 @@ David  20 Aug 2012
     vda/
         vdad.cpp
         vda_lib2.dpp
+
+David  21 Aug 2012
+    - client: change the way project disk share is computed.
+        - Allow projects to report "desired disk usage" (DDU).
+            If the client learns that a project wants disk space,
+            it can shrink the allocation to other projects.
+        - Base share computation on DDU rather than disk usage.
+        - Introduce the notion of "disk resource share".
+            This is defined (somewhat arbitrarily) as resource share
+            plus 1/10 of the largest resource share.
+            This is intended to ensure that even zero-share projects
+            get enough disk space to store app versions and data files;
+            otherwise they wouldn't be able to compute.
+    - server: use host.d_boinc_max (which wasn't being used)
+        to store d_project_share reported by client.
+    - volunteer storage: change the way hosts are allocated to chunks.
+        Allow hosts to store several chunks of the same file, if needed
+
+    vda/
+        vda_lib2.cpp
+        vda_lib.h
+    client/
+        project.cpp,h
+        client_state.h
+        cs_prefs.cpp
+        scheduler_op.cpp
+    db/
+        boinc_db_types.h
+    sched/
+        sched_types.cpp
diff --git a/client/client_state.h b/client/client_state.h
index 87b61877d5..1681947516 100644
--- a/client/client_state.h
+++ b/client/client_state.h
@@ -379,6 +379,7 @@ struct CLIENT_STATE {
         // disk usage not counting projects
         // computed by get_disk_usages()
     double total_disk_usage;
+        // client plus projects
     int get_disk_usages();
     void get_disk_shares();
     double allowed_disk_usage(double boinc_total);
diff --git a/client/cs_prefs.cpp b/client/cs_prefs.cpp
index e4c05cbfa2..36a6491e34 100644
--- a/client/cs_prefs.cpp
+++ b/client/cs_prefs.cpp
@@ -114,37 +114,68 @@ int CLIENT_STATE::get_disk_usages() {
     return 0;
 }
 
-// populate PROJECT::disk_share for all projects
+// populate PROJECT::disk_share for all projects,
+// i.e. the max space we should allocate to the project.
+// This is calculated as follows:
+// - each project has a "disk_resource_share" (DRS)
+//   This is the resource share plus .1*(max resource share).
+//   This ensures that backup projects get some disk.
+// - each project has a "desired_disk_usage" (DDU),
+//   which is either its current usage
+//   or an amount sent from the scheduler.
+// - each project has a "quota": (available space)*(drs/total_drs).
+// - a project is "greedy" if DDU > quota.
+// - if a project is non-greedy, share = quota
+// - X = available space - total DDU of non-greedy projects
+// - if a project is greedy,
+//   share = X*drs/(total drs of greedy projects)
 //
 void CLIENT_STATE::get_disk_shares() {
     PROJECT* p;
     unsigned int i;
 
-    double rss = 0;
+    // compute disk resource shares
+    //
+    double trs = 0;
+    double max_rs = 0;
     for (i=0; i<projects.size(); i++) {
         p = projects[i];
-        rss += p->resource_share;
-        p->disk_share = p->disk_usage;
+        p->ddu = std::max(p->disk_usage, p->desired_disk_usage);
+        double rs = p->resource_share;
+        trs += rs;
+        if (rs > max_rs) max_rs = rs;
+    }
+    if (trs) {
+        max_rs /= 10;
+        for (i=0; i<projects.size(); i++) {
+            p = projects[i];
+            p->disk_resource_share = p->resource_share + max_rs;
+        }
+    } else {
+        for (i=0; i<projects.size(); i++) {
+            p = projects[i];
+            p->disk_resource_share = 1;
+        }
     }
-    if (!rss) return;
 
-    // a project is "greedy" if it's using more than its share of disk
+    // Compute:
+    // greedy_drs: total disk resource share of greedy projects
+    // non_greedy_ddu: total desired disk usage of non-greedy projects
     //
-    double greedy_rs = 0;
-    double non_greedy_usage = 0;
+    double greedy_drs = 0;
+    double non_greedy_ddu = 0;
     double allowed = allowed_disk_usage(total_disk_usage);
     for (i=0; i<projects.size(); i++) {
         p = projects[i];
-        double rs = p->resource_share/rss;
-        if (p->disk_usage > allowed*rs) {
-            greedy_rs += p->resource_share;
+        p->disk_quota = allowed*p->disk_resource_share/trs;
+        if (p->ddu > p->disk_quota) {
+            greedy_drs += p->disk_resource_share;
         } else {
-            non_greedy_usage += p->disk_usage;
+            non_greedy_ddu += p->ddu;
         }
     }
-    if (!greedy_rs) greedy_rs = 1;      // handle projects w/ zero resource share
 
-    double greedy_allowed = allowed - non_greedy_usage;
+    double greedy_allowed = allowed - non_greedy_ddu;
     if (log_flags.disk_usage_debug) {
         msg_printf(0, MSG_INFO,
             "[disk_usage] allowed %.2fMB used %.2fMB",
@@ -153,9 +184,11 @@ void CLIENT_STATE::get_disk_shares() {
     }
     for (i=0; i<projects.size(); i++) {
         p = projects[i];
-        double rs = p->resource_share/rss;
-        if (p->disk_usage > allowed*rs) {
-            p->disk_share = greedy_allowed*p->resource_share/greedy_rs;
+        double rs = p->disk_resource_share/trs;
+        if (p->ddu > allowed*rs) {
+            p->disk_share = greedy_allowed*p->disk_resource_share/greedy_drs;
+        } else {
+            p->disk_share = p->disk_quota;
         }
         if (log_flags.disk_usage_debug) {
             msg_printf(p, MSG_INFO,
@@ -650,4 +683,3 @@ double CLIENT_STATE::max_available_ram() {
         global_prefs.ram_max_used_busy_frac, global_prefs.ram_max_used_idle_frac
     );
 }
-
diff --git a/client/project.cpp b/client/project.cpp
index 8058b8be3b..69f90d44ca 100644
--- a/client/project.cpp
+++ b/client/project.cpp
@@ -37,6 +37,7 @@ void PROJECT::init() {
     project_specific_prefs = "";
     gui_urls = "";
     resource_share = 100;
+    desired_disk_usage = 0;
     for (int i=0; i<MAX_RSC; i++) {
         no_rsc_pref[i] = false;
         no_rsc_config[i] = false;
@@ -292,6 +293,7 @@ int PROJECT::parse_state(XML_PARSER& xp) {
             trickle_up_ops.push_back(new TRICKLE_UP_OP(stemp));
             continue;
         }
+        if (xp.parse_double("desired_disk_usage", desired_disk_usage)) continue;
         if (log_flags.unparsed_xml) {
             msg_printf(0, MSG_INFO,
                 "[unparsed_xml] PROJECT::parse_state(): unrecognized: %s",
@@ -343,6 +345,7 @@ int PROJECT::write_state(MIOFILE& out, bool gui_rpc) {
         "    <rec_time>%f</rec_time>\n"
 
         "    <resource_share>%f</resource_share>\n"
+        "    <desired_disk_usage>%f</desired_disk_usage>\n"
         "    <duration_correction_factor>%f</duration_correction_factor>\n"
         "    <sched_rpc_pending>%d</sched_rpc_pending>\n"
         "    <send_time_stats_log>%d</send_time_stats_log>\n"
@@ -374,6 +377,7 @@ int PROJECT::write_state(MIOFILE& out, bool gui_rpc) {
         pwf.rec,
         pwf.rec_time,
         resource_share,
+        desired_disk_usage,
         duration_correction_factor,
         sched_rpc_pending,
         send_time_stats_log,
@@ -529,6 +533,7 @@ void PROJECT::copy_state_fields(PROJECT& p) {
     if (ams_resource_share >= 0) {
         resource_share = ams_resource_share;
     }
+    desired_disk_usage = p.desired_disk_usage;
     use_symlinks = p.use_symlinks;
 }
 
diff --git a/client/project.h b/client/project.h
index a3256489d1..74dacb8805 100644
--- a/client/project.h
+++ b/client/project.h
@@ -39,7 +39,15 @@ struct PROJECT : PROJ_AM {
     double resource_share;
         // project's resource share relative to other projects.
     double resource_share_frac;
-        // fraction of RS of non-suspended, compute-intensive projects
+        // temp; fraction of RS of non-suspended, compute-intensive projects
+    double disk_resource_share;
+        // temp in get_disk_shares()
+    double desired_disk_usage;
+        // reported by project
+    double ddu;
+        // temp in get_disk_shares()
+    double disk_quota;
+        // temp in get_disk_shares()
 
     // the following are from the user's project prefs
     //
diff --git a/client/scheduler_op.cpp b/client/scheduler_op.cpp
index ee8c377c94..3e9d2ae5bb 100644
--- a/client/scheduler_op.cpp
+++ b/client/scheduler_op.cpp
@@ -899,6 +899,8 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
             continue;
         } else if (xp.parse_int("teamid", project->teamid)) {
             continue;
+        } else if (xp.parse_double("desired_disk_usage", project->desired_disk_usage)) {
+            continue;
         } else {
             if (log_flags.unparsed_xml) {
                 msg_printf(project, MSG_INFO,
diff --git a/db/boinc_db_types.h b/db/boinc_db_types.h
index 5d709b417d..f495c81974 100644
--- a/db/boinc_db_types.h
+++ b/db/boinc_db_types.h
@@ -287,11 +287,10 @@ struct HOST {
     double d_boinc_used_project;
                             // amount being used for this project
 
-    // The following item is not used.
-    // It's redundant (server can compute based on other params and prefs)
-    //
-    double d_boinc_max;     // max disk space that BOINC is allowed to use,
-                            // reflecting user preferences
+    double d_boinc_max;
+        // This field has been repurposed.
+        // It's now used to store the project's share of available disk space
+        // (reported by recent clients as <d_project_share>).
     double n_bwup;          // Average upload bandwidth, bytes/sec
     double n_bwdown;        // Average download bandwidth, bytes/sec
                             // The above are derived from actual
@@ -324,7 +323,6 @@ struct HOST {
     char p_features[1024];
     char virtualbox_version[256];
     bool p_vm_extensions_disabled;
-    double d_project_share; // this project's share of available disk space
 
     int parse(XML_PARSER&);
     int parse_time_stats(XML_PARSER&);
diff --git a/sched/sched_types.cpp b/sched/sched_types.cpp
index 104ffb62d7..506005bfd0 100644
--- a/sched/sched_types.cpp
+++ b/sched/sched_types.cpp
@@ -1260,7 +1260,7 @@ int HOST::parse_disk_usage(XML_PARSER& xp) {
         if (xp.match_tag("/disk_usage")) return 0;
         if (xp.parse_double("d_boinc_used_total", d_boinc_used_total)) continue;
         if (xp.parse_double("d_boinc_used_project", d_boinc_used_project)) continue;
-        if (xp.parse_double("d_project_share", d_project_share)) continue;
+        if (xp.parse_double("d_project_share", d_boinc_max)) continue;
         log_messages.printf(MSG_NORMAL,
             "HOST::parse_disk_usage(): unrecognized: %s\n",
             xp.parsed_tag
diff --git a/vda/sched_vda.cpp b/vda/sched_vda.cpp
index 0365ec6fa5..92c631d0a9 100644
--- a/vda/sched_vda.cpp
+++ b/vda/sched_vda.cpp
@@ -345,17 +345,17 @@ static int process_chunks_missing_on_client(CHUNK_LIST& chunks) {
 // remove some chunks and mark vda_files for update
 //
 static int enforce_quota(CHUNK_LIST& chunks) {
-    if (!g_request->host.d_project_share) return 0;
+    if (!g_request->host.d_boinc_max) return 0;
 
     double x = g_request->host.d_boinc_used_project;
     if (config.debug_vda) {
         log_messages.printf(MSG_NORMAL,
             "[vda] share: %f used: %f\n",
-            g_request->host.d_project_share, x
+            g_request->host.d_boinc_max, x
         );
     }
     CHUNK_LIST::iterator it = chunks.begin();
-    while (x > g_request->host.d_project_share && it != chunks.end()) {
+    while (x > g_request->host.d_boinc_max && it != chunks.end()) {
         DB_VDA_CHUNK_HOST& ch = it->second;
         if (!ch.found) continue;
         FILE_INFO fi;
diff --git a/vda/vda_lib.h b/vda/vda_lib.h
index 715a3e4fbc..fe926a8c26 100644
--- a/vda/vda_lib.h
+++ b/vda/vda_lib.h
@@ -43,11 +43,6 @@ struct VDA_FILE_AUX : VDA_FILE {
     POLICY policy;
     META_CHUNK* meta_chunk;
 
-    VDA_FILE_AUX(){
-        meta_chunk = NULL;
-    }
-    VDA_FILE_AUX(DB_VDA_FILE f) : VDA_FILE(f){}
-
     // the following for the simulator
     //
     double accounting_start_time;
@@ -63,14 +58,29 @@ struct VDA_FILE_AUX : VDA_FILE {
     inline bool collecting_stats() {
         return (pending_init_downloads == 0);
     }
+    VDA_FILE_AUX(){
+        meta_chunk = NULL;
+    }
 
     // the following for vdad
     //
-    std::vector<int> available_hosts;
-        // list of IDs of hosts with no chunks of this file
+    DB_HOST enum_host;
+    char enum_query[256];
+    int max_chunks;
+    int last_id;
+    bool enum_active;
+    bool found_this_scan;
+    bool found_any_this_scan;
+    bool found_any_this_enum;
+
     int init();
     int get_state();
     int choose_host();
+
+    VDA_FILE_AUX(DB_VDA_FILE f) : VDA_FILE(f) {
+        max_chunks = 0;
+        enum_active = false;
+    }
 };
 
 #define PRESENT         0
diff --git a/vda/vda_lib2.cpp b/vda/vda_lib2.cpp
index de2a7e500c..0717973641 100644
--- a/vda/vda_lib2.cpp
+++ b/vda/vda_lib2.cpp
@@ -570,23 +570,92 @@ int VDA_FILE_AUX::get_state() {
 }
 
 // Pick a host to send a chunk of this file to.
-// The host must:
-// 1) be alive (recent RPC time)
-// 2) not have any chunks of this file
-//
-// We maintain a cache of such hosts
+// We want to pick the host that has the fewest chunks
+// of this file already (preferably zero).
 // The policy is:
-//
-// - scan the cache, removing hosts that are no longer alive;
-//   return if find a live host
-// - pick a random starting point in host ID space,
-//   and enumerate 100 live hosts; wrap around if needed.
-//   Return one and put the rest in cache
+// - maintain a threshold "max_chunks".
+// - enumerate all hosts that are alive
+// - if find a host H w/ at most max_chunks of this file,
+//   set max_chunks to nchunks(H) and return H
+// - if scan all hosts w/o finding one, increment max_chunks and start over
 //
 int VDA_FILE_AUX::choose_host() {
     int retval;
-    DB_HOST host;
+    char buf[256];
 
+    // terminology:
+    // "enum" is the result of one DB query (typically 100 hosts)
+    // "scan" is a set of enums covering the entire host table
+    //
+    while (1) {
+        if (!enum_active) {
+            sprintf(enum_query, "where %s and id > %d order by id limit 100",
+                host_alive_clause(), last_id
+            );
+            enum_active = true;
+            found_any_this_enum = false;
+            if (last_id == 0) {
+                found_this_scan = false;
+                found_any_this_scan = false;
+            }
+        }
+        retval = enum_host.enumerate(enum_query);
+        if (retval == ERR_DB_NOT_FOUND) {
+            // we've finished an enum
+            //
+            enum_active = false;
+            if (found_any_this_enum) {
+                // if we found anything in this enum, continue the scan
+                continue;
+            }
+
+            // we've finished a scan
+            //
+            last_id = 0;
+            if (!found_any_this_scan) {
+                log_messages.printf(MSG_CRITICAL,
+                    "choose_host(): no live hosts\n"
+                );
+                return 0;
+            }
+            if (!found_this_scan) {
+                max_chunks++;
+                log_messages.printf(MSG_NORMAL,
+                    "choose_host(): completed scan, new max_chunks %d\n",
+                    max_chunks
+                );
+                continue;
+            }
+        }
+        if (retval) {
+            // a DB error occurred
+            enum_active = false;
+            return 0;
+        }
+
+        found_any_this_enum = true;
+        found_any_this_scan = true;
+        last_id = enum_host.id;
+
+        // we have a live host.
+        // see whether it satisfies max_chunks
+        //
+        DB_VDA_CHUNK_HOST ch;
+        int count;
+        sprintf(buf, "where vda_file_id=%d and host_id=%d", id, enum_host.id);
+        retval = ch.count(count, buf);
+        if (retval) {
+            log_messages.printf(MSG_CRITICAL, "ch.count failed\n");
+            return 0;
+        }
+        if (count <= max_chunks) {
+            found_this_scan = true;
+            max_chunks = count;
+            return enum_host.id;
+        }
+    }
+
+#if 0
     // replenish cache if needed
     //
     if (!available_hosts.size()) {
@@ -659,5 +728,6 @@ int VDA_FILE_AUX::choose_host() {
 
     log_messages.printf(MSG_CRITICAL, "No hosts available\n");
     return 0;
+#endif
 }