diff --git a/checkin_notes b/checkin_notes index 4e1ae5c9b7..8b1d6a56cf 100755 --- a/checkin_notes +++ b/checkin_notes @@ -11331,3 +11331,42 @@ Charlie 21 Oct 2006 mac_build/ boinc.xcodeproj/ project.pbxproj + +David 21 Oct 2006 + - Server: apps can be labeled as "beta", + and jobs for them will be sent only to users + with "<allow_beta_work>1</allow_beta_work>" in their project preferences. + This lets you do application testing without + creating a separate project, + which has advantages both for you (1 project instead of 2) + and for testers (their testing gives them points + in your main project) + - Scheduler: refined the criteria for "reliable" hosts + to which old results are sent, + and added a param for reducing delay bound + when sending results to reliable hosts + - add -sleep_interval option to assimilator + - move -reliable flag from feeder + - add to sched config: + reliable_min_avg_credit + reliable_min_avg_turnaround + reliable_time + reliable_reduced_delay_bound + + (from Kevin Reed) + + NOTES: updating your server software to this version or later requires + a small database change (see html/ops/db_update.php) + + db/ + boinc_db.C,h + lib/ + parse.C + sched/ + assimilator.C + feeder.C + sched_array.C + sched_config.C,h + sched_locality.C + sched_send.C,h + server_types.h diff --git a/db/boinc_db.C b/db/boinc_db.C index 9b8e817493..24e4da3b83 100644 --- a/db/boinc_db.C +++ b/db/boinc_db.C @@ -129,9 +129,9 @@ void DB_PLATFORM::db_parse(MYSQL_ROW &r) { void DB_APP::db_print(char* buf){ sprintf(buf, "create_time=%d, name='%s', min_version=%d, " - "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f", + "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f, beta=%d", create_time, name, min_version, - deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight + deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight, beta ); } @@ -146,6 +146,7 @@ void DB_APP::db_parse(MYSQL_ROW &r) { strcpy2(user_friendly_name, 
r[i++]); homogeneous_redundancy = atoi(r[i++]); weight = atof(r[i++]); + beta = atoi(r[i++]); } void DB_APP_VERSION::db_print(char* buf){ diff --git a/db/boinc_db.h b/db/boinc_db.h index dc872b7763..2855fb84e6 100755 --- a/db/boinc_db.h +++ b/db/boinc_db.h @@ -69,6 +69,7 @@ struct APP { bool homogeneous_redundancy; double weight; // tells the feeder what fraction of results // should come from this app + int beta; int write(FILE*); void clear(); diff --git a/lib/parse.C b/lib/parse.C index 95c57a3c38..fb88a6d692 100644 --- a/lib/parse.C +++ b/lib/parse.C @@ -286,14 +286,19 @@ void extract_venue(char* in, char* venue_name, char* out) { q = strstr(out, "\n"); - if (!q) break; - strcpy(p, q+strlen("\n")); - } + q = in; + strcpy(out, ""); + while (1) { + p = strstr(q, ""); + if (!q) break; + q += strlen(""); + } } } diff --git a/sched/assimilator.C b/sched/assimilator.C index 0b36b75f53..7c47b4f554 100644 --- a/sched/assimilator.C +++ b/sched/assimilator.C @@ -51,6 +51,8 @@ int wu_id_modulus=0, wu_id_remainder=0; #define SLEEP_INTERVAL 10 +int sleep_interval = SLEEP_INTERVAL; + int one_pass_N_WU=0; // assimilate all WUs that need it @@ -154,6 +156,8 @@ int main(int argc, char** argv) { } else if (!strcmp(argv[i], "-one_pass_N_WU")) { one_pass_N_WU = atoi(argv[++i]); one_pass = true; + } else if (!strcmp(argv[i], "-sleep_interval")) { + sleep_interval = atoi(argv[++i]); } else if (!strcmp(argv[i], "-one_pass")) { one_pass = true; } else if (!strcmp(argv[i], "-d")) { @@ -214,7 +218,7 @@ int main(int argc, char** argv) { while (1) { if (!do_pass(app)) { if (one_pass) break; - sleep(SLEEP_INTERVAL); + sleep(sleep_interval); } } } diff --git a/sched/feeder.C b/sched/feeder.C index 5d727b5561..60ef070eac 100644 --- a/sched/feeder.C +++ b/sched/feeder.C @@ -32,7 +32,6 @@ // [ -purge_stale x ] remove work items from the shared memory segment // that have been there for longer then x minutes // but haven't been assigned -// [ -reliable x ] flag results for workunits older then 
x days as "need_reliable" // // Creates a shared memory segment containing DB info, // including the work array (results/workunits to send). @@ -143,7 +142,6 @@ char select_clause[256]; double sleep_interval = DEFAULT_SLEEP_INTERVAL; bool all_apps = false; int purge_stale_time = 0; -int reliable_time = 0; void cleanup_shmem() { ssp->ready = false; @@ -406,8 +404,8 @@ static void scan_work_array( // workunits older then the specificed time as needing a reliable // host wu_result.need_reliable = 0; - if (reliable_time) { - if ((wu_result.workunit.create_time + reliable_time*86400) <= time(0)) { + if (config.reliable_time) { + if ((wu_result.workunit.create_time + config.reliable_time) <= time(0)) { wu_result.need_reliable = true; } } @@ -526,13 +524,11 @@ int main(int argc, char** argv) { } else if (!strcmp(argv[i], "-allapps")) { all_apps = true; } else if (!strcmp(argv[i], "-priority_order")) { - order_clause = "order by priority desc "; + order_clause = "order by result.priority desc "; } else if (!strcmp(argv[i], "-priority_order_create_time")) { - order_clause = "order by priority desc, workunit.create_time "; + order_clause = "order by result.priority desc, workunit.create_time "; } else if (!strcmp(argv[i], "-purge_stale")) { purge_stale_time = atoi(argv[++i])*60; - } else if (!strcmp(argv[i], "-reliable")) { - reliable_time = atoi(argv[++i]); } else if (!strcmp(argv[i], "-mod")) { int n = atoi(argv[++i]); int j = atoi(argv[++i]); diff --git a/sched/sched_array.C b/sched/sched_array.C index dbb57090c6..5e862698bd 100644 --- a/sched/sched_array.C +++ b/sched/sched_array.C @@ -79,24 +79,44 @@ void scan_work_array( continue; } - // If this is a reliable host and we are checking for results that - // need a reliable host, then continue if the result is a normal result + // If we are looking for beta results and result is not a beta result + // then move on // - if (reply.wreq.reliable_only && (!wu_result.need_reliable)) { - continue; + APP* app = 
ss.lookup_app(wu_result.workunit.appid); + if (app == NULL) continue; // this should never happen + if (reply.wreq.beta_only) { + if (!app->beta) { + continue; + } + log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, + "[HOST#%d] beta work found. Result id %d \n", + reply.host.id + ); + } else { + if (app->beta) { + continue; + } } + // If this is a reliable host and we are checking for results that + // need a reliable host, then continue if the result is a normal result + // skip if the app is beta (beta apps don't use the reliable mechanism) + // + if (!app->beta) { + if (reply.wreq.reliable_only && (!wu_result.need_reliable)) { + continue; + } else if (!reply.wreq.reliable_only && wu_result.need_reliable) { + continue; + } + } + + // If we are looking for infeasible results and the result is not infeasiable + // then move on + // if (reply.wreq.infeasible_only && (wu_result.infeasible_count==0)) { continue; } - // Never send a result that needs a reliable host to one that - // has not earned credit - // - if (wu_result.need_reliable && reply.host.total_credit == 0) { - continue; - } - // don't send if we're already sending a result for same WU // if (config.one_result_per_user_per_wu) { @@ -108,7 +128,7 @@ void scan_work_array( // don't send if host can't handle it // wu = wu_result.workunit; - if (wu_is_infeasible(wu, sreq, reply)) { + if (wu_is_infeasible(wu, sreq, reply, ss)) { log_messages.printf( SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n", reply.host.id, wu.id, wu.name diff --git a/sched/sched_config.C b/sched/sched_config.C index f440463c9b..9af81e92e4 100644 --- a/sched/sched_config.C +++ b/sched/sched_config.C @@ -120,8 +120,12 @@ int SCHED_CONFIG::parse(FILE* f) { else if (xp.parse_bool(tag, "dont_delete_batches", dont_delete_batches)) continue; else if (xp.parse_int(tag, "sched_debug_level", sched_debug_level)) continue; else if (xp.parse_int(tag, "fuh_debug_level", fuh_debug_level)) continue; - - // tags the scheduler doesn't care 
about + else if (xp.parse_int(tag, "reliable_min_avg_credit", reliable_min_avg_credit)) continue; + else if (xp.parse_int(tag, "reliable_min_avg_turnaround", reliable_min_avg_turnaround)) continue; + else if (xp.parse_int(tag, "reliable_time", reliable_time)) continue; + else if (xp.parse_double(tag, "reliable_reduced_delay_bound", reliable_reduced_delay_bound)) continue; + + // some tags that scheduler doesn't care about // else if (xp.parse_str(tag, "cgi_url", temp, sizeof(temp))) continue; else if (xp.parse_str(tag, "log_dir", temp, sizeof(temp))) continue; diff --git a/sched/sched_config.h b/sched/sched_config.h index 6bb49076dd..c0e5ed4cd1 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -76,6 +76,10 @@ public: bool dont_delete_batches; int sched_debug_level; int fuh_debug_level; + int reliable_time; // age of workunit before requiring reliable + int reliable_min_avg_credit; + int reliable_min_avg_turnaround; + double reliable_reduced_delay_bound; int parse(FILE*); int parse_file(const char* dir="."); diff --git a/sched/sched_locality.C b/sched/sched_locality.C index a7f96d16e5..ab5281a736 100644 --- a/sched/sched_locality.C +++ b/sched/sched_locality.C @@ -287,7 +287,7 @@ static int possibly_send_result( // why the WU is not feasible. These are defined in sched_send.h. // INFEASIBLE_MEM, INFEASIBLE_DISK, INFEASIBLE_CPU. // - if (wu_is_infeasible(wu, sreq, reply)) { + if (wu_is_infeasible(wu, sreq, reply, ss)) { return ERR_INSUFFICIENT_RESOURCE; } diff --git a/sched/sched_send.C b/sched/sched_send.C index 9df3f40864..843e871a28 100644 --- a/sched/sched_send.C +++ b/sched/sched_send.C @@ -248,25 +248,76 @@ static double estimate_wallclock_duration( return ewd; } -// scan user's project prefs for elements of the form N, -// indicating the apps they want to run. +// Find or compute various details for the host. 
+// These parameters affect how work is sent to the host // -static int find_allowed_apps( - SCHEDULER_REPLY& reply, std::vector *app_ids -) { +static int get_host_details(SCHEDULER_REPLY& reply) { char buf[8096]; std::string str; extract_venue(reply.user.project_prefs, reply.host.venue, buf); str = buf; unsigned int pos = 0; int temp_int; + USER_APP_DTL* app_dtl; + + // scan user's project prefs for elements of the form N, + // indicating the apps they want to run. + // while (parse_int(str.substr(pos,str.length()-pos).c_str(), "", temp_int)) { - (*app_ids).push_back(temp_int); + app_dtl = new USER_APP_DTL(); + app_dtl->appid = temp_int; + app_dtl->work_available=0; + reply.wreq.host_dtls.preferred_apps.push_back(app_dtl); + pos = str.find("", pos) + 1; } + temp_int = parse_int(buf,"",temp_int); + reply.wreq.host_dtls.allow_beta_work = temp_int; + + // Decide whether or not this computer is a 'reliable' computer + // + double expavg_credit = reply.host.expavg_credit; + double expavg_time = reply.host.expavg_time; + double avg_turnaround = reply.host.avg_turnaround; + update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time); + if (strstr(reply.host.os_name,"Windows") || strstr(reply.host.os_name,"Linux") + ) { + if (((expavg_credit/reply.host.p_ncpus) > config.reliable_min_avg_credit || config.reliable_min_avg_credit == 0) + && (avg_turnaround < config.reliable_min_avg_turnaround || config.reliable_min_avg_turnaround == 0) + ){ + reply.wreq.host_dtls.reliable = true; + log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL, + "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n", + reply.host.id, reply.host.os_name, expavg_credit, + avg_turnaround/3600 + ); + } + } else { + if (((expavg_credit/reply.host.p_ncpus) > config.reliable_min_avg_credit*.75 || config.reliable_min_avg_credit == 0) + && (avg_turnaround < config.reliable_min_avg_turnaround*1.25 || config.reliable_min_avg_turnaround == 0) + ){ + reply.wreq.host_dtls.reliable = 
true; + log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL, + "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n", + reply.host.id, reply.host.os_name, expavg_credit, + avg_turnaround/3600 + ); + } + } return 0; } +int find_preferred_app_index(SCHEDULER_REPLY& reply, int appid) { + int result = -1; + for (int i=0; iappid == appid ) { + result = i; + break; + } + } + return result; +} + // if the WU can't be executed on the host, return a bitmap of reasons why. // Reasons include: // 1) the host doesn't have enough memory; @@ -279,7 +330,8 @@ static int find_allowed_apps( // In particular it doesn't enforce the one-result-per-user-per-wu rule // int wu_is_infeasible( - WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply + WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply, + SCHED_SHMEM& ss ) { int reason = 0; unsigned int i; @@ -288,17 +340,25 @@ int wu_is_infeasible( // If they have then only send work for the allowed applications // TODO: call find_allowed_apps() only once, not once for each WU!! 
// - std::vector app_ids; - find_allowed_apps(reply, &app_ids); - if (app_ids.size() > 0) { - bool app_allowed = false; - for(i=0; i 0) { + for (i=0; iappid, + reply.wreq.host_dtls.preferred_apps[i]->work_available + ); + if (wu.appid==reply.wreq.host_dtls.preferred_apps[i]->appid) { + app_allowed = true; + reply.wreq.host_dtls.preferred_apps[i]->work_available=1; break; } } - if (!app_allowed) { + + // Only mark infeasible if we are looking at user preferred apps only + // + if (!app_allowed && !reply.wreq.beta_only) { reply.wreq.no_allowed_apps_available = true; reason |= INFEASIBLE_APP_SETTING; log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, @@ -661,10 +721,21 @@ int add_result_to_reply( result.sent_time = time(0); int old_server_state = result.server_state; + // If the workunit needs reliable and is being sent to a reliable host, + // then shorten the delay bound by the percent specified + // + int delay_bound = wu.delay_bound; + if (config.reliable_time && reply.wreq.host_dtls.reliable && config.reliable_reduced_delay_bound > 0.01) { + if ((wu.create_time + config.reliable_time) <= time(0)) { + delay_bound = (int) (delay_bound * config.reliable_reduced_delay_bound); + } + } + + if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) { // We are sending this result for the first time // - result.report_deadline = result.sent_time + wu.delay_bound; + result.report_deadline = result.sent_time + delay_bound; result.server_state = RESULT_SERVER_STATE_IN_PROGRESS; } else { // Result was ALREADY sent to this host but never arrived. 
@@ -677,8 +748,8 @@ int add_result_to_reply( if (result.report_deadline < result.sent_time) { result.report_deadline = result.sent_time + 10; } - if (result.report_deadline > result.sent_time + wu.delay_bound) { - result.report_deadline = result.sent_time + wu.delay_bound; + if (result.report_deadline > result.sent_time + delay_bound) { + result.report_deadline = result.sent_time + delay_bound; } log_messages.printf( @@ -767,6 +838,8 @@ int send_work( reply.wreq.core_client_version = sreq.core_client_major_version*100 + sreq.core_client_minor_version; reply.wreq.nresults = 0; + get_host_details(reply); // parse project prefs for app details + reply.wreq.beta_only = false; log_messages.printf( SCHED_MSG_LOG::MSG_NORMAL, @@ -800,16 +873,27 @@ int send_work( } else { // give top priority to results that require a 'reliable host' // - double expavg_credit = reply.host.expavg_credit; - double expavg_time = reply.host.expavg_time; - update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time); - if ((expavg_credit/reply.host.p_ncpus) > 70) { + if (reply.wreq.host_dtls.reliable) { reply.wreq.reliable_only = true; reply.wreq.infeasible_only = false; - log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] is reliable\n", reply.host.id); scan_work_array(sreq, reply, platform, ss); } reply.wreq.reliable_only = false; + + // give 2nd priority to results that are for a beta app + // (projects should load beta work with care + // otherwise your users won't get production work done! + // + if (reply.wreq.host_dtls.allow_beta_work) { + reply.wreq.beta_only=true; + log_messages.printf( + SCHED_MSG_LOG::MSG_DEBUG, + "[HOST#%d] will accept beta work. 
Scanning for beta work.\n", + reply.host.id + ); + scan_work_array(sreq, reply, platform, ss); + } + reply.wreq.beta_only=false; // give next priority to results that were infeasible for some other host // @@ -935,6 +1019,14 @@ int send_work( reply.set_delay(delay_time); } } + + // free memory + // + for (int i=0; i preferred_apps; +}; + // summary of a client's request for work, and our response to it // struct WORK_REQ { bool infeasible_only; bool reliable_only; + bool beta_only; + HOST_DETAILS host_dtls; double seconds_to_fill; // in "normalized CPU seconds" (see doc/work_req.php) double disk_available;