From c9c9f2bae000e136f2696c229578a2c6b19a90e9 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 9 Apr 2013 12:19:00 -0700 Subject: [PATCH] - scheduler: code shuffle; new file sched_check.cpp contains functions that decide whether a job can be sent to a host --- sched/Makefile.am | 1 + sched/credit.cpp | 16 - sched/credit.h | 3 - sched/sched_array.cpp | 166 +--------- sched/sched_check.cpp | 526 +++++++++++++++++++++++++++++++ sched/sched_check.h | 48 +++ sched/sched_customize.cpp | 3 +- sched/sched_locality.cpp | 9 +- sched/sched_resend.cpp | 5 +- sched/sched_score.cpp | 9 +- sched/sched_score.h | 4 +- sched/sched_send.cpp | 636 +++++++++----------------------------- sched/sched_send.h | 25 +- 13 files changed, 745 insertions(+), 706 deletions(-) create mode 100644 sched/sched_check.cpp create mode 100644 sched/sched_check.h diff --git a/sched/Makefile.am b/sched/Makefile.am index 3a789fadd1..13ff710bdc 100644 --- a/sched/Makefile.am +++ b/sched/Makefile.am @@ -151,6 +151,7 @@ cgi_sources = \ plan_class_spec.cpp \ sched_array.cpp \ sched_assign.cpp \ + sched_check.cpp \ sched_customize.cpp \ sched_files.cpp \ sched_hr.cpp \ diff --git a/sched/credit.cpp b/sched/credit.cpp index 1ad51b02ac..7199a3374d 100644 --- a/sched/credit.cpp +++ b/sched/credit.cpp @@ -963,22 +963,6 @@ int assign_credit_set( return 0; } -// A job has: -// - errored out (scheduler) -// - timed out (transitioner) -// - failed validation (validator). -// Put (host/app_version) on "host scale probation", -// so that we won't use host scaling for a while. -// -void got_error(DB_HOST_APP_VERSION &hav) { - if (config.debug_credit) { - log_messages.printf(MSG_NORMAL, - "[credit] [HAV#%d] got error, setting error rate to %f\n", - hav.app_version_id, ERROR_RATE_INIT - ); - } -} - // carefully write any app_version records that have changed; // done at the end of every validator scan. // diff --git a/sched/credit.h b/sched/credit.h index 1af496cb43..16c4d54ae5 100644 --- a/sched/credit.h +++ b/sched/credit.h @@ -19,9 +19,6 @@ #include "boinc_db.h" -#define ERROR_RATE_INIT 0.1 - // the initial error rate of a host or app version - #define MIN_HOST_SAMPLES 10 // use host scaling only if have this many samples for host #define MIN_VERSION_SAMPLES 100 diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index ee7e375d96..2c6c570982 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -23,15 +23,17 @@ #include "config.h" -#include "sched_main.h" -#include "sched_types.h" -#include "sched_shmem.h" -#include "sched_hr.h" +#include "sched_check.h" #include "sched_config.h" -#include "sched_util.h" +#include "sched_hr.h" +#include "sched_main.h" #include "sched_msgs.h" #include "sched_send.h" +#include "sched_shmem.h" +#include "sched_types.h" +#include "sched_util.h" #include "sched_version.h" + #ifdef _USING_FCGI_ #include "boinc_fcgi.h" #endif @@ -201,159 +203,6 @@ static bool quick_check( return true; } -// Do checks that require DB access for whether we can send this job, -// and return: -// 0 if OK to send -// 1 if can't send to this host -// 2 if can't send to ANY host -// -int slow_check( - WU_RESULT& wu_result, // the job cache entry. - // We may refresh its hr_class and app_version_id fields. - APP* app, - BEST_APP_VERSION* bavp // the app version to be used -) { - int n, retval; - DB_RESULT result; - char buf[256]; - WORKUNIT& wu = wu_result.workunit; - - // Don't send if we've already sent a result of this WU to this user. - // - if (config.one_result_per_user_per_wu) { - sprintf(buf, - "where workunitid=%d and userid=%d", wu.id, g_reply->user.id - ); - retval = result.count(n, buf); - if (retval) { - log_messages.printf(MSG_CRITICAL, - "send_work: can't get result count (%s)\n", boincerror(retval) - ); - return 1; - } else { - if (n>0) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [USER#%d] already has %d result(s) for [WU#%d]\n", - g_reply->user.id, n, wu.id - ); - } - return 1; - } - } - } else if (config.one_result_per_host_per_wu) { - // Don't send if we've already sent a result of this WU to this host. - // We only have to check this if we don't send one result per user. - // - sprintf(buf, - "where workunitid=%d and hostid=%d", wu.id, g_reply->host.id - ); - retval = result.count(n, buf); - if (retval) { - log_messages.printf(MSG_CRITICAL, - "send_work: can't get result count (%s)\n", boincerror(retval) - ); - return 1; - } else { - if (n>0) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] already has %d result(s) for [WU#%d]\n", - g_reply->host.id, n, wu.id - ); - } - return 1; - } - } - } - - // Checks that require looking up the WU. - // Lump these together so we only do 1 lookup - // - if (app_hr_type(*app) || app->homogeneous_app_version) { - DB_WORKUNIT db_wu; - db_wu.id = wu.id; - int vals[3]; - retval = db_wu.get_field_ints( - "hr_class, app_version_id, error_mask", 3, vals - ); - if (retval) { - log_messages.printf(MSG_CRITICAL, - "can't get fields for [WU#%d]: %s\n", db_wu.id, boincerror(retval) - ); - return 1; - } - - // check wu.error_mask - // - if (vals[2] != 0) { - return 2; - } - - if (app_hr_type(*app)) { - wu.hr_class = vals[0]; - if (already_sent_to_different_hr_class(wu, *app)) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%d %s] is assigned to different HR class\n", - g_reply->host.id, wu.id, wu.name - ); - } - // Mark the workunit as infeasible. - // This ensures that jobs already assigned to an HR class - // are processed first. - // - wu_result.infeasible_count++; - return 1; - } - } - if (app->homogeneous_app_version) { - int wu_avid = vals[1]; - wu.app_version_id = wu_avid; - if (wu_avid && wu_avid != bavp->avp->id) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%d %s] is assigned to different app version\n", - g_reply->host.id, wu.id, wu.name - ); - } - wu_result.infeasible_count++; - return 1; - } - } - } - return 0; -} - -// Check for pathological conditions that mean -// result is not sendable at all. -// -bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) { - int retval = result.lookup_id(result.id); - if (retval) { - log_messages.printf(MSG_CRITICAL, - "[RESULT#%d] result.lookup_id() failed: %s\n", - result.id, boincerror(retval) - ); - return false; - } - if (result.server_state != RESULT_SERVER_STATE_UNSENT) { - log_messages.printf(MSG_NORMAL, - "[RESULT#%d] expected to be unsent; instead, state is %d\n", - result.id, result.server_state - ); - return false; - } - if (result.workunitid != wu.id) { - log_messages.printf(MSG_CRITICAL, - "[RESULT#%d] wrong WU ID: wanted %d, got %d\n", - result.id, wu.id, result.workunitid - ); - return false; - } - return true; -} - // Make a pass through the wu/results array, sending work. // The choice of jobs is limited by flags in g_wreq, as follows: // infeasible_only: @@ -502,7 +351,6 @@ recheck: // Send work by scanning the job array multiple times, // with different selection criteria on each scan. -// This has been superceded by send_work_matchmaker() // void send_work_old() { g_wreq->beta_only = false; diff --git a/sched/sched_check.cpp b/sched/sched_check.cpp new file mode 100644 index 0000000000..3a3f097056 --- /dev/null +++ b/sched/sched_check.cpp @@ -0,0 +1,526 @@ +// This file is part of BOINC. +// http://boinc.berkeley.edu +// Copyright (C) 2013 University of California +// +// BOINC is free software; you can redistribute it and/or modify it +// under the terms of the GNU Lesser General Public License +// as published by the Free Software Foundation, +// either version 3 of the License, or (at your option) any later version. +// +// BOINC is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// See the GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with BOINC. If not, see . + +// Check whether a job can be sent to a host + +#include "credit.h" +#include "sched_config.h" +#include "sched_hr.h" +#include "sched_main.h" +#include "sched_msgs.h" +#include "sched_send.h" + +#include "sched_check.h" + +#ifdef _USING_FCGI_ +#include "boinc_fcgi.h" +#endif + +const char* infeasible_string(int code) { + switch (code) { + case INFEASIBLE_MEM: return "Not enough memory"; + case INFEASIBLE_DISK: return "Not enough disk"; + case INFEASIBLE_CPU: return "CPU too slow"; + case INFEASIBLE_APP_SETTING: return "App not selected"; + case INFEASIBLE_WORKLOAD: return "Existing workload"; + case INFEASIBLE_DUP: return "Already in reply"; + case INFEASIBLE_HR: return "Homogeneous redundancy"; + case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low"; + } + return "Unknown"; +} + +// Return true if the user has set application preferences, +// and this job is not for a selected app +// +bool app_not_selected(WORKUNIT& wu) { + unsigned int i; + + if (g_wreq->preferred_apps.size() == 0) return false; + for (i=0; ipreferred_apps.size(); i++) { + if (wu.appid == g_wreq->preferred_apps[i].appid) { + g_wreq->preferred_apps[i].work_available = true; + return false; + } + } + return true; +} + +static inline int check_memory(WORKUNIT& wu) { + double diff = wu.rsc_memory_bound - g_wreq->usable_ram; + if (diff > 0) { + char message[256]; + sprintf(message, + "%s needs %0.2f MB RAM but only %0.2f MB is available for use.", + find_user_friendly_name(wu.appid), + wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA + ); + add_no_work_message(message); + + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [WU#%d %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n", + wu.id, wu.name, wu.rsc_memory_bound/MEGA, + g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA + ); + } + g_wreq->mem.set_insufficient(wu.rsc_memory_bound); + g_reply->set_delay(DELAY_NO_WORK_TEMP); + return INFEASIBLE_MEM; + } + return 0; +} + +static inline int check_disk(WORKUNIT& wu) { + double diff = wu.rsc_disk_bound - g_wreq->disk_available; + if (diff > 0) { + char message[256]; + sprintf(message, + "%s needs %0.2fMB more disk space. You currently have %0.2f MB available and it needs %0.2f MB.", + find_user_friendly_name(wu.appid), + diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA + ); + add_no_work_message(message); + + g_wreq->disk.set_insufficient(diff); + return INFEASIBLE_DISK; + } + return 0; +} + +static inline int check_bandwidth(WORKUNIT& wu) { + if (wu.rsc_bandwidth_bound == 0) return 0; + + // if n_bwdown is zero, the host has never downloaded anything, + // so skip this check + // + if (g_reply->host.n_bwdown == 0) return 0; + + double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown; + if (diff > 0) { + char message[256]; + sprintf(message, + "%s requires %0.2f KB/sec download bandwidth. Your computer has been measured at %0.2f KB/sec.", + find_user_friendly_name(wu.appid), + wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO + ); + add_no_work_message(message); + + g_wreq->bandwidth.set_insufficient(diff); + return INFEASIBLE_BANDWIDTH; + } + return 0; +} + +// Determine if the app is "hard", +// and we should send it only to high-end hosts. +// Currently this is specified by setting weight=-1; +// this is a kludge for SETI@home/Astropulse. +// +static inline bool hard_app(APP& app) { + return (app.weight == -1); +} + +static inline double get_estimated_delay(BEST_APP_VERSION& bav) { + int pt = bav.host_usage.proc_type; + if (pt == PROC_TYPE_CPU) { + return g_request->cpu_estimated_delay; + } + COPROC* cp = g_request->coprocs.type_to_coproc(pt); + return cp->estimated_delay; +} + + +// return the delay bound to use for this job/host. +// Actually, return two: optimistic (lower) and pessimistic (higher). +// If the deadline check with the optimistic bound fails, +// try the pessimistic bound. +// TODO: clean up this mess +// +static void get_delay_bound_range( + WORKUNIT& wu, + int res_server_state, int res_priority, double res_report_deadline, + BEST_APP_VERSION& bav, + double& opt, double& pess +) { + if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) { + double now = dtime(); + if (res_report_deadline < now) { + // if original deadline has passed, return zeros + // This will skip deadline check. + opt = pess = 0; + return; + } + opt = res_report_deadline - now; + pess = wu.delay_bound; + } else { + opt = pess = wu.delay_bound; + + // If the workunit needs reliable and is being sent to a reliable host, + // then shorten the delay bound by the percent specified + // + if (config.reliable_on_priority && res_priority >= config.reliable_on_priority && config.reliable_reduced_delay_bound > 0.01 + ) { + opt = wu.delay_bound*config.reliable_reduced_delay_bound; + double est_wallclock_duration = estimate_duration(wu, bav); + + // Check to see how reasonable this reduced time is. + // Increase it to twice the estimated delay bound + // if all the following apply: + // + // 1) Twice the estimate is longer then the reduced delay bound + // 2) Twice the estimate is less then the original delay bound + // 3) Twice the estimate is less then the twice the reduced delay bound + if (est_wallclock_duration*2 > opt + && est_wallclock_duration*2 < wu.delay_bound + && est_wallclock_duration*2 < wu.delay_bound*config.reliable_reduced_delay_bound*2 + ) { + opt = est_wallclock_duration*2; + } + } + } +} + +// return 0 if the job, with the given delay bound, +// will complete by its deadline, and won't cause other jobs to miss deadlines. +// +static inline int check_deadline( + WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav +) { + if (config.ignore_delay_bound) return 0; + + // skip delay check if host currently doesn't have any work + // and it's not a hard app. + // (i.e. everyone gets one result, no matter how slow they are) + // + if (get_estimated_delay(bav) == 0 && !hard_app(app)) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] est delay 0, skipping deadline check\n" + ); + } + return 0; + } + + // if it's a hard app, don't send it to a host with no credit + // + if (hard_app(app) && g_reply->host.total_credit == 0) { + return INFEASIBLE_CPU; + } + + // do EDF simulation if possible; else use cruder approximation + // + if (config.workload_sim && g_request->have_other_results_list) { + double est_dur = estimate_duration(wu, bav); + if (g_reply->wreq.edf_reject_test(est_dur, wu.delay_bound)) { + return INFEASIBLE_WORKLOAD; + } + IP_RESULT candidate("", wu.delay_bound, est_dur); + strcpy(candidate.name, wu.name); + if (check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) { + // it passed the feasibility test, + // but don't add it to the workload yet; + // wait until we commit to sending it + } else { + g_reply->wreq.edf_reject(est_dur, wu.delay_bound); + g_reply->wreq.speed.set_insufficient(0); + return INFEASIBLE_WORKLOAD; + } + } else { + double ewd = estimate_duration(wu, bav); + if (hard_app(app)) ewd *= 1.3; + double est_report_delay = get_estimated_delay(bav) + ewd; + double diff = est_report_delay - wu.delay_bound; + if (diff > 0) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [WU#%d] deadline miss %d > %d\n", + wu.id, (int)est_report_delay, wu.delay_bound + ); + } + g_reply->wreq.speed.set_insufficient(diff); + return INFEASIBLE_CPU; + } else { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [WU#%d] meets deadline: %.2f + %.2f < %d\n", + wu.id, get_estimated_delay(bav), ewd, wu.delay_bound + ); + } + } + } + return 0; +} + +// Fast checks (no DB access) to see if the job can be sent to the host. +// Reasons why not include: +// 1) the host doesn't have enough memory; +// 2) the host doesn't have enough disk space; +// 3) based on CPU speed, resource share and estimated delay, +// the host probably won't get the result done within the delay bound +// 4) app isn't in user's "approved apps" list +// +// If the job is feasible, return 0 and fill in wu.delay_bound +// with the delay bound we've decided to use. +// +int wu_is_infeasible_fast( + WORKUNIT& wu, + int res_server_state, int res_priority, double res_report_deadline, + APP& app, BEST_APP_VERSION& bav +) { + int retval; + + // project-specific check + // + if (wu_is_infeasible_custom(wu, app, bav)) { + return INFEASIBLE_CUSTOM; + } + + if (config.user_filter) { + if (wu.batch && wu.batch != g_reply->user.id) { + return INFEASIBLE_USER_FILTER; + } + } + + // homogeneous redundancy: can't send if app uses HR and + // 1) host is of unknown HR class, or + // 2) WU is already committed to different HR class + // + if (app_hr_type(app)) { + if (hr_unknown_class(g_reply->host, app_hr_type(app))) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] [WU#%d %s] host is of unknown class in HR type %d\n", + g_reply->host.id, wu.id, wu.name, app_hr_type(app) + ); + } + return INFEASIBLE_HR; + } + if (already_sent_to_different_hr_class(wu, app)) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] [WU#%d %s] failed quick HR check: WU is class %d, host is class %d\n", + g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app)) + ); + } + return INFEASIBLE_HR; + } + } + + // homogeneous app version + // + if (app.homogeneous_app_version) { + int avid = wu.app_version_id; + if (avid && bav.avp->id != avid) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] [WU#%d %s] failed homogeneous app version check: %d %d\n", + g_reply->host.id, wu.id, wu.name, avid, bav.avp->id + ); + } + return INFEASIBLE_HAV; + } + } + + if (config.one_result_per_user_per_wu || config.one_result_per_host_per_wu) { + if (wu_already_in_reply(wu)) { + return INFEASIBLE_DUP; + } + } + + retval = check_memory(wu); + if (retval) return retval; + retval = check_disk(wu); + if (retval) return retval; + retval = check_bandwidth(wu); + if (retval) return retval; + + if (app.non_cpu_intensive) { + return 0; + } + + // do deadline check last because EDF sim uses some CPU + // + double opt, pess; + get_delay_bound_range( + wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess + ); + wu.delay_bound = (int)opt; + if (opt == 0) { + // this is a resend; skip deadline check + return 0; + } + retval = check_deadline(wu, app, bav); + if (retval && (opt != pess)) { + wu.delay_bound = (int)pess; + retval = check_deadline(wu, app, bav); + } + return retval; +} + +// Do checks that require DB access for whether we can send this job, +// and return: +// 0 if OK to send +// 1 if can't send to this host +// 2 if can't send to ANY host +// +int slow_check( + WU_RESULT& wu_result, // the job cache entry. + // We may refresh its hr_class and app_version_id fields. + APP* app, + BEST_APP_VERSION* bavp // the app version to be used +) { + int n, retval; + DB_RESULT result; + char buf[256]; + WORKUNIT& wu = wu_result.workunit; + + // Don't send if we've already sent a result of this WU to this user. + // + if (config.one_result_per_user_per_wu) { + sprintf(buf, + "where workunitid=%d and userid=%d", wu.id, g_reply->user.id + ); + retval = result.count(n, buf); + if (retval) { + log_messages.printf(MSG_CRITICAL, + "send_work: can't get result count (%s)\n", boincerror(retval) + ); + return 1; + } else { + if (n>0) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [USER#%d] already has %d result(s) for [WU#%d]\n", + g_reply->user.id, n, wu.id + ); + } + return 1; + } + } + } else if (config.one_result_per_host_per_wu) { + // Don't send if we've already sent a result of this WU to this host. + // We only have to check this if we don't send one result per user. + // + sprintf(buf, + "where workunitid=%d and hostid=%d", wu.id, g_reply->host.id + ); + retval = result.count(n, buf); + if (retval) { + log_messages.printf(MSG_CRITICAL, + "send_work: can't get result count (%s)\n", boincerror(retval) + ); + return 1; + } else { + if (n>0) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] already has %d result(s) for [WU#%d]\n", + g_reply->host.id, n, wu.id + ); + } + return 1; + } + } + } + + // Checks that require looking up the WU. + // Lump these together so we only do 1 lookup + // + if (app_hr_type(*app) || app->homogeneous_app_version) { + DB_WORKUNIT db_wu; + db_wu.id = wu.id; + int vals[3]; + retval = db_wu.get_field_ints( + "hr_class, app_version_id, error_mask", 3, vals + ); + if (retval) { + log_messages.printf(MSG_CRITICAL, + "can't get fields for [WU#%d]: %s\n", db_wu.id, boincerror(retval) + ); + return 1; + } + + // check wu.error_mask + // + if (vals[2] != 0) { + return 2; + } + + if (app_hr_type(*app)) { + wu.hr_class = vals[0]; + if (already_sent_to_different_hr_class(wu, *app)) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] [WU#%d %s] is assigned to different HR class\n", + g_reply->host.id, wu.id, wu.name + ); + } + // Mark the workunit as infeasible. + // This ensures that jobs already assigned to an HR class + // are processed first. + // + wu_result.infeasible_count++; + return 1; + } + } + if (app->homogeneous_app_version) { + int wu_avid = vals[1]; + wu.app_version_id = wu_avid; + if (wu_avid && wu_avid != bavp->avp->id) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] [WU#%d %s] is assigned to different app version\n", + g_reply->host.id, wu.id, wu.name + ); + } + wu_result.infeasible_count++; + return 1; + } + } + } + return 0; +} + +// Check for pathological conditions that mean +// result is not sendable at all. +// +bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) { + int retval = result.lookup_id(result.id); + if (retval) { + log_messages.printf(MSG_CRITICAL, + "[RESULT#%d] result.lookup_id() failed: %s\n", + result.id, boincerror(retval) + ); + return false; + } + if (result.server_state != RESULT_SERVER_STATE_UNSENT) { + log_messages.printf(MSG_NORMAL, + "[RESULT#%d] expected to be unsent; instead, state is %d\n", + result.id, result.server_state + ); + return false; + } + if (result.workunitid != wu.id) { + log_messages.printf(MSG_CRITICAL, + "[RESULT#%d] wrong WU ID: wanted %d, got %d\n", + result.id, wu.id, result.workunitid + ); + return false; + } + return true; +} diff --git a/sched/sched_check.h b/sched/sched_check.h new file mode 100644 index 0000000000..d4c7414831 --- /dev/null +++ b/sched/sched_check.h @@ -0,0 +1,48 @@ +// This file is part of BOINC. +// http://boinc.berkeley.edu +// Copyright (C) 2013 University of California +// +// BOINC is free software; you can redistribute it and/or modify it +// under the terms of the GNU Lesser General Public License +// as published by the Free Software Foundation, +// either version 3 of the License, or (at your option) any later version. +// +// BOINC is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// See the GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with BOINC. If not, see . +// values returned by wu_is_infeasible() + +#ifndef _SCHED_CHECK_ +#define _SCHED_CHECK_ + +#include "sched_shmem.h" +#include "sched_types.h" + +#define INFEASIBLE_MEM 1 +#define INFEASIBLE_DISK 2 +#define INFEASIBLE_CPU 3 +#define INFEASIBLE_WORK_BUF 4 +#define INFEASIBLE_APP_SETTING 5 +#define INFEASIBLE_WORKLOAD 6 +#define INFEASIBLE_DUP 7 +#define INFEASIBLE_HR 8 +#define INFEASIBLE_BANDWIDTH 9 +#define INFEASIBLE_CUSTOM 10 +#define INFEASIBLE_USER_FILTER 11 +#define INFEASIBLE_HAV 12 + +extern const char* infeasible_string(int); + +extern int wu_is_infeasible_fast( + WORKUNIT&, + int res_server_state, int res_priority, double res_report_deadline, + APP&, BEST_APP_VERSION& +); +extern int slow_check(WU_RESULT&, APP*, BEST_APP_VERSION*); +extern bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu); + +#endif diff --git a/sched/sched_customize.cpp b/sched/sched_customize.cpp index 974c660900..f446283979 100644 --- a/sched/sched_customize.cpp +++ b/sched/sched_customize.cpp @@ -57,6 +57,7 @@ using std::string; #include "str_util.h" #include "util.h" +#include "sched_check.h" #include "sched_config.h" #include "sched_main.h" #include "sched_msgs.h" @@ -973,7 +974,7 @@ bool app_plan(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu) { return false; } -#if 1 +#ifndef NEW_SCORE // compute a "score" for sending this job to this host. // Return false if the WU is infeasible. // Otherwise set est_time and disk_usage. diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp index caa5f99001..8c99c94e90 100644 --- a/sched/sched_locality.cpp +++ b/sched/sched_locality.cpp @@ -35,15 +35,16 @@ #include "str_util.h" #include "filesys.h" -#include "sched_main.h" -#include "sched_types.h" +#include "sched_check.h" +#include "sched_config.h" #include "sched_locality.h" +#include "sched_main.h" #include "sched_msgs.h" -#include "sched_shmem.h" #include "sched_send.h" +#include "sched_shmem.h" +#include "sched_types.h" #include "sched_util.h" #include "sched_version.h" -#include "sched_config.h" #define VERBOSE_DEBUG diff --git a/sched/sched_resend.cpp b/sched/sched_resend.cpp index a2c288e592..36fec7991b 100644 --- a/sched/sched_resend.cpp +++ b/sched/sched_resend.cpp @@ -32,16 +32,17 @@ #include "error_numbers.h" -#include "sched_main.h" +#include "sched_check.h" #include "sched_config.h" #include "sched_customize.h" #include "sched_locality.h" +#include "sched_main.h" #include "sched_msgs.h" #include "sched_send.h" #include "sched_shmem.h" +#include "sched_types.h" #include "sched_util.h" #include "sched_version.h" -#include "sched_types.h" #include "sched_resend.h" diff --git a/sched/sched_score.cpp b/sched/sched_score.cpp index ea2a013287..674becaa55 100644 --- a/sched/sched_score.cpp +++ b/sched/sched_score.cpp @@ -23,18 +23,19 @@ #include "error_numbers.h" #include "util.h" -#include "sched_main.h" +#include "sched_check.h" #include "sched_config.h" #include "sched_hr.h" +#include "sched_main.h" #include "sched_msgs.h" -#include "sched_shmem.h" #include "sched_send.h" -#include "sched_version.h" +#include "sched_shmem.h" #include "sched_types.h" +#include "sched_version.h" #include "sched_score.h" -#if 0 +#ifdef NEW_SCORE bool JOB::get_score(WU_RESULT& wu_result) { score = 0; diff --git a/sched/sched_score.h b/sched/sched_score.h index d3906100d8..d569ea83e5 100644 --- a/sched/sched_score.h +++ b/sched/sched_score.h @@ -15,7 +15,9 @@ // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . -#if 0 +#define NEW_SCORE + +#ifdef NEW_SCORE struct JOB { int index; int result_id; diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 3af687f2ed..a60a9ced90 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -122,20 +122,6 @@ void add_job_files_to_host(WORKUNIT& wu) { } } -const char* infeasible_string(int code) { - switch (code) { - case INFEASIBLE_MEM: return "Not enough memory"; - case INFEASIBLE_DISK: return "Not enough disk"; - case INFEASIBLE_CPU: return "CPU too slow"; - case INFEASIBLE_APP_SETTING: return "App not selected"; - case INFEASIBLE_WORKLOAD: return "Existing workload"; - case INFEASIBLE_DUP: return "Already in reply"; - case INFEASIBLE_HR: return "Homogeneous redundancy"; - case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low"; - } - return "Unknown"; -} - const double MIN_REQ_SECS = 0; const double MAX_REQ_SECS = (28*SECONDS_IN_DAY); @@ -181,12 +167,155 @@ void WORK_REQ::get_job_limits() { config.max_jobs_in_progress.reset(effective_ncpus, effective_ngpus); } -static const char* find_user_friendly_name(int appid) { +const char* find_user_friendly_name(int appid) { APP* app = ssp->lookup_app(appid); if (app) return app->user_friendly_name; return "deprecated application"; } +static void update_quota(DB_HOST_APP_VERSION& hav) { + if (config.daily_result_quota) { + if (hav.max_jobs_per_day == 0) { + hav.max_jobs_per_day = config.daily_result_quota; + if (config.debug_quota) { + log_messages.printf(MSG_NORMAL, + "[quota] [HAV#%d] Initializing max_results_day to %d\n", + hav.app_version_id, + config.daily_result_quota + ); + } + } + } + + if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) { + if (config.debug_quota) { + log_messages.printf(MSG_NORMAL, + "[quota] [HOST#%d] [HAV#%d] Resetting n_jobs_today\n", + g_reply->host.id, hav.app_version_id + ); + } + hav.n_jobs_today = 0; + } +} + +// see how much RAM we can use on this machine +// +static inline void get_mem_sizes() { + g_wreq->ram = g_reply->host.m_nbytes; + if (g_wreq->ram <= 0) g_wreq->ram = DEFAULT_RAM_SIZE; + g_wreq->usable_ram = g_wreq->ram; + double busy_frac = g_request->global_prefs.ram_max_used_busy_frac; + double idle_frac = g_request->global_prefs.ram_max_used_idle_frac; + double frac = 1; + if (busy_frac>0 && idle_frac>0) { + frac = std::max(busy_frac, idle_frac); + if (frac > 1) frac = 1; + g_wreq->usable_ram *= frac; + } +} + +// Decide whether or not this app version is 'reliable' +// An app version is reliable if the following conditions are true +// (for those that are set in the config file) +// 1) The host average turnaround is less than a threshold +// 2) consecutive_valid is above a threshold +// 3) The host results per day is equal to the max value +// +void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) { + if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) { + + if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n", + hav.app_version_id, + hav.turnaround.get_avg()/3600, + config.reliable_max_avg_turnaround*multiplier/3600 + ); + } + hav.reliable = false; + return; + } + } + if (hav.consecutive_valid < CONS_VALID_RELIABLE) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [AV#%d] not reliable; cons valid %d < %d\n", + hav.app_version_id, + hav.consecutive_valid, CONS_VALID_RELIABLE + ); + } + hav.reliable = false; + return; + } + if (config.daily_result_quota) { + if (hav.max_jobs_per_day < config.daily_result_quota) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [AV#%d] not reliable; max_jobs_per_day %d>%d\n", + hav.app_version_id, + hav.max_jobs_per_day, + config.daily_result_quota + ); + } + hav.reliable = false; + return; + } + } + hav.reliable = true; + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] [HOST#%d] app version %d is reliable\n", + g_reply->host.id, hav.app_version_id + ); + } + g_wreq->has_reliable_version = true; +} + +// decide whether do unreplicated jobs with this app version +// +static void set_trust(DB_HOST_APP_VERSION& hav) { + hav.trusted = false; + if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) { + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] set_trust: cons valid %d < %d, don't use single replication\n", + hav.consecutive_valid, CONS_VALID_UNREPLICATED + ); + } + return; + } + double x = 1./hav.consecutive_valid; + if (drand() > x) hav.trusted = true; + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] set_trust: random choice for cons valid %d: %s\n", + hav.consecutive_valid, hav.trusted?"yes":"no" + ); + } +} + +static void get_reliability_and_trust() { + // Platforms other than Windows, Linux and Intel Macs need a + // larger set of computers to be marked reliable + // + double multiplier = 1.0; + if (strstr(g_reply->host.os_name,"Windows") + || strstr(g_reply->host.os_name,"Linux") + || (strstr(g_reply->host.os_name,"Darwin") + && !(strstr(g_reply->host.p_vendor,"Power Macintosh")) + )) { + multiplier = 1.0; + } else { + multiplier = 1.8; + } + + for (unsigned int i=0; ihost_app_versions.size(); i++) { + DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i]; + get_reliability_version(hav, multiplier); + set_trust(hav); + } +} // Compute the max additional disk usage we can impose on the host. // Depending on the client version, it can either send us @@ -373,112 +502,6 @@ static void get_prefs_info() { } } -// Decide whether or not this app version is 'reliable' -// An app version is reliable if the following conditions are true -// (for those that are set in the config file) -// 1) The host average turnaround is less than a threshold -// 2) consecutive_valid is above a threshold -// 3) The host results per day is equal to the max value -// -void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) { - if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) { - - if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n", - hav.app_version_id, - hav.turnaround.get_avg()/3600, - config.reliable_max_avg_turnaround*multiplier/3600 - ); - } - hav.reliable = false; - return; - } - } - if (hav.consecutive_valid < CONS_VALID_RELIABLE) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [AV#%d] not reliable; cons valid %d < %d\n", - hav.app_version_id, - hav.consecutive_valid, CONS_VALID_RELIABLE - ); - } - hav.reliable = false; - return; - } - if (config.daily_result_quota) { - if (hav.max_jobs_per_day < config.daily_result_quota) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [AV#%d] not reliable; max_jobs_per_day %d>%d\n", - hav.app_version_id, - hav.max_jobs_per_day, - config.daily_result_quota - ); - } - hav.reliable = false; - return; - } - } - hav.reliable = true; - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] app version %d is reliable\n", - g_reply->host.id, hav.app_version_id - ); - } - g_wreq->has_reliable_version = true; -} - -// decide whether do unreplicated jobs with this app version -// -static void set_trust(DB_HOST_APP_VERSION& hav) { - hav.trusted = false; - if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] set_trust: cons valid %d < %d, don't use single replication\n", - hav.consecutive_valid, CONS_VALID_UNREPLICATED - ); - } - return; - } - double x = 1./hav.consecutive_valid; - if (drand() > x) hav.trusted = true; - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] set_trust: random choice for cons valid %d: %s\n", - hav.consecutive_valid, hav.trusted?"yes":"no" - ); - } -} - -static void update_quota(DB_HOST_APP_VERSION& hav) { - if (config.daily_result_quota) { - if (hav.max_jobs_per_day == 0) { - hav.max_jobs_per_day = config.daily_result_quota; - if (config.debug_quota) { - log_messages.printf(MSG_NORMAL, - "[quota] [HAV#%d] Initializing max_results_day to %d\n", - hav.app_version_id, - config.daily_result_quota - ); - } - } - } - - if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) { - if (config.debug_quota) { - log_messages.printf(MSG_NORMAL, - "[quota] [HOST#%d] [HAV#%d] Resetting n_jobs_today\n", - g_reply->host.id, hav.app_version_id - ); - } - hav.n_jobs_today = 0; - } -} - void update_n_jobs_today() { for (unsigned int i=0; ihost_app_versions.size(); i++) { DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i]; @@ -486,144 +509,6 @@ void update_n_jobs_today() { } } -static void get_reliability_and_trust() { - // Platforms other than Windows, Linux and Intel Macs need a - // larger set of computers to be marked reliable - // - double multiplier = 1.0; - if (strstr(g_reply->host.os_name,"Windows") - || strstr(g_reply->host.os_name,"Linux") - || (strstr(g_reply->host.os_name,"Darwin") - && !(strstr(g_reply->host.p_vendor,"Power Macintosh")) - )) { - multiplier = 1.0; - } else { - multiplier = 1.8; - } - - for (unsigned int i=0; ihost_app_versions.size(); i++) { - DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i]; - get_reliability_version(hav, multiplier); - set_trust(hav); - } -} - -// Return true if the user has set application preferences, -// and this job is not for a selected app -// -bool app_not_selected(WORKUNIT& wu) { - unsigned int i; - - if (g_wreq->preferred_apps.size() == 0) return false; - for (i=0; ipreferred_apps.size(); i++) { - if (wu.appid == g_wreq->preferred_apps[i].appid) { - g_wreq->preferred_apps[i].work_available = true; - return false; - } - } - return true; -} - -// see how much RAM we can use on this machine -// -static inline void get_mem_sizes() { - g_wreq->ram = g_reply->host.m_nbytes; - if (g_wreq->ram <= 0) g_wreq->ram = DEFAULT_RAM_SIZE; - g_wreq->usable_ram = g_wreq->ram; - double busy_frac = g_request->global_prefs.ram_max_used_busy_frac; - double idle_frac = g_request->global_prefs.ram_max_used_idle_frac; - double frac = 1; - if (busy_frac>0 && idle_frac>0) { - frac = std::max(busy_frac, idle_frac); - if (frac > 1) frac = 1; - g_wreq->usable_ram *= frac; - } -} - -static inline int check_memory(WORKUNIT& wu) { - double diff = wu.rsc_memory_bound - g_wreq->usable_ram; - if (diff > 0) { - char message[256]; - sprintf(message, - "%s needs %0.2f MB RAM but only %0.2f MB is available for use.", - find_user_friendly_name(wu.appid), - wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA - ); - add_no_work_message(message); - - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [WU#%d %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n", - wu.id, wu.name, wu.rsc_memory_bound/MEGA, - g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA - ); - } - g_wreq->mem.set_insufficient(wu.rsc_memory_bound); - g_reply->set_delay(DELAY_NO_WORK_TEMP); - return INFEASIBLE_MEM; - } - return 0; -} - -static inline int check_disk(WORKUNIT& wu) { - double diff = wu.rsc_disk_bound - g_wreq->disk_available; - if (diff > 0) { - char message[256]; - sprintf(message, - "%s needs %0.2fMB more disk space. You currently have %0.2f MB available and it needs %0.2f MB.", - find_user_friendly_name(wu.appid), - diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA - ); - add_no_work_message(message); - - g_wreq->disk.set_insufficient(diff); - return INFEASIBLE_DISK; - } - return 0; -} - -static inline int check_bandwidth(WORKUNIT& wu) { - if (wu.rsc_bandwidth_bound == 0) return 0; - - // if n_bwdown is zero, the host has never downloaded anything, - // so skip this check - // - if (g_reply->host.n_bwdown == 0) return 0; - - double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown; - if (diff > 0) { - char message[256]; - sprintf(message, - "%s requires %0.2f KB/sec download bandwidth. Your computer has been measured at %0.2f KB/sec.", - find_user_friendly_name(wu.appid), - wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO - ); - add_no_work_message(message); - - g_wreq->bandwidth.set_insufficient(diff); - return INFEASIBLE_BANDWIDTH; - } - return 0; -} - -// Determine if the app is "hard", -// and we should send it only to high-end hosts. -// Currently this is specified by setting weight=-1; -// this is a kludge for SETI@home/Astropulse. -// -static inline bool hard_app(APP& app) { - return (app.weight == -1); -} - -static inline double get_estimated_delay(BEST_APP_VERSION& bav) { - int pt = bav.host_usage.proc_type; - if (pt == PROC_TYPE_CPU) { - return g_request->cpu_estimated_delay; - } - COPROC* cp = g_request->coprocs.type_to_coproc(pt); - return cp->estimated_delay; -} - static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) { int pt = bav.host_usage.proc_type; if (pt == PROC_TYPE_CPU) { @@ -634,239 +519,6 @@ static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) { } } -// return the delay bound to use for this job/host. -// Actually, return two: optimistic (lower) and pessimistic (higher). -// If the deadline check with the optimistic bound fails, -// try the pessimistic bound. -// TODO: clean up this mess -// -static void get_delay_bound_range( - WORKUNIT& wu, - int res_server_state, int res_priority, double res_report_deadline, - BEST_APP_VERSION& bav, - double& opt, double& pess -) { - if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) { - double now = dtime(); - if (res_report_deadline < now) { - // if original deadline has passed, return zeros - // This will skip deadline check. - opt = pess = 0; - return; - } - opt = res_report_deadline - now; - pess = wu.delay_bound; - } else { - opt = pess = wu.delay_bound; - - // If the workunit needs reliable and is being sent to a reliable host, - // then shorten the delay bound by the percent specified - // - if (config.reliable_on_priority && res_priority >= config.reliable_on_priority && config.reliable_reduced_delay_bound > 0.01 - ) { - opt = wu.delay_bound*config.reliable_reduced_delay_bound; - double est_wallclock_duration = estimate_duration(wu, bav); - - // Check to see how reasonable this reduced time is. - // Increase it to twice the estimated delay bound - // if all the following apply: - // - // 1) Twice the estimate is longer then the reduced delay bound - // 2) Twice the estimate is less then the original delay bound - // 3) Twice the estimate is less then the twice the reduced delay bound - if (est_wallclock_duration*2 > opt - && est_wallclock_duration*2 < wu.delay_bound - && est_wallclock_duration*2 < wu.delay_bound*config.reliable_reduced_delay_bound*2 - ) { - opt = est_wallclock_duration*2; - } - } - } -} - -// return 0 if the job, with the given delay bound, -// will complete by its deadline, and won't cause other jobs to miss deadlines. -// -static inline int check_deadline( - WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav -) { - if (config.ignore_delay_bound) return 0; - - // skip delay check if host currently doesn't have any work - // and it's not a hard app. - // (i.e. everyone gets one result, no matter how slow they are) - // - if (get_estimated_delay(bav) == 0 && !hard_app(app)) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] est delay 0, skipping deadline check\n" - ); - } - return 0; - } - - // if it's a hard app, don't send it to a host with no credit - // - if (hard_app(app) && g_reply->host.total_credit == 0) { - return INFEASIBLE_CPU; - } - - // do EDF simulation if possible; else use cruder approximation - // - if (config.workload_sim && g_request->have_other_results_list) { - double est_dur = estimate_duration(wu, bav); - if (g_reply->wreq.edf_reject_test(est_dur, wu.delay_bound)) { - return INFEASIBLE_WORKLOAD; - } - IP_RESULT candidate("", wu.delay_bound, est_dur); - strcpy(candidate.name, wu.name); - if (check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) { - // it passed the feasibility test, - // but don't add it to the workload yet; - // wait until we commit to sending it - } else { - g_reply->wreq.edf_reject(est_dur, wu.delay_bound); - g_reply->wreq.speed.set_insufficient(0); - return INFEASIBLE_WORKLOAD; - } - } else { - double ewd = estimate_duration(wu, bav); - if (hard_app(app)) ewd *= 1.3; - double est_report_delay = get_estimated_delay(bav) + ewd; - double diff = est_report_delay - wu.delay_bound; - if (diff > 0) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [WU#%d] deadline miss %d > %d\n", - wu.id, (int)est_report_delay, wu.delay_bound - ); - } - g_reply->wreq.speed.set_insufficient(diff); - return INFEASIBLE_CPU; - } else { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [WU#%d] meets deadline: %.2f + %.2f < %d\n", - wu.id, get_estimated_delay(bav), ewd, wu.delay_bound - ); - } - } - } - return 0; -} - -// Fast checks (no DB access) to see if the job can be sent to the host. -// Reasons why not include: -// 1) the host doesn't have enough memory; -// 2) the host doesn't have enough disk space; -// 3) based on CPU speed, resource share and estimated delay, -// the host probably won't get the result done within the delay bound -// 4) app isn't in user's "approved apps" list -// -// If the job is feasible, return 0 and fill in wu.delay_bound -// with the delay bound we've decided to use. -// -int wu_is_infeasible_fast( - WORKUNIT& wu, - int res_server_state, int res_priority, double res_report_deadline, - APP& app, BEST_APP_VERSION& bav -) { - int retval; - - // project-specific check - // - if (wu_is_infeasible_custom(wu, app, bav)) { - return INFEASIBLE_CUSTOM; - } - - if (config.user_filter) { - if (wu.batch && wu.batch != g_reply->user.id) { - return INFEASIBLE_USER_FILTER; - } - } - - // homogeneous redundancy: can't send if app uses HR and - // 1) host is of unknown HR class, or - // 2) WU is already committed to different HR class - // - if (app_hr_type(app)) { - if (hr_unknown_class(g_reply->host, app_hr_type(app))) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%d %s] host is of unknown class in HR type %d\n", - g_reply->host.id, wu.id, wu.name, app_hr_type(app) - ); - } - return INFEASIBLE_HR; - } - if (already_sent_to_different_hr_class(wu, app)) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%d %s] failed quick HR check: WU is class %d, host is class %d\n", - g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app)) - ); - } - return INFEASIBLE_HR; - } - } - - // homogeneous app version - // - if (app.homogeneous_app_version) { - int avid = wu.app_version_id; - if (avid && bav.avp->id != avid) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%d %s] failed homogeneous app version check: %d %d\n", - g_reply->host.id, wu.id, wu.name, avid, bav.avp->id - ); - } - return INFEASIBLE_HAV; - } - } - - if (config.one_result_per_user_per_wu || config.one_result_per_host_per_wu) { - if (wu_already_in_reply(wu)) { - return INFEASIBLE_DUP; - } - } - - retval = check_memory(wu); - if (retval) return retval; - retval = check_disk(wu); - if (retval) return retval; - retval = check_bandwidth(wu); - if (retval) return retval; - - if (app.non_cpu_intensive) { - return 0; - } - - // do deadline check last because EDF sim uses some CPU - // - double opt, pess; - get_delay_bound_range( - wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess - ); - wu.delay_bound = (int)opt; - if (opt == 0) { - // this is a resend; skip deadline check - return 0; - } - retval = check_deadline(wu, app, bav); - if (retval && (opt != pess)) { - wu.delay_bound = (int)pess; - retval = check_deadline(wu, app, bav); - } - return retval; -} - -// return true if the client has a sticky file used by this job -// -bool host_has_job_file(WORKUNIT&) { - return false; -} - // insert "text" right after "after" in the given buffer // static int insert_after(char* buffer, const char* after, const char* text) { diff --git a/sched/sched_send.h b/sched/sched_send.h index 72a357fcf4..fa797eaa34 100644 --- a/sched/sched_send.h +++ b/sched/sched_send.h @@ -42,29 +42,6 @@ inline bool is_anonymous(PLATFORM* platform) { extern bool app_core_compatible(WORK_REQ& wreq, APP_VERSION& av); -// values returned by wu_is_infeasible() -// -#define INFEASIBLE_MEM 1 -#define INFEASIBLE_DISK 2 -#define INFEASIBLE_CPU 3 -#define INFEASIBLE_WORK_BUF 4 -#define INFEASIBLE_APP_SETTING 5 -#define INFEASIBLE_WORKLOAD 6 -#define INFEASIBLE_DUP 7 -#define INFEASIBLE_HR 8 -#define INFEASIBLE_BANDWIDTH 9 -#define INFEASIBLE_CUSTOM 10 -#define INFEASIBLE_USER_FILTER 11 -#define INFEASIBLE_HAV 12 - -extern int wu_is_infeasible_fast( - WORKUNIT&, - int res_server_state, int res_priority, double res_report_deadline, - APP&, BEST_APP_VERSION& -); -extern int slow_check(WU_RESULT&, APP*, BEST_APP_VERSION*); -extern bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu); - extern double max_allowable_disk(); extern bool wu_already_in_reply(WORKUNIT& wu); @@ -75,7 +52,7 @@ extern int update_wu_on_send(WORKUNIT wu, time_t x, APP&, BEST_APP_VERSION&); extern void lock_sema(); extern void unlock_sema(); -extern const char* infeasible_string(int); +extern const char* find_user_friendly_name(int appid); extern bool app_not_selected(WORKUNIT&); extern bool work_needed(bool); extern void send_work_setup();