boinc/sched/sched_check.cpp

529 lines
18 KiB
C++

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2013 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
// Check whether a job can be sent to a host
#include "credit.h"
#include "sched_config.h"
#include "sched_hr.h"
#include "sched_main.h"
#include "sched_msgs.h"
#include "sched_send.h"
#include "sched_check.h"
#ifdef _USING_FCGI_
#include "boinc_fcgi.h"
#endif
const char* infeasible_string(int code) {
switch (code) {
case INFEASIBLE_MEM: return "Not enough memory";
case INFEASIBLE_DISK: return "Not enough disk";
case INFEASIBLE_CPU: return "CPU too slow";
case INFEASIBLE_APP_SETTING: return "App not selected";
case INFEASIBLE_WORKLOAD: return "Existing workload";
case INFEASIBLE_DUP: return "Already in reply";
case INFEASIBLE_HR: return "Homogeneous redundancy";
case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low";
}
return "Unknown";
}
// Tell whether the user's application preferences exclude this app.
//
// Returns false if no preferred apps are listed at all,
// or if appid is among them (in which case that entry is also
// flagged as having work available).
// Returns true only when preferences exist and appid is not listed.
//
bool app_not_selected(int appid) {
    const unsigned int n_prefs = g_wreq->preferred_apps.size();
    if (n_prefs == 0) {
        // no preferences: nothing is excluded
        return false;
    }
    for (unsigned int k = 0; k < n_prefs; k++) {
        if (g_wreq->preferred_apps[k].appid != appid) {
            continue;
        }
        g_wreq->preferred_apps[k].work_available = true;
        return false;
    }
    return true;
}
// Check whether the host has enough usable RAM to run the job.
//
// Returns 0 if wu.rsc_memory_bound does not exceed g_wreq->usable_ram.
// Otherwise: queues a "no work" message for the user,
// optionally logs the details, marks memory as insufficient,
// sets the reply delay to DELAY_NO_WORK_TEMP, and returns INFEASIBLE_MEM.
//
static inline int check_memory(WORKUNIT& wu) {
    double diff = wu.rsc_memory_bound - g_wreq->usable_ram;
    if (diff > 0) {
        char message[256];
        // Bounded write: neither the app name nor the %0.2f expansions
        // have a fixed maximum length, so truncate rather than overflow.
        //
        snprintf(message, sizeof(message),
            "%s needs %0.2f MB RAM but only %0.2f MB is available for use.",
            find_user_friendly_name(wu.appid),
            wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA
        );
        add_no_work_message(message);

        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%lu %s] needs %0.2fMB RAM; [HOST#%lu] has %0.2fMB, %0.2fMB usable\n",
                wu.id, wu.name, wu.rsc_memory_bound/MEGA,
                g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA
            );
        }
        // note: the full bound (not the shortfall) is what gets recorded here
        g_wreq->mem.set_insufficient(wu.rsc_memory_bound);
        g_reply->set_delay(DELAY_NO_WORK_TEMP);
        return INFEASIBLE_MEM;
    }
    return 0;
}
// Check whether the host has enough available disk space for the job.
//
// Returns 0 if wu.rsc_disk_bound does not exceed g_wreq->disk_available.
// Otherwise: queues a "no work" message for the user,
// records the shortfall (in bytes, as computed from the two fields),
// and returns INFEASIBLE_DISK.
//
static inline int check_disk(WORKUNIT& wu) {
    double diff = wu.rsc_disk_bound - g_wreq->disk_available;
    if (diff > 0) {
        char message[256];
        // Bounded write: neither the app name nor the %0.2f expansions
        // have a fixed maximum length, so truncate rather than overflow.
        //
        snprintf(message, sizeof(message),
            "%s needs %0.2fMB more disk space. You currently have %0.2f MB available and it needs %0.2f MB.",
            find_user_friendly_name(wu.appid),
            diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA
        );
        add_no_work_message(message);

        g_wreq->disk.set_insufficient(diff);
        return INFEASIBLE_DISK;
    }
    return 0;
}
// Check whether the host's measured download bandwidth
// meets the job's bandwidth bound.
//
// Returns 0 if the job has no bandwidth bound, if the host has no
// bandwidth measurement, or if the measurement meets the bound.
// Otherwise: queues a "no work" message for the user,
// records the shortfall, and returns INFEASIBLE_BANDWIDTH.
//
static inline int check_bandwidth(WORKUNIT& wu) {
    if (wu.rsc_bandwidth_bound == 0) return 0;

    // if n_bwdown is zero, the host has never downloaded anything,
    // so skip this check
    //
    if (g_reply->host.n_bwdown == 0) return 0;

    double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown;
    if (diff > 0) {
        char message[256];
        // Bounded write: neither the app name nor the %0.2f expansions
        // have a fixed maximum length, so truncate rather than overflow.
        //
        snprintf(message, sizeof(message),
            "%s requires %0.2f KB/sec download bandwidth. Your computer has been measured at %0.2f KB/sec.",
            find_user_friendly_name(wu.appid),
            wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO
        );
        add_no_work_message(message);

        g_wreq->bandwidth.set_insufficient(diff);
        return INFEASIBLE_BANDWIDTH;
    }
    return 0;
}
// Tell whether an app is "hard", i.e. one that should be sent
// only to high-end hosts.
// The marker for this is weight == -1
// (a kludge introduced for SETI@home/Astropulse).
//
static inline bool hard_app(APP& app) {
    const bool is_hard = (app.weight == -1);
    return is_hard;
}
// Return the estimated delay reported in the request for the
// processor type that the given app version uses:
// the CPU figure for PROC_TYPE_CPU,
// otherwise the figure of the matching coprocessor.
//
static inline double get_estimated_delay(BEST_APP_VERSION& bav) {
    const int proc_type = bav.host_usage.proc_type;
    if (proc_type != PROC_TYPE_CPU) {
        // NOTE(review): the lookup result is dereferenced unchecked;
        // confirm proc_type_to_coproc() cannot return NULL here.
        //
        COPROC* coproc = g_request->coprocs.proc_type_to_coproc(proc_type);
        return coproc->estimated_delay;
    }
    return g_request->cpu_estimated_delay;
}
// Compute the delay bound to use for this job/host pair.
// Two values are produced: "opt" (optimistic, lower)
// and "pess" (pessimistic, higher).
// The caller tries the deadline check with opt first
// and falls back to pess if that fails.
//
// In-progress results (resends):
//   - original deadline already passed: opt = pess = 0
//     (the caller treats this as "skip the deadline check")
//   - otherwise: opt = time left until the original deadline,
//     pess = the workunit's delay bound
// All other results:
//   - opt = pess = the workunit's delay bound, except that opt may be
//     shortened when the "reliable" options in config apply (see below)
//
// TODO: clean up this mess
//
static void get_delay_bound_range(
    WORKUNIT& wu,
    int res_server_state, int res_priority, double res_report_deadline,
    BEST_APP_VERSION& bav,
    double& opt, double& pess
) {
    if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
        double now = dtime();
        if (res_report_deadline < now) {
            // original deadline has passed: return zeros,
            // which makes the caller skip the deadline check
            //
            pess = 0;
            opt = pess;
            return;
        }
        opt = res_report_deadline - now;
        pess = wu.delay_bound;
        return;
    }

    pess = wu.delay_bound;
    opt = pess;

    // If the workunit needs reliable and is being sent to a reliable host,
    // then shorten the delay bound by the percent specified
    //
    const bool shorten_for_reliable =
        config.reliable_on_priority
        && res_priority >= config.reliable_on_priority
        && config.reliable_reduced_delay_bound > 0.01;
    if (!shorten_for_reliable) {
        return;
    }

    opt = wu.delay_bound*config.reliable_reduced_delay_bound;
    const double twice_estimate = estimate_duration(wu, bav)*2;

    // Sanity-check the reduced bound.
    // Raise it to twice the estimated duration if all of these hold:
    //
    // 1) twice the estimate is longer than the reduced delay bound
    // 2) twice the estimate is less than the original delay bound
    // 3) twice the estimate is less than twice the reduced delay bound
    //
    const bool exceeds_reduced = twice_estimate > opt;
    if (exceeds_reduced
        && twice_estimate < wu.delay_bound
        && twice_estimate < wu.delay_bound*config.reliable_reduced_delay_bound*2
    ) {
        opt = twice_estimate;
    }
}
// Deadline feasibility check, using wu.delay_bound as the delay bound.
//
// Returns 0 if the job is expected to complete by its deadline
// without causing other jobs to miss theirs; otherwise
// INFEASIBLE_CPU or INFEASIBLE_WORKLOAD.
// The check is bypassed (returns 0) when config.ignore_delay_bound is set,
// or when the estimated delay is zero and the app is not "hard".
//
static inline int check_deadline(
    WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav
) {
    if (config.ignore_delay_bound) {
        return 0;
    }

    const bool is_hard = hard_app(app);

    // A host with no queued work gets one result regardless of its speed,
    // unless the app is hard.
    //
    if (get_estimated_delay(bav) == 0 && !is_hard) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%lu] est delay 0, skipping deadline check\n",
                wu.id
            );
        }
        return 0;
    }

    // hard apps are never sent to hosts that have earned no credit
    //
    if (is_hard && g_reply->host.total_credit == 0) {
        return INFEASIBLE_CPU;
    }

    // Preferred path: EDF simulation, when it's enabled
    // and the request included the list of other results.
    //
    if (config.workload_sim && g_request->have_other_results_list) {
        double run_time = estimate_duration(wu, bav);
        if (g_reply->wreq.edf_reject_test(run_time, wu.delay_bound)) {
            return INFEASIBLE_WORKLOAD;
        }
        IP_RESULT candidate("", wu.delay_bound, run_time);
        safe_strcpy(candidate.name, wu.name);
        if (!check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) {
            g_reply->wreq.edf_reject(run_time, wu.delay_bound);
            g_reply->wreq.speed.set_insufficient(0);
            return INFEASIBLE_WORKLOAD;
        }
        // It passed the feasibility test.
        // It is not added to the workload here;
        // that happens once we commit to sending it.
        //
        return 0;
    }

    // Fallback: cruder approximation.
    // Hard apps get a 30% margin on the estimated duration.
    //
    double wall_time = estimate_duration(wu, bav);
    if (is_hard) {
        wall_time *= 1.3;
    }
    double report_delay = get_estimated_delay(bav) + wall_time;
    double overrun = report_delay - wu.delay_bound;
    if (overrun > 0) {
        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%lu] deadline miss %d > %d\n",
                wu.id, (int)report_delay, wu.delay_bound
            );
        }
        g_reply->wreq.speed.set_insufficient(overrun);
        return INFEASIBLE_CPU;
    }
    if (config.debug_send_job) {
        log_messages.printf(MSG_NORMAL,
            "[send_job] [WU#%lu] meets deadline: %.2f + %.2f < %d\n",
            wu.id, get_estimated_delay(bav), wall_time, wu.delay_bound
        );
    }
    return 0;
}
// Quick feasibility checks (no DB access) for sending a job to the host.
// The checks run in this order; the first failure determines the result:
// - project-specific check (INFEASIBLE_CUSTOM)
// - user filter, if enabled in config (INFEASIBLE_USER_FILTER)
// - homogeneous redundancy (INFEASIBLE_HR)
// - homogeneous app version (INFEASIBLE_HAV)
// - job already in the reply (INFEASIBLE_DUP)
// - memory, disk, download bandwidth
// - deadline, unless the app is non-CPU-intensive
//
// Returns 0 if the job is feasible, else one of the INFEASIBLE_* codes.
// Once the deadline stage is reached, wu.delay_bound is overwritten
// with the delay bound that was tried.
//
int wu_is_infeasible_fast(
    WORKUNIT& wu,
    int res_server_state, int res_priority, double res_report_deadline,
    APP& app, BEST_APP_VERSION& bav
) {
    // project-specific check
    //
    if (wu_is_infeasible_custom(wu, app, bav)) {
        return INFEASIBLE_CUSTOM;
    }

    // user filter: a nonzero batch must equal the requesting user's ID
    //
    if (config.user_filter && wu.batch && wu.batch != g_reply->user.id) {
        return INFEASIBLE_USER_FILTER;
    }

    // homogeneous redundancy: can't send if app uses HR and
    // 1) host is of unknown HR class, or
    // 2) WU is already committed to different HR class
    //
    if (app_hr_type(app)) {
        if (hr_unknown_class(g_reply->host, app_hr_type(app))) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [HOST#%lu] [WU#%lu %s] host is of unknown class in HR type %d\n",
                    g_reply->host.id, wu.id, wu.name, app_hr_type(app)
                );
            }
            return INFEASIBLE_HR;
        }
        if (already_sent_to_different_hr_class(wu, app)) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [HOST#%lu] [WU#%lu %s] failed quick HR check: WU is class %d, host is class %d\n",
                    g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app))
                );
            }
            return INFEASIBLE_HR;
        }
    }

    // homogeneous app version: if the WU is already tied to an app version,
    // it must be the one we'd use
    //
    if (app.homogeneous_app_version) {
        DB_ID_TYPE avid = wu.app_version_id;
        if (avid && bav.avp->id != avid) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [HOST#%lu] [WU#%lu %s] failed homogeneous app version check: %lu %lu\n",
                    g_reply->host.id, wu.id, wu.name, avid, bav.avp->id
                );
            }
            return INFEASIBLE_HAV;
        }
    }

    // one-result-per-user/host policies: don't put the same WU
    // into the reply twice
    //
    if ((config.one_result_per_user_per_wu || config.one_result_per_host_per_wu)
        && wu_already_in_reply(wu)
    ) {
        return INFEASIBLE_DUP;
    }

    // resource checks; stop at the first one that fails
    //
    int retval = check_memory(wu);
    if (!retval) retval = check_disk(wu);
    if (!retval) retval = check_bandwidth(wu);
    if (retval) {
        return retval;
    }

    // non-CPU-intensive apps are exempt from the deadline check
    //
    if (app.non_cpu_intensive) {
        return 0;
    }

    // do deadline check last because EDF sim uses some CPU
    //
    double opt, pess;
    get_delay_bound_range(
        wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess
    );
    wu.delay_bound = (int)opt;
    if (opt == 0) {
        // this is a resend; skip deadline check
        return 0;
    }

    // try the optimistic bound first;
    // retry with the pessimistic one only if it differs
    //
    retval = check_deadline(wu, app, bav);
    if (retval == 0 || opt == pess) {
        return retval;
    }
    wu.delay_bound = (int)pess;
    return check_deadline(wu, app, bav);
}
// Feasibility checks that need DB access.
// Returns:
// 0 if OK to send
// 1 if can't send to this host (also returned when a DB query fails)
// 2 if can't send to ANY host (the WU's error_mask is nonzero)
//
int slow_check(
    WU_RESULT& wu_result,   // the job cache entry.
                            // Its workunit's hr_class and app_version_id
                            // may be refreshed from the DB.
    APP* app,
    BEST_APP_VERSION* bavp  // the app version to be used
) {
    int retval;
    long n;
    DB_RESULT result;
    char buf[256];
    WORKUNIT& wu = wu_result.workunit;

    if (config.one_result_per_user_per_wu) {
        // Don't send if this user already has a result of this WU.
        //
        sprintf(buf,
            "where workunitid=%lu and userid=%lu", wu.id, g_reply->user.id
        );
        retval = result.count(n, buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "send_work: can't get result count (%s)\n", boincerror(retval)
            );
            return 1;
        }
        if (n>0) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [USER#%lu] already has %ld result(s) for [WU#%lu]\n",
                    g_reply->user.id, n, wu.id
                );
            }
            return 1;
        }
    } else if (config.one_result_per_host_per_wu) {
        // Don't send if this host already has a result of this WU.
        // Only needed when the per-user policy is off.
        //
        sprintf(buf,
            "where workunitid=%lu and hostid=%lu", wu.id, g_reply->host.id
        );
        retval = result.count(n, buf);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "send_work: can't get result count (%s)\n", boincerror(retval)
            );
            return 1;
        }
        if (n>0) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [HOST#%lu] already has %ld result(s) for [WU#%lu]\n",
                    g_reply->host.id, n, wu.id
                );
            }
            return 1;
        }
    }

    // The remaining checks need fields of the WU record,
    // and apply only if the app uses HR or homogeneous app version.
    //
    if (!app_hr_type(*app) && !app->homogeneous_app_version) {
        return 0;
    }

    // Fetch everything in a single lookup.
    //
    DB_WORKUNIT db_wu;
    db_wu.id = wu.id;
    int vals[3];
    retval = db_wu.get_field_ints(
        "hr_class, app_version_id, error_mask", 3, vals
    );
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "can't get fields for [WU#%lu]: %s\n", db_wu.id, boincerror(retval)
        );
        return 1;
    }
    // values in the order they were requested
    const int db_hr_class = vals[0];
    const int db_avid = vals[1];
    const int db_error_mask = vals[2];

    // a WU with errors can't be sent to anyone
    //
    if (db_error_mask != 0) {
        return 2;
    }

    if (app_hr_type(*app)) {
        wu.hr_class = db_hr_class;
        if (already_sent_to_different_hr_class(wu, *app)) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [HOST#%lu] [WU#%lu %s] is assigned to different HR class\n",
                    g_reply->host.id, wu.id, wu.name
                );
            }
            // Mark the workunit as infeasible.
            // This ensures that jobs already assigned to an HR class
            // are processed first.
            //
            wu_result.infeasible_count++;
            return 1;
        }
    }

    if (app->homogeneous_app_version) {
        wu.app_version_id = db_avid;
        if (db_avid && db_avid != bavp->avp->id) {
            if (config.debug_send_job) {
                log_messages.printf(MSG_NORMAL,
                    "[send_job] [HOST#%lu] [WU#%lu %s] is assigned to different app version\n",
                    g_reply->host.id, wu.id, wu.name
                );
            }
            wu_result.infeasible_count++;
            return 1;
        }
    }
    return 0;
}
// Re-read the result from the DB and make sure it can still be sent.
// Returns false (after logging) if any of these holds:
// - the lookup fails
// - the result is no longer in the UNSENT server state
// - the result belongs to a workunit other than wu
// Otherwise returns true.
//
bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) {
    const int lookup_err = result.lookup_id(result.id);
    if (lookup_err) {
        log_messages.printf(MSG_CRITICAL,
            "[RESULT#%lu] result.lookup_id() failed: %s\n",
            result.id, boincerror(lookup_err)
        );
        return false;
    }

    // logged at normal (not critical) level, unlike the other two cases
    //
    const bool is_unsent = (result.server_state == RESULT_SERVER_STATE_UNSENT);
    if (!is_unsent) {
        log_messages.printf(MSG_NORMAL,
            "[RESULT#%lu] expected to be unsent; instead, state is %d\n",
            result.id, result.server_state
        );
        return false;
    }

    const bool wu_matches = (result.workunitid == wu.id);
    if (!wu_matches) {
        log_messages.printf(MSG_CRITICAL,
            "[RESULT#%lu] wrong WU ID: wanted %lu, got %lu\n",
            result.id, wu.id, result.workunitid
        );
        return false;
    }
    return true;
}