// Source: mirror of https://github.com/BOINC/boinc.git
// (C++, 528 lines, 18 KiB)
// This file is part of BOINC.
|
|
// http://boinc.berkeley.edu
|
|
// Copyright (C) 2013 University of California
|
|
//
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
// under the terms of the GNU Lesser General Public License
|
|
// as published by the Free Software Foundation,
|
|
// either version 3 of the License, or (at your option) any later version.
|
|
//
|
|
// BOINC is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
// See the GNU Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
// Check whether a job can be sent to a host
|
|
|
|
#include "credit.h"
|
|
#include "sched_config.h"
|
|
#include "sched_hr.h"
|
|
#include "sched_main.h"
|
|
#include "sched_msgs.h"
|
|
#include "sched_send.h"
|
|
|
|
#include "sched_check.h"
|
|
|
|
#ifdef _USING_FCGI_
|
|
#include "boinc_fcgi.h"
|
|
#endif
|
|
|
|
const char* infeasible_string(int code) {
|
|
switch (code) {
|
|
case INFEASIBLE_MEM: return "Not enough memory";
|
|
case INFEASIBLE_DISK: return "Not enough disk";
|
|
case INFEASIBLE_CPU: return "CPU too slow";
|
|
case INFEASIBLE_APP_SETTING: return "App not selected";
|
|
case INFEASIBLE_WORKLOAD: return "Existing workload";
|
|
case INFEASIBLE_DUP: return "Already in reply";
|
|
case INFEASIBLE_HR: return "Homogeneous redundancy";
|
|
case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low";
|
|
}
|
|
return "Unknown";
|
|
}
|
|
|
|
// Return true if the user has set application preferences,
|
|
// and excluded this app
|
|
//
|
|
bool app_not_selected(int appid) {
|
|
unsigned int i;
|
|
|
|
if (g_wreq->preferred_apps.size() == 0) return false;
|
|
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
|
if (appid == g_wreq->preferred_apps[i].appid) {
|
|
g_wreq->preferred_apps[i].work_available = true;
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Check the job's memory bound against the host's usable RAM.
// Returns 0 if wu.rsc_memory_bound does not exceed g_wreq->usable_ram.
// Otherwise:
// - queues a user-visible "no work" message,
// - optionally logs the details (config.debug_send_job),
// - passes the job's memory bound to g_wreq->mem.set_insufficient(),
// - sets the reply delay to DELAY_NO_WORK_TEMP,
// and returns INFEASIBLE_MEM.
//
static inline int check_memory(WORKUNIT& wu) {
    double diff = wu.rsc_memory_bound - g_wreq->usable_ram;
    if (diff > 0) {
        char message[256];
        // bounded write: the length of the app's user-friendly name
        // is not limited here, so truncate rather than overrun the buffer
        //
        snprintf(message, sizeof(message),
            "%s needs %0.2f MB RAM but only %0.2f MB is available for use.",
            find_user_friendly_name(wu.appid),
            wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA
        );
        add_no_work_message(message);

        if (config.debug_send_job) {
            log_messages.printf(MSG_NORMAL,
                "[send_job] [WU#%u %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n",
                wu.id, wu.name, wu.rsc_memory_bound/MEGA,
                g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA
            );
        }
        g_wreq->mem.set_insufficient(wu.rsc_memory_bound);
        g_reply->set_delay(DELAY_NO_WORK_TEMP);
        return INFEASIBLE_MEM;
    }
    return 0;
}
|
|
|
|
// Check the job's disk bound against the disk space available on the host.
// Returns 0 if wu.rsc_disk_bound does not exceed g_wreq->disk_available.
// Otherwise queues a user-visible "no work" message,
// passes the shortfall to g_wreq->disk.set_insufficient(),
// and returns INFEASIBLE_DISK.
//
static inline int check_disk(WORKUNIT& wu) {
    double diff = wu.rsc_disk_bound - g_wreq->disk_available;
    if (diff > 0) {
        char message[256];
        // bounded write: the length of the app's user-friendly name
        // is not limited here, so truncate rather than overrun the buffer
        //
        snprintf(message, sizeof(message),
            "%s needs %0.2fMB more disk space.  You currently have %0.2f MB available and it needs %0.2f MB.",
            find_user_friendly_name(wu.appid),
            diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA
        );
        add_no_work_message(message);

        g_wreq->disk.set_insufficient(diff);
        return INFEASIBLE_DISK;
    }
    return 0;
}
|
|
|
|
// Check the job's download bandwidth bound against the host's
// measured download bandwidth (g_reply->host.n_bwdown).
// Returns 0 if the job has no bandwidth bound, if the host has no
// measurement yet, or if the measurement meets the bound.
// Otherwise queues a user-visible "no work" message,
// passes the shortfall to g_wreq->bandwidth.set_insufficient(),
// and returns INFEASIBLE_BANDWIDTH.
//
static inline int check_bandwidth(WORKUNIT& wu) {
    if (wu.rsc_bandwidth_bound == 0) return 0;

    // if n_bwdown is zero, the host has never downloaded anything,
    // so skip this check
    //
    if (g_reply->host.n_bwdown == 0) return 0;

    double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown;
    if (diff > 0) {
        char message[256];
        // bounded write: the length of the app's user-friendly name
        // is not limited here, so truncate rather than overrun the buffer
        //
        snprintf(message, sizeof(message),
            "%s requires %0.2f KB/sec download bandwidth.  Your computer has been measured at %0.2f KB/sec.",
            find_user_friendly_name(wu.appid),
            wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO
        );
        add_no_work_message(message);

        g_wreq->bandwidth.set_insufficient(diff);
        return INFEASIBLE_BANDWIDTH;
    }
    return 0;
}
|
|
|
|
// Determine if the app is "hard",
|
|
// and we should send it only to high-end hosts.
|
|
// Currently this is specified by setting weight=-1;
|
|
// this is a kludge for SETI@home/Astropulse.
|
|
//
|
|
static inline bool hard_app(APP& app) {
|
|
return (app.weight == -1);
|
|
}
|
|
|
|
// Return the estimated delay reported by the client for the
// processor type used by the given app version:
// g_request->cpu_estimated_delay for CPU versions,
// else the estimated_delay of the matching coprocessor.
// If the request has no coprocessor entry for that type,
// log the problem and return 0 rather than dereference a null pointer.
//
static inline double get_estimated_delay(BEST_APP_VERSION& bav) {
    int pt = bav.host_usage.proc_type;
    if (pt == PROC_TYPE_CPU) {
        return g_request->cpu_estimated_delay;
    }
    COPROC* cp = g_request->coprocs.proc_type_to_coproc(pt);
    if (!cp) {
        // NOTE(review): 0 makes check_deadline() treat the host as having
        // no queued work of this type; confirm this is the desired fallback
        //
        log_messages.printf(MSG_CRITICAL,
            "get_estimated_delay(): no coproc for proc type %d\n", pt
        );
        return 0;
    }
    return cp->estimated_delay;
}
|
|
|
|
|
|
// return the delay bound to use for this job/host.
|
|
// Actually, return two: optimistic (lower) and pessimistic (higher).
|
|
// If the deadline check with the optimistic bound fails,
|
|
// try the pessimistic bound.
|
|
// TODO: clean up this mess
|
|
//
|
|
static void get_delay_bound_range(
|
|
WORKUNIT& wu,
|
|
int res_server_state, int res_priority, double res_report_deadline,
|
|
BEST_APP_VERSION& bav,
|
|
double& opt, double& pess
|
|
) {
|
|
if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
|
|
double now = dtime();
|
|
if (res_report_deadline < now) {
|
|
// if original deadline has passed, return zeros
|
|
// This will skip deadline check.
|
|
opt = pess = 0;
|
|
return;
|
|
}
|
|
opt = res_report_deadline - now;
|
|
pess = wu.delay_bound;
|
|
} else {
|
|
opt = pess = wu.delay_bound;
|
|
|
|
// If the workunit needs reliable and is being sent to a reliable host,
|
|
// then shorten the delay bound by the percent specified
|
|
//
|
|
if (config.reliable_on_priority && res_priority >= config.reliable_on_priority && config.reliable_reduced_delay_bound > 0.01
|
|
) {
|
|
opt = wu.delay_bound*config.reliable_reduced_delay_bound;
|
|
double est_wallclock_duration = estimate_duration(wu, bav);
|
|
|
|
// Check to see how reasonable this reduced time is.
|
|
// Increase it to twice the estimated delay bound
|
|
// if all the following apply:
|
|
//
|
|
// 1) Twice the estimate is longer then the reduced delay bound
|
|
// 2) Twice the estimate is less then the original delay bound
|
|
// 3) Twice the estimate is less then the twice the reduced delay bound
|
|
if (est_wallclock_duration*2 > opt
|
|
&& est_wallclock_duration*2 < wu.delay_bound
|
|
&& est_wallclock_duration*2 < wu.delay_bound*config.reliable_reduced_delay_bound*2
|
|
) {
|
|
opt = est_wallclock_duration*2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// return 0 if the job, with the given delay bound,
|
|
// will complete by its deadline, and won't cause other jobs to miss deadlines.
|
|
//
|
|
static inline int check_deadline(
|
|
WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav
|
|
) {
|
|
if (config.ignore_delay_bound) return 0;
|
|
|
|
// skip delay check if host currently doesn't have any work
|
|
// and it's not a hard app.
|
|
// (i.e. everyone gets one result, no matter how slow they are)
|
|
//
|
|
if (get_estimated_delay(bav) == 0 && !hard_app(app)) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [WU#%u] est delay 0, skipping deadline check\n",
|
|
wu.id
|
|
);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// if it's a hard app, don't send it to a host with no credit
|
|
//
|
|
if (hard_app(app) && g_reply->host.total_credit == 0) {
|
|
return INFEASIBLE_CPU;
|
|
}
|
|
|
|
// do EDF simulation if possible; else use cruder approximation
|
|
//
|
|
if (config.workload_sim && g_request->have_other_results_list) {
|
|
double est_dur = estimate_duration(wu, bav);
|
|
if (g_reply->wreq.edf_reject_test(est_dur, wu.delay_bound)) {
|
|
return INFEASIBLE_WORKLOAD;
|
|
}
|
|
IP_RESULT candidate("", wu.delay_bound, est_dur);
|
|
safe_strcpy(candidate.name, wu.name);
|
|
if (check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) {
|
|
// it passed the feasibility test,
|
|
// but don't add it to the workload yet;
|
|
// wait until we commit to sending it
|
|
} else {
|
|
g_reply->wreq.edf_reject(est_dur, wu.delay_bound);
|
|
g_reply->wreq.speed.set_insufficient(0);
|
|
return INFEASIBLE_WORKLOAD;
|
|
}
|
|
} else {
|
|
double ewd = estimate_duration(wu, bav);
|
|
if (hard_app(app)) ewd *= 1.3;
|
|
double est_report_delay = get_estimated_delay(bav) + ewd;
|
|
double diff = est_report_delay - wu.delay_bound;
|
|
if (diff > 0) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [WU#%u] deadline miss %d > %d\n",
|
|
wu.id, (int)est_report_delay, wu.delay_bound
|
|
);
|
|
}
|
|
g_reply->wreq.speed.set_insufficient(diff);
|
|
return INFEASIBLE_CPU;
|
|
} else {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [WU#%u] meets deadline: %.2f + %.2f < %d\n",
|
|
wu.id, get_estimated_delay(bav), ewd, wu.delay_bound
|
|
);
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Fast checks (no DB access) to see if the job can be sent to the host.
|
|
// Reasons why not include:
|
|
// 1) the host doesn't have enough memory;
|
|
// 2) the host doesn't have enough disk space;
|
|
// 3) based on CPU speed, resource share and estimated delay,
|
|
// the host probably won't get the result done within the delay bound
|
|
// 4) app isn't in user's "approved apps" list
|
|
//
|
|
// If the job is feasible, return 0 and fill in wu.delay_bound
|
|
// with the delay bound we've decided to use.
|
|
//
|
|
int wu_is_infeasible_fast(
|
|
WORKUNIT& wu,
|
|
int res_server_state, int res_priority, double res_report_deadline,
|
|
APP& app, BEST_APP_VERSION& bav
|
|
) {
|
|
int retval;
|
|
|
|
// project-specific check
|
|
//
|
|
if (wu_is_infeasible_custom(wu, app, bav)) {
|
|
return INFEASIBLE_CUSTOM;
|
|
}
|
|
|
|
if (config.user_filter) {
|
|
if (wu.batch && wu.batch != g_reply->user.id) {
|
|
return INFEASIBLE_USER_FILTER;
|
|
}
|
|
}
|
|
|
|
// homogeneous redundancy: can't send if app uses HR and
|
|
// 1) host is of unknown HR class, or
|
|
// 2) WU is already committed to different HR class
|
|
//
|
|
if (app_hr_type(app)) {
|
|
if (hr_unknown_class(g_reply->host, app_hr_type(app))) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [HOST#%d] [WU#%u %s] host is of unknown class in HR type %d\n",
|
|
g_reply->host.id, wu.id, wu.name, app_hr_type(app)
|
|
);
|
|
}
|
|
return INFEASIBLE_HR;
|
|
}
|
|
if (already_sent_to_different_hr_class(wu, app)) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [HOST#%d] [WU#%u %s] failed quick HR check: WU is class %d, host is class %d\n",
|
|
g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app))
|
|
);
|
|
}
|
|
return INFEASIBLE_HR;
|
|
}
|
|
}
|
|
|
|
// homogeneous app version
|
|
//
|
|
if (app.homogeneous_app_version) {
|
|
int avid = wu.app_version_id;
|
|
if (avid && bav.avp->id != avid) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [HOST#%d] [WU#%u %s] failed homogeneous app version check: %d %d\n",
|
|
g_reply->host.id, wu.id, wu.name, avid, bav.avp->id
|
|
);
|
|
}
|
|
return INFEASIBLE_HAV;
|
|
}
|
|
}
|
|
|
|
if (config.one_result_per_user_per_wu || config.one_result_per_host_per_wu) {
|
|
if (wu_already_in_reply(wu)) {
|
|
return INFEASIBLE_DUP;
|
|
}
|
|
}
|
|
|
|
retval = check_memory(wu);
|
|
if (retval) return retval;
|
|
retval = check_disk(wu);
|
|
if (retval) return retval;
|
|
retval = check_bandwidth(wu);
|
|
if (retval) return retval;
|
|
|
|
if (app.non_cpu_intensive) {
|
|
return 0;
|
|
}
|
|
|
|
// do deadline check last because EDF sim uses some CPU
|
|
//
|
|
double opt, pess;
|
|
get_delay_bound_range(
|
|
wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess
|
|
);
|
|
wu.delay_bound = (int)opt;
|
|
if (opt == 0) {
|
|
// this is a resend; skip deadline check
|
|
return 0;
|
|
}
|
|
retval = check_deadline(wu, app, bav);
|
|
if (retval && (opt != pess)) {
|
|
wu.delay_bound = (int)pess;
|
|
retval = check_deadline(wu, app, bav);
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
// Do checks that require DB access for whether we can send this job,
|
|
// and return:
|
|
// 0 if OK to send
|
|
// 1 if can't send to this host
|
|
// 2 if can't send to ANY host
|
|
//
|
|
int slow_check(
|
|
WU_RESULT& wu_result, // the job cache entry.
|
|
// We may refresh its hr_class and app_version_id fields.
|
|
APP* app,
|
|
BEST_APP_VERSION* bavp // the app version to be used
|
|
) {
|
|
int n, retval;
|
|
DB_RESULT result;
|
|
char buf[256];
|
|
WORKUNIT& wu = wu_result.workunit;
|
|
|
|
// Don't send if we've already sent a result of this WU to this user.
|
|
//
|
|
if (config.one_result_per_user_per_wu) {
|
|
sprintf(buf,
|
|
"where workunitid=%d and userid=%d", wu.id, g_reply->user.id
|
|
);
|
|
retval = result.count(n, buf);
|
|
if (retval) {
|
|
log_messages.printf(MSG_CRITICAL,
|
|
"send_work: can't get result count (%s)\n", boincerror(retval)
|
|
);
|
|
return 1;
|
|
} else {
|
|
if (n>0) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [USER#%d] already has %d result(s) for [WU#%u]\n",
|
|
g_reply->user.id, n, wu.id
|
|
);
|
|
}
|
|
return 1;
|
|
}
|
|
}
|
|
} else if (config.one_result_per_host_per_wu) {
|
|
// Don't send if we've already sent a result of this WU to this host.
|
|
// We only have to check this if we don't send one result per user.
|
|
//
|
|
sprintf(buf,
|
|
"where workunitid=%d and hostid=%d", wu.id, g_reply->host.id
|
|
);
|
|
retval = result.count(n, buf);
|
|
if (retval) {
|
|
log_messages.printf(MSG_CRITICAL,
|
|
"send_work: can't get result count (%s)\n", boincerror(retval)
|
|
);
|
|
return 1;
|
|
} else {
|
|
if (n>0) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [HOST#%d] already has %d result(s) for [WU#%u]\n",
|
|
g_reply->host.id, n, wu.id
|
|
);
|
|
}
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Checks that require looking up the WU.
|
|
// Lump these together so we only do 1 lookup
|
|
//
|
|
if (app_hr_type(*app) || app->homogeneous_app_version) {
|
|
DB_WORKUNIT db_wu;
|
|
db_wu.id = wu.id;
|
|
int vals[3];
|
|
retval = db_wu.get_field_ints(
|
|
"hr_class, app_version_id, error_mask", 3, vals
|
|
);
|
|
if (retval) {
|
|
log_messages.printf(MSG_CRITICAL,
|
|
"can't get fields for [WU#%u]: %s\n", db_wu.id, boincerror(retval)
|
|
);
|
|
return 1;
|
|
}
|
|
|
|
// check wu.error_mask
|
|
//
|
|
if (vals[2] != 0) {
|
|
return 2;
|
|
}
|
|
|
|
if (app_hr_type(*app)) {
|
|
wu.hr_class = vals[0];
|
|
if (already_sent_to_different_hr_class(wu, *app)) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [HOST#%d] [WU#%u %s] is assigned to different HR class\n",
|
|
g_reply->host.id, wu.id, wu.name
|
|
);
|
|
}
|
|
// Mark the workunit as infeasible.
|
|
// This ensures that jobs already assigned to an HR class
|
|
// are processed first.
|
|
//
|
|
wu_result.infeasible_count++;
|
|
return 1;
|
|
}
|
|
}
|
|
if (app->homogeneous_app_version) {
|
|
int wu_avid = vals[1];
|
|
wu.app_version_id = wu_avid;
|
|
if (wu_avid && wu_avid != bavp->avp->id) {
|
|
if (config.debug_send_job) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[send_job] [HOST#%d] [WU#%u %s] is assigned to different app version\n",
|
|
g_reply->host.id, wu.id, wu.name
|
|
);
|
|
}
|
|
wu_result.infeasible_count++;
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Check for pathological conditions that mean
|
|
// result is not sendable at all.
|
|
//
|
|
bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) {
|
|
int retval = result.lookup_id(result.id);
|
|
if (retval) {
|
|
log_messages.printf(MSG_CRITICAL,
|
|
"[RESULT#%u] result.lookup_id() failed: %s\n",
|
|
result.id, boincerror(retval)
|
|
);
|
|
return false;
|
|
}
|
|
if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
|
|
log_messages.printf(MSG_NORMAL,
|
|
"[RESULT#%u] expected to be unsent; instead, state is %d\n",
|
|
result.id, result.server_state
|
|
);
|
|
return false;
|
|
}
|
|
if (result.workunitid != wu.id) {
|
|
log_messages.printf(MSG_CRITICAL,
|
|
"[RESULT#%u] wrong WU ID: wanted %d, got %d\n",
|
|
result.id, wu.id, result.workunitid
|
|
);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|