mirror of https://github.com/BOINC/boinc.git
- scheduler: code shuffle; new file sched_check.cpp contains functions
that decide whether a job can be sent to a host
This commit is contained in:
parent
12319ca82b
commit
c9c9f2bae0
|
@ -151,6 +151,7 @@ cgi_sources = \
|
|||
plan_class_spec.cpp \
|
||||
sched_array.cpp \
|
||||
sched_assign.cpp \
|
||||
sched_check.cpp \
|
||||
sched_customize.cpp \
|
||||
sched_files.cpp \
|
||||
sched_hr.cpp \
|
||||
|
|
|
@ -963,22 +963,6 @@ int assign_credit_set(
|
|||
return 0;
|
||||
}
|
||||
|
||||
// A job has:
|
||||
// - errored out (scheduler)
|
||||
// - timed out (transitioner)
|
||||
// - failed validation (validator).
|
||||
// Put (host/app_version) on "host scale probation",
|
||||
// so that we won't use host scaling for a while.
|
||||
//
|
||||
void got_error(DB_HOST_APP_VERSION &hav) {
|
||||
if (config.debug_credit) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[credit] [HAV#%d] got error, setting error rate to %f\n",
|
||||
hav.app_version_id, ERROR_RATE_INIT
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// carefully write any app_version records that have changed;
|
||||
// done at the end of every validator scan.
|
||||
//
|
||||
|
|
|
@ -19,9 +19,6 @@
|
|||
|
||||
#include "boinc_db.h"
|
||||
|
||||
#define ERROR_RATE_INIT 0.1
|
||||
// the initial error rate of a host or app version
|
||||
|
||||
#define MIN_HOST_SAMPLES 10
|
||||
// use host scaling only if have this many samples for host
|
||||
#define MIN_VERSION_SAMPLES 100
|
||||
|
|
|
@ -23,15 +23,17 @@
|
|||
|
||||
#include "config.h"
|
||||
|
||||
#include "sched_main.h"
|
||||
#include "sched_types.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_hr.h"
|
||||
#include "sched_check.h"
|
||||
#include "sched_config.h"
|
||||
#include "sched_util.h"
|
||||
#include "sched_hr.h"
|
||||
#include "sched_main.h"
|
||||
#include "sched_msgs.h"
|
||||
#include "sched_send.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_types.h"
|
||||
#include "sched_util.h"
|
||||
#include "sched_version.h"
|
||||
|
||||
#ifdef _USING_FCGI_
|
||||
#include "boinc_fcgi.h"
|
||||
#endif
|
||||
|
@ -201,159 +203,6 @@ static bool quick_check(
|
|||
return true;
|
||||
}
|
||||
|
||||
// Do checks that require DB access for whether we can send this job,
|
||||
// and return:
|
||||
// 0 if OK to send
|
||||
// 1 if can't send to this host
|
||||
// 2 if can't send to ANY host
|
||||
//
|
||||
int slow_check(
|
||||
WU_RESULT& wu_result, // the job cache entry.
|
||||
// We may refresh its hr_class and app_version_id fields.
|
||||
APP* app,
|
||||
BEST_APP_VERSION* bavp // the app version to be used
|
||||
) {
|
||||
int n, retval;
|
||||
DB_RESULT result;
|
||||
char buf[256];
|
||||
WORKUNIT& wu = wu_result.workunit;
|
||||
|
||||
// Don't send if we've already sent a result of this WU to this user.
|
||||
//
|
||||
if (config.one_result_per_user_per_wu) {
|
||||
sprintf(buf,
|
||||
"where workunitid=%d and userid=%d", wu.id, g_reply->user.id
|
||||
);
|
||||
retval = result.count(n, buf);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"send_work: can't get result count (%s)\n", boincerror(retval)
|
||||
);
|
||||
return 1;
|
||||
} else {
|
||||
if (n>0) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [USER#%d] already has %d result(s) for [WU#%d]\n",
|
||||
g_reply->user.id, n, wu.id
|
||||
);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
} else if (config.one_result_per_host_per_wu) {
|
||||
// Don't send if we've already sent a result of this WU to this host.
|
||||
// We only have to check this if we don't send one result per user.
|
||||
//
|
||||
sprintf(buf,
|
||||
"where workunitid=%d and hostid=%d", wu.id, g_reply->host.id
|
||||
);
|
||||
retval = result.count(n, buf);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"send_work: can't get result count (%s)\n", boincerror(retval)
|
||||
);
|
||||
return 1;
|
||||
} else {
|
||||
if (n>0) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] already has %d result(s) for [WU#%d]\n",
|
||||
g_reply->host.id, n, wu.id
|
||||
);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that require looking up the WU.
|
||||
// Lump these together so we only do 1 lookup
|
||||
//
|
||||
if (app_hr_type(*app) || app->homogeneous_app_version) {
|
||||
DB_WORKUNIT db_wu;
|
||||
db_wu.id = wu.id;
|
||||
int vals[3];
|
||||
retval = db_wu.get_field_ints(
|
||||
"hr_class, app_version_id, error_mask", 3, vals
|
||||
);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"can't get fields for [WU#%d]: %s\n", db_wu.id, boincerror(retval)
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// check wu.error_mask
|
||||
//
|
||||
if (vals[2] != 0) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (app_hr_type(*app)) {
|
||||
wu.hr_class = vals[0];
|
||||
if (already_sent_to_different_hr_class(wu, *app)) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] is assigned to different HR class\n",
|
||||
g_reply->host.id, wu.id, wu.name
|
||||
);
|
||||
}
|
||||
// Mark the workunit as infeasible.
|
||||
// This ensures that jobs already assigned to an HR class
|
||||
// are processed first.
|
||||
//
|
||||
wu_result.infeasible_count++;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (app->homogeneous_app_version) {
|
||||
int wu_avid = vals[1];
|
||||
wu.app_version_id = wu_avid;
|
||||
if (wu_avid && wu_avid != bavp->avp->id) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] is assigned to different app version\n",
|
||||
g_reply->host.id, wu.id, wu.name
|
||||
);
|
||||
}
|
||||
wu_result.infeasible_count++;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Check for pathological conditions that mean
|
||||
// result is not sendable at all.
|
||||
//
|
||||
bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) {
|
||||
int retval = result.lookup_id(result.id);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"[RESULT#%d] result.lookup_id() failed: %s\n",
|
||||
result.id, boincerror(retval)
|
||||
);
|
||||
return false;
|
||||
}
|
||||
if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[RESULT#%d] expected to be unsent; instead, state is %d\n",
|
||||
result.id, result.server_state
|
||||
);
|
||||
return false;
|
||||
}
|
||||
if (result.workunitid != wu.id) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"[RESULT#%d] wrong WU ID: wanted %d, got %d\n",
|
||||
result.id, wu.id, result.workunitid
|
||||
);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Make a pass through the wu/results array, sending work.
|
||||
// The choice of jobs is limited by flags in g_wreq, as follows:
|
||||
// infeasible_only:
|
||||
|
@ -502,7 +351,6 @@ recheck:
|
|||
|
||||
// Send work by scanning the job array multiple times,
|
||||
// with different selection criteria on each scan.
|
||||
// This has been superceded by send_work_matchmaker()
|
||||
//
|
||||
void send_work_old() {
|
||||
g_wreq->beta_only = false;
|
||||
|
|
|
@ -0,0 +1,526 @@
|
|||
// This file is part of BOINC.
|
||||
// http://boinc.berkeley.edu
|
||||
// Copyright (C) 2013 University of California
|
||||
//
|
||||
// BOINC is free software; you can redistribute it and/or modify it
|
||||
// under the terms of the GNU Lesser General Public License
|
||||
// as published by the Free Software Foundation,
|
||||
// either version 3 of the License, or (at your option) any later version.
|
||||
//
|
||||
// BOINC is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
// See the GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
// Check whether a job can be sent to a host
|
||||
|
||||
#include "credit.h"
|
||||
#include "sched_config.h"
|
||||
#include "sched_hr.h"
|
||||
#include "sched_main.h"
|
||||
#include "sched_msgs.h"
|
||||
#include "sched_send.h"
|
||||
|
||||
#include "sched_check.h"
|
||||
|
||||
#ifdef _USING_FCGI_
|
||||
#include "boinc_fcgi.h"
|
||||
#endif
|
||||
|
||||
const char* infeasible_string(int code) {
|
||||
switch (code) {
|
||||
case INFEASIBLE_MEM: return "Not enough memory";
|
||||
case INFEASIBLE_DISK: return "Not enough disk";
|
||||
case INFEASIBLE_CPU: return "CPU too slow";
|
||||
case INFEASIBLE_APP_SETTING: return "App not selected";
|
||||
case INFEASIBLE_WORKLOAD: return "Existing workload";
|
||||
case INFEASIBLE_DUP: return "Already in reply";
|
||||
case INFEASIBLE_HR: return "Homogeneous redundancy";
|
||||
case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low";
|
||||
}
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
// Return true if the user has set application preferences,
|
||||
// and this job is not for a selected app
|
||||
//
|
||||
bool app_not_selected(WORKUNIT& wu) {
|
||||
unsigned int i;
|
||||
|
||||
if (g_wreq->preferred_apps.size() == 0) return false;
|
||||
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
||||
if (wu.appid == g_wreq->preferred_apps[i].appid) {
|
||||
g_wreq->preferred_apps[i].work_available = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline int check_memory(WORKUNIT& wu) {
|
||||
double diff = wu.rsc_memory_bound - g_wreq->usable_ram;
|
||||
if (diff > 0) {
|
||||
char message[256];
|
||||
sprintf(message,
|
||||
"%s needs %0.2f MB RAM but only %0.2f MB is available for use.",
|
||||
find_user_friendly_name(wu.appid),
|
||||
wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA
|
||||
);
|
||||
add_no_work_message(message);
|
||||
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [WU#%d %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n",
|
||||
wu.id, wu.name, wu.rsc_memory_bound/MEGA,
|
||||
g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA
|
||||
);
|
||||
}
|
||||
g_wreq->mem.set_insufficient(wu.rsc_memory_bound);
|
||||
g_reply->set_delay(DELAY_NO_WORK_TEMP);
|
||||
return INFEASIBLE_MEM;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int check_disk(WORKUNIT& wu) {
|
||||
double diff = wu.rsc_disk_bound - g_wreq->disk_available;
|
||||
if (diff > 0) {
|
||||
char message[256];
|
||||
sprintf(message,
|
||||
"%s needs %0.2fMB more disk space. You currently have %0.2f MB available and it needs %0.2f MB.",
|
||||
find_user_friendly_name(wu.appid),
|
||||
diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA
|
||||
);
|
||||
add_no_work_message(message);
|
||||
|
||||
g_wreq->disk.set_insufficient(diff);
|
||||
return INFEASIBLE_DISK;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int check_bandwidth(WORKUNIT& wu) {
|
||||
if (wu.rsc_bandwidth_bound == 0) return 0;
|
||||
|
||||
// if n_bwdown is zero, the host has never downloaded anything,
|
||||
// so skip this check
|
||||
//
|
||||
if (g_reply->host.n_bwdown == 0) return 0;
|
||||
|
||||
double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown;
|
||||
if (diff > 0) {
|
||||
char message[256];
|
||||
sprintf(message,
|
||||
"%s requires %0.2f KB/sec download bandwidth. Your computer has been measured at %0.2f KB/sec.",
|
||||
find_user_friendly_name(wu.appid),
|
||||
wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO
|
||||
);
|
||||
add_no_work_message(message);
|
||||
|
||||
g_wreq->bandwidth.set_insufficient(diff);
|
||||
return INFEASIBLE_BANDWIDTH;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Determine if the app is "hard",
|
||||
// and we should send it only to high-end hosts.
|
||||
// Currently this is specified by setting weight=-1;
|
||||
// this is a kludge for SETI@home/Astropulse.
|
||||
//
|
||||
static inline bool hard_app(APP& app) {
|
||||
return (app.weight == -1);
|
||||
}
|
||||
|
||||
static inline double get_estimated_delay(BEST_APP_VERSION& bav) {
|
||||
int pt = bav.host_usage.proc_type;
|
||||
if (pt == PROC_TYPE_CPU) {
|
||||
return g_request->cpu_estimated_delay;
|
||||
}
|
||||
COPROC* cp = g_request->coprocs.type_to_coproc(pt);
|
||||
return cp->estimated_delay;
|
||||
}
|
||||
|
||||
|
||||
// return the delay bound to use for this job/host.
|
||||
// Actually, return two: optimistic (lower) and pessimistic (higher).
|
||||
// If the deadline check with the optimistic bound fails,
|
||||
// try the pessimistic bound.
|
||||
// TODO: clean up this mess
|
||||
//
|
||||
static void get_delay_bound_range(
|
||||
WORKUNIT& wu,
|
||||
int res_server_state, int res_priority, double res_report_deadline,
|
||||
BEST_APP_VERSION& bav,
|
||||
double& opt, double& pess
|
||||
) {
|
||||
if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
|
||||
double now = dtime();
|
||||
if (res_report_deadline < now) {
|
||||
// if original deadline has passed, return zeros
|
||||
// This will skip deadline check.
|
||||
opt = pess = 0;
|
||||
return;
|
||||
}
|
||||
opt = res_report_deadline - now;
|
||||
pess = wu.delay_bound;
|
||||
} else {
|
||||
opt = pess = wu.delay_bound;
|
||||
|
||||
// If the workunit needs reliable and is being sent to a reliable host,
|
||||
// then shorten the delay bound by the percent specified
|
||||
//
|
||||
if (config.reliable_on_priority && res_priority >= config.reliable_on_priority && config.reliable_reduced_delay_bound > 0.01
|
||||
) {
|
||||
opt = wu.delay_bound*config.reliable_reduced_delay_bound;
|
||||
double est_wallclock_duration = estimate_duration(wu, bav);
|
||||
|
||||
// Check to see how reasonable this reduced time is.
|
||||
// Increase it to twice the estimated delay bound
|
||||
// if all the following apply:
|
||||
//
|
||||
// 1) Twice the estimate is longer then the reduced delay bound
|
||||
// 2) Twice the estimate is less then the original delay bound
|
||||
// 3) Twice the estimate is less then the twice the reduced delay bound
|
||||
if (est_wallclock_duration*2 > opt
|
||||
&& est_wallclock_duration*2 < wu.delay_bound
|
||||
&& est_wallclock_duration*2 < wu.delay_bound*config.reliable_reduced_delay_bound*2
|
||||
) {
|
||||
opt = est_wallclock_duration*2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// return 0 if the job, with the given delay bound,
|
||||
// will complete by its deadline, and won't cause other jobs to miss deadlines.
|
||||
//
|
||||
static inline int check_deadline(
|
||||
WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav
|
||||
) {
|
||||
if (config.ignore_delay_bound) return 0;
|
||||
|
||||
// skip delay check if host currently doesn't have any work
|
||||
// and it's not a hard app.
|
||||
// (i.e. everyone gets one result, no matter how slow they are)
|
||||
//
|
||||
if (get_estimated_delay(bav) == 0 && !hard_app(app)) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] est delay 0, skipping deadline check\n"
|
||||
);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// if it's a hard app, don't send it to a host with no credit
|
||||
//
|
||||
if (hard_app(app) && g_reply->host.total_credit == 0) {
|
||||
return INFEASIBLE_CPU;
|
||||
}
|
||||
|
||||
// do EDF simulation if possible; else use cruder approximation
|
||||
//
|
||||
if (config.workload_sim && g_request->have_other_results_list) {
|
||||
double est_dur = estimate_duration(wu, bav);
|
||||
if (g_reply->wreq.edf_reject_test(est_dur, wu.delay_bound)) {
|
||||
return INFEASIBLE_WORKLOAD;
|
||||
}
|
||||
IP_RESULT candidate("", wu.delay_bound, est_dur);
|
||||
strcpy(candidate.name, wu.name);
|
||||
if (check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) {
|
||||
// it passed the feasibility test,
|
||||
// but don't add it to the workload yet;
|
||||
// wait until we commit to sending it
|
||||
} else {
|
||||
g_reply->wreq.edf_reject(est_dur, wu.delay_bound);
|
||||
g_reply->wreq.speed.set_insufficient(0);
|
||||
return INFEASIBLE_WORKLOAD;
|
||||
}
|
||||
} else {
|
||||
double ewd = estimate_duration(wu, bav);
|
||||
if (hard_app(app)) ewd *= 1.3;
|
||||
double est_report_delay = get_estimated_delay(bav) + ewd;
|
||||
double diff = est_report_delay - wu.delay_bound;
|
||||
if (diff > 0) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [WU#%d] deadline miss %d > %d\n",
|
||||
wu.id, (int)est_report_delay, wu.delay_bound
|
||||
);
|
||||
}
|
||||
g_reply->wreq.speed.set_insufficient(diff);
|
||||
return INFEASIBLE_CPU;
|
||||
} else {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [WU#%d] meets deadline: %.2f + %.2f < %d\n",
|
||||
wu.id, get_estimated_delay(bav), ewd, wu.delay_bound
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fast checks (no DB access) to see if the job can be sent to the host.
|
||||
// Reasons why not include:
|
||||
// 1) the host doesn't have enough memory;
|
||||
// 2) the host doesn't have enough disk space;
|
||||
// 3) based on CPU speed, resource share and estimated delay,
|
||||
// the host probably won't get the result done within the delay bound
|
||||
// 4) app isn't in user's "approved apps" list
|
||||
//
|
||||
// If the job is feasible, return 0 and fill in wu.delay_bound
|
||||
// with the delay bound we've decided to use.
|
||||
//
|
||||
int wu_is_infeasible_fast(
|
||||
WORKUNIT& wu,
|
||||
int res_server_state, int res_priority, double res_report_deadline,
|
||||
APP& app, BEST_APP_VERSION& bav
|
||||
) {
|
||||
int retval;
|
||||
|
||||
// project-specific check
|
||||
//
|
||||
if (wu_is_infeasible_custom(wu, app, bav)) {
|
||||
return INFEASIBLE_CUSTOM;
|
||||
}
|
||||
|
||||
if (config.user_filter) {
|
||||
if (wu.batch && wu.batch != g_reply->user.id) {
|
||||
return INFEASIBLE_USER_FILTER;
|
||||
}
|
||||
}
|
||||
|
||||
// homogeneous redundancy: can't send if app uses HR and
|
||||
// 1) host is of unknown HR class, or
|
||||
// 2) WU is already committed to different HR class
|
||||
//
|
||||
if (app_hr_type(app)) {
|
||||
if (hr_unknown_class(g_reply->host, app_hr_type(app))) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] host is of unknown class in HR type %d\n",
|
||||
g_reply->host.id, wu.id, wu.name, app_hr_type(app)
|
||||
);
|
||||
}
|
||||
return INFEASIBLE_HR;
|
||||
}
|
||||
if (already_sent_to_different_hr_class(wu, app)) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] failed quick HR check: WU is class %d, host is class %d\n",
|
||||
g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app))
|
||||
);
|
||||
}
|
||||
return INFEASIBLE_HR;
|
||||
}
|
||||
}
|
||||
|
||||
// homogeneous app version
|
||||
//
|
||||
if (app.homogeneous_app_version) {
|
||||
int avid = wu.app_version_id;
|
||||
if (avid && bav.avp->id != avid) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] failed homogeneous app version check: %d %d\n",
|
||||
g_reply->host.id, wu.id, wu.name, avid, bav.avp->id
|
||||
);
|
||||
}
|
||||
return INFEASIBLE_HAV;
|
||||
}
|
||||
}
|
||||
|
||||
if (config.one_result_per_user_per_wu || config.one_result_per_host_per_wu) {
|
||||
if (wu_already_in_reply(wu)) {
|
||||
return INFEASIBLE_DUP;
|
||||
}
|
||||
}
|
||||
|
||||
retval = check_memory(wu);
|
||||
if (retval) return retval;
|
||||
retval = check_disk(wu);
|
||||
if (retval) return retval;
|
||||
retval = check_bandwidth(wu);
|
||||
if (retval) return retval;
|
||||
|
||||
if (app.non_cpu_intensive) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// do deadline check last because EDF sim uses some CPU
|
||||
//
|
||||
double opt, pess;
|
||||
get_delay_bound_range(
|
||||
wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess
|
||||
);
|
||||
wu.delay_bound = (int)opt;
|
||||
if (opt == 0) {
|
||||
// this is a resend; skip deadline check
|
||||
return 0;
|
||||
}
|
||||
retval = check_deadline(wu, app, bav);
|
||||
if (retval && (opt != pess)) {
|
||||
wu.delay_bound = (int)pess;
|
||||
retval = check_deadline(wu, app, bav);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Do checks that require DB access for whether we can send this job,
|
||||
// and return:
|
||||
// 0 if OK to send
|
||||
// 1 if can't send to this host
|
||||
// 2 if can't send to ANY host
|
||||
//
|
||||
int slow_check(
|
||||
WU_RESULT& wu_result, // the job cache entry.
|
||||
// We may refresh its hr_class and app_version_id fields.
|
||||
APP* app,
|
||||
BEST_APP_VERSION* bavp // the app version to be used
|
||||
) {
|
||||
int n, retval;
|
||||
DB_RESULT result;
|
||||
char buf[256];
|
||||
WORKUNIT& wu = wu_result.workunit;
|
||||
|
||||
// Don't send if we've already sent a result of this WU to this user.
|
||||
//
|
||||
if (config.one_result_per_user_per_wu) {
|
||||
sprintf(buf,
|
||||
"where workunitid=%d and userid=%d", wu.id, g_reply->user.id
|
||||
);
|
||||
retval = result.count(n, buf);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"send_work: can't get result count (%s)\n", boincerror(retval)
|
||||
);
|
||||
return 1;
|
||||
} else {
|
||||
if (n>0) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [USER#%d] already has %d result(s) for [WU#%d]\n",
|
||||
g_reply->user.id, n, wu.id
|
||||
);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
} else if (config.one_result_per_host_per_wu) {
|
||||
// Don't send if we've already sent a result of this WU to this host.
|
||||
// We only have to check this if we don't send one result per user.
|
||||
//
|
||||
sprintf(buf,
|
||||
"where workunitid=%d and hostid=%d", wu.id, g_reply->host.id
|
||||
);
|
||||
retval = result.count(n, buf);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"send_work: can't get result count (%s)\n", boincerror(retval)
|
||||
);
|
||||
return 1;
|
||||
} else {
|
||||
if (n>0) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] already has %d result(s) for [WU#%d]\n",
|
||||
g_reply->host.id, n, wu.id
|
||||
);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that require looking up the WU.
|
||||
// Lump these together so we only do 1 lookup
|
||||
//
|
||||
if (app_hr_type(*app) || app->homogeneous_app_version) {
|
||||
DB_WORKUNIT db_wu;
|
||||
db_wu.id = wu.id;
|
||||
int vals[3];
|
||||
retval = db_wu.get_field_ints(
|
||||
"hr_class, app_version_id, error_mask", 3, vals
|
||||
);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"can't get fields for [WU#%d]: %s\n", db_wu.id, boincerror(retval)
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// check wu.error_mask
|
||||
//
|
||||
if (vals[2] != 0) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (app_hr_type(*app)) {
|
||||
wu.hr_class = vals[0];
|
||||
if (already_sent_to_different_hr_class(wu, *app)) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] is assigned to different HR class\n",
|
||||
g_reply->host.id, wu.id, wu.name
|
||||
);
|
||||
}
|
||||
// Mark the workunit as infeasible.
|
||||
// This ensures that jobs already assigned to an HR class
|
||||
// are processed first.
|
||||
//
|
||||
wu_result.infeasible_count++;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (app->homogeneous_app_version) {
|
||||
int wu_avid = vals[1];
|
||||
wu.app_version_id = wu_avid;
|
||||
if (wu_avid && wu_avid != bavp->avp->id) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] is assigned to different app version\n",
|
||||
g_reply->host.id, wu.id, wu.name
|
||||
);
|
||||
}
|
||||
wu_result.infeasible_count++;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Check for pathological conditions that mean
|
||||
// result is not sendable at all.
|
||||
//
|
||||
bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) {
|
||||
int retval = result.lookup_id(result.id);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"[RESULT#%d] result.lookup_id() failed: %s\n",
|
||||
result.id, boincerror(retval)
|
||||
);
|
||||
return false;
|
||||
}
|
||||
if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[RESULT#%d] expected to be unsent; instead, state is %d\n",
|
||||
result.id, result.server_state
|
||||
);
|
||||
return false;
|
||||
}
|
||||
if (result.workunitid != wu.id) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"[RESULT#%d] wrong WU ID: wanted %d, got %d\n",
|
||||
result.id, wu.id, result.workunitid
|
||||
);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
// This file is part of BOINC.
|
||||
// http://boinc.berkeley.edu
|
||||
// Copyright (C) 2013 University of California
|
||||
//
|
||||
// BOINC is free software; you can redistribute it and/or modify it
|
||||
// under the terms of the GNU Lesser General Public License
|
||||
// as published by the Free Software Foundation,
|
||||
// either version 3 of the License, or (at your option) any later version.
|
||||
//
|
||||
// BOINC is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
// See the GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
||||
// values returned by wu_is_infeasible()
|
||||
|
||||
#ifndef _SCHED_CHECK_
|
||||
#define _SCHED_CHECK_
|
||||
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_types.h"
|
||||
|
||||
#define INFEASIBLE_MEM 1
|
||||
#define INFEASIBLE_DISK 2
|
||||
#define INFEASIBLE_CPU 3
|
||||
#define INFEASIBLE_WORK_BUF 4
|
||||
#define INFEASIBLE_APP_SETTING 5
|
||||
#define INFEASIBLE_WORKLOAD 6
|
||||
#define INFEASIBLE_DUP 7
|
||||
#define INFEASIBLE_HR 8
|
||||
#define INFEASIBLE_BANDWIDTH 9
|
||||
#define INFEASIBLE_CUSTOM 10
|
||||
#define INFEASIBLE_USER_FILTER 11
|
||||
#define INFEASIBLE_HAV 12
|
||||
|
||||
extern const char* infeasible_string(int);
|
||||
|
||||
extern int wu_is_infeasible_fast(
|
||||
WORKUNIT&,
|
||||
int res_server_state, int res_priority, double res_report_deadline,
|
||||
APP&, BEST_APP_VERSION&
|
||||
);
|
||||
extern int slow_check(WU_RESULT&, APP*, BEST_APP_VERSION*);
|
||||
extern bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu);
|
||||
|
||||
#endif
|
|
@ -57,6 +57,7 @@ using std::string;
|
|||
#include "str_util.h"
|
||||
#include "util.h"
|
||||
|
||||
#include "sched_check.h"
|
||||
#include "sched_config.h"
|
||||
#include "sched_main.h"
|
||||
#include "sched_msgs.h"
|
||||
|
@ -973,7 +974,7 @@ bool app_plan(SCHEDULER_REQUEST& sreq, char* plan_class, HOST_USAGE& hu) {
|
|||
return false;
|
||||
}
|
||||
|
||||
#if 1
|
||||
#ifndef NEW_SCORE
|
||||
// compute a "score" for sending this job to this host.
|
||||
// Return false if the WU is infeasible.
|
||||
// Otherwise set est_time and disk_usage.
|
||||
|
|
|
@ -35,15 +35,16 @@
|
|||
#include "str_util.h"
|
||||
#include "filesys.h"
|
||||
|
||||
#include "sched_main.h"
|
||||
#include "sched_types.h"
|
||||
#include "sched_check.h"
|
||||
#include "sched_config.h"
|
||||
#include "sched_locality.h"
|
||||
#include "sched_main.h"
|
||||
#include "sched_msgs.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_send.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_types.h"
|
||||
#include "sched_util.h"
|
||||
#include "sched_version.h"
|
||||
#include "sched_config.h"
|
||||
|
||||
#define VERBOSE_DEBUG
|
||||
|
||||
|
|
|
@ -32,16 +32,17 @@
|
|||
|
||||
#include "error_numbers.h"
|
||||
|
||||
#include "sched_main.h"
|
||||
#include "sched_check.h"
|
||||
#include "sched_config.h"
|
||||
#include "sched_customize.h"
|
||||
#include "sched_locality.h"
|
||||
#include "sched_main.h"
|
||||
#include "sched_msgs.h"
|
||||
#include "sched_send.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_types.h"
|
||||
#include "sched_util.h"
|
||||
#include "sched_version.h"
|
||||
#include "sched_types.h"
|
||||
|
||||
#include "sched_resend.h"
|
||||
|
||||
|
|
|
@ -23,18 +23,19 @@
|
|||
#include "error_numbers.h"
|
||||
#include "util.h"
|
||||
|
||||
#include "sched_main.h"
|
||||
#include "sched_check.h"
|
||||
#include "sched_config.h"
|
||||
#include "sched_hr.h"
|
||||
#include "sched_main.h"
|
||||
#include "sched_msgs.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_send.h"
|
||||
#include "sched_version.h"
|
||||
#include "sched_shmem.h"
|
||||
#include "sched_types.h"
|
||||
#include "sched_version.h"
|
||||
|
||||
#include "sched_score.h"
|
||||
|
||||
#if 0
|
||||
#ifdef NEW_SCORE
|
||||
|
||||
bool JOB::get_score(WU_RESULT& wu_result) {
|
||||
score = 0;
|
||||
|
|
|
@ -15,7 +15,9 @@
|
|||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#if 0
|
||||
#define NEW_SCORE
|
||||
|
||||
#ifdef NEW_SCORE
|
||||
struct JOB {
|
||||
int index;
|
||||
int result_id;
|
||||
|
|
|
@ -122,20 +122,6 @@ void add_job_files_to_host(WORKUNIT& wu) {
|
|||
}
|
||||
}
|
||||
|
||||
const char* infeasible_string(int code) {
|
||||
switch (code) {
|
||||
case INFEASIBLE_MEM: return "Not enough memory";
|
||||
case INFEASIBLE_DISK: return "Not enough disk";
|
||||
case INFEASIBLE_CPU: return "CPU too slow";
|
||||
case INFEASIBLE_APP_SETTING: return "App not selected";
|
||||
case INFEASIBLE_WORKLOAD: return "Existing workload";
|
||||
case INFEASIBLE_DUP: return "Already in reply";
|
||||
case INFEASIBLE_HR: return "Homogeneous redundancy";
|
||||
case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low";
|
||||
}
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
const double MIN_REQ_SECS = 0;
|
||||
const double MAX_REQ_SECS = (28*SECONDS_IN_DAY);
|
||||
|
||||
|
@ -181,12 +167,155 @@ void WORK_REQ::get_job_limits() {
|
|||
config.max_jobs_in_progress.reset(effective_ncpus, effective_ngpus);
|
||||
}
|
||||
|
||||
static const char* find_user_friendly_name(int appid) {
|
||||
const char* find_user_friendly_name(int appid) {
|
||||
APP* app = ssp->lookup_app(appid);
|
||||
if (app) return app->user_friendly_name;
|
||||
return "deprecated application";
|
||||
}
|
||||
|
||||
static void update_quota(DB_HOST_APP_VERSION& hav) {
|
||||
if (config.daily_result_quota) {
|
||||
if (hav.max_jobs_per_day == 0) {
|
||||
hav.max_jobs_per_day = config.daily_result_quota;
|
||||
if (config.debug_quota) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[quota] [HAV#%d] Initializing max_results_day to %d\n",
|
||||
hav.app_version_id,
|
||||
config.daily_result_quota
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) {
|
||||
if (config.debug_quota) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[quota] [HOST#%d] [HAV#%d] Resetting n_jobs_today\n",
|
||||
g_reply->host.id, hav.app_version_id
|
||||
);
|
||||
}
|
||||
hav.n_jobs_today = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// see how much RAM we can use on this machine
|
||||
//
|
||||
static inline void get_mem_sizes() {
|
||||
g_wreq->ram = g_reply->host.m_nbytes;
|
||||
if (g_wreq->ram <= 0) g_wreq->ram = DEFAULT_RAM_SIZE;
|
||||
g_wreq->usable_ram = g_wreq->ram;
|
||||
double busy_frac = g_request->global_prefs.ram_max_used_busy_frac;
|
||||
double idle_frac = g_request->global_prefs.ram_max_used_idle_frac;
|
||||
double frac = 1;
|
||||
if (busy_frac>0 && idle_frac>0) {
|
||||
frac = std::max(busy_frac, idle_frac);
|
||||
if (frac > 1) frac = 1;
|
||||
g_wreq->usable_ram *= frac;
|
||||
}
|
||||
}
|
||||
|
||||
// Decide whether or not this app version is 'reliable'
|
||||
// An app version is reliable if the following conditions are true
|
||||
// (for those that are set in the config file)
|
||||
// 1) The host average turnaround is less than a threshold
|
||||
// 2) consecutive_valid is above a threshold
|
||||
// 3) The host results per day is equal to the max value
|
||||
//
|
||||
void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) {
|
||||
if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) {
|
||||
|
||||
if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n",
|
||||
hav.app_version_id,
|
||||
hav.turnaround.get_avg()/3600,
|
||||
config.reliable_max_avg_turnaround*multiplier/3600
|
||||
);
|
||||
}
|
||||
hav.reliable = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (hav.consecutive_valid < CONS_VALID_RELIABLE) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [AV#%d] not reliable; cons valid %d < %d\n",
|
||||
hav.app_version_id,
|
||||
hav.consecutive_valid, CONS_VALID_RELIABLE
|
||||
);
|
||||
}
|
||||
hav.reliable = false;
|
||||
return;
|
||||
}
|
||||
if (config.daily_result_quota) {
|
||||
if (hav.max_jobs_per_day < config.daily_result_quota) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [AV#%d] not reliable; max_jobs_per_day %d>%d\n",
|
||||
hav.app_version_id,
|
||||
hav.max_jobs_per_day,
|
||||
config.daily_result_quota
|
||||
);
|
||||
}
|
||||
hav.reliable = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
hav.reliable = true;
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] app version %d is reliable\n",
|
||||
g_reply->host.id, hav.app_version_id
|
||||
);
|
||||
}
|
||||
g_wreq->has_reliable_version = true;
|
||||
}
|
||||
|
||||
// decide whether do unreplicated jobs with this app version
|
||||
//
|
||||
static void set_trust(DB_HOST_APP_VERSION& hav) {
|
||||
hav.trusted = false;
|
||||
if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] set_trust: cons valid %d < %d, don't use single replication\n",
|
||||
hav.consecutive_valid, CONS_VALID_UNREPLICATED
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
double x = 1./hav.consecutive_valid;
|
||||
if (drand() > x) hav.trusted = true;
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] set_trust: random choice for cons valid %d: %s\n",
|
||||
hav.consecutive_valid, hav.trusted?"yes":"no"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
static void get_reliability_and_trust() {
|
||||
// Platforms other than Windows, Linux and Intel Macs need a
|
||||
// larger set of computers to be marked reliable
|
||||
//
|
||||
double multiplier = 1.0;
|
||||
if (strstr(g_reply->host.os_name,"Windows")
|
||||
|| strstr(g_reply->host.os_name,"Linux")
|
||||
|| (strstr(g_reply->host.os_name,"Darwin")
|
||||
&& !(strstr(g_reply->host.p_vendor,"Power Macintosh"))
|
||||
)) {
|
||||
multiplier = 1.0;
|
||||
} else {
|
||||
multiplier = 1.8;
|
||||
}
|
||||
|
||||
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
|
||||
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
|
||||
get_reliability_version(hav, multiplier);
|
||||
set_trust(hav);
|
||||
}
|
||||
}
|
||||
|
||||
// Compute the max additional disk usage we can impose on the host.
|
||||
// Depending on the client version, it can either send us
|
||||
|
@ -373,112 +502,6 @@ static void get_prefs_info() {
|
|||
}
|
||||
}
|
||||
|
||||
// Decide whether or not this app version is 'reliable'
|
||||
// An app version is reliable if the following conditions are true
|
||||
// (for those that are set in the config file)
|
||||
// 1) The host average turnaround is less than a threshold
|
||||
// 2) consecutive_valid is above a threshold
|
||||
// 3) The host results per day is equal to the max value
|
||||
//
|
||||
void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) {
|
||||
if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) {
|
||||
|
||||
if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n",
|
||||
hav.app_version_id,
|
||||
hav.turnaround.get_avg()/3600,
|
||||
config.reliable_max_avg_turnaround*multiplier/3600
|
||||
);
|
||||
}
|
||||
hav.reliable = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (hav.consecutive_valid < CONS_VALID_RELIABLE) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [AV#%d] not reliable; cons valid %d < %d\n",
|
||||
hav.app_version_id,
|
||||
hav.consecutive_valid, CONS_VALID_RELIABLE
|
||||
);
|
||||
}
|
||||
hav.reliable = false;
|
||||
return;
|
||||
}
|
||||
if (config.daily_result_quota) {
|
||||
if (hav.max_jobs_per_day < config.daily_result_quota) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [AV#%d] not reliable; max_jobs_per_day %d>%d\n",
|
||||
hav.app_version_id,
|
||||
hav.max_jobs_per_day,
|
||||
config.daily_result_quota
|
||||
);
|
||||
}
|
||||
hav.reliable = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
hav.reliable = true;
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] app version %d is reliable\n",
|
||||
g_reply->host.id, hav.app_version_id
|
||||
);
|
||||
}
|
||||
g_wreq->has_reliable_version = true;
|
||||
}
|
||||
|
||||
// decide whether do unreplicated jobs with this app version
|
||||
//
|
||||
static void set_trust(DB_HOST_APP_VERSION& hav) {
|
||||
hav.trusted = false;
|
||||
if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] set_trust: cons valid %d < %d, don't use single replication\n",
|
||||
hav.consecutive_valid, CONS_VALID_UNREPLICATED
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
double x = 1./hav.consecutive_valid;
|
||||
if (drand() > x) hav.trusted = true;
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] set_trust: random choice for cons valid %d: %s\n",
|
||||
hav.consecutive_valid, hav.trusted?"yes":"no"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
static void update_quota(DB_HOST_APP_VERSION& hav) {
|
||||
if (config.daily_result_quota) {
|
||||
if (hav.max_jobs_per_day == 0) {
|
||||
hav.max_jobs_per_day = config.daily_result_quota;
|
||||
if (config.debug_quota) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[quota] [HAV#%d] Initializing max_results_day to %d\n",
|
||||
hav.app_version_id,
|
||||
config.daily_result_quota
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) {
|
||||
if (config.debug_quota) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[quota] [HOST#%d] [HAV#%d] Resetting n_jobs_today\n",
|
||||
g_reply->host.id, hav.app_version_id
|
||||
);
|
||||
}
|
||||
hav.n_jobs_today = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void update_n_jobs_today() {
|
||||
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
|
||||
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
|
||||
|
@ -486,144 +509,6 @@ void update_n_jobs_today() {
|
|||
}
|
||||
}
|
||||
|
||||
static void get_reliability_and_trust() {
|
||||
// Platforms other than Windows, Linux and Intel Macs need a
|
||||
// larger set of computers to be marked reliable
|
||||
//
|
||||
double multiplier = 1.0;
|
||||
if (strstr(g_reply->host.os_name,"Windows")
|
||||
|| strstr(g_reply->host.os_name,"Linux")
|
||||
|| (strstr(g_reply->host.os_name,"Darwin")
|
||||
&& !(strstr(g_reply->host.p_vendor,"Power Macintosh"))
|
||||
)) {
|
||||
multiplier = 1.0;
|
||||
} else {
|
||||
multiplier = 1.8;
|
||||
}
|
||||
|
||||
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
|
||||
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
|
||||
get_reliability_version(hav, multiplier);
|
||||
set_trust(hav);
|
||||
}
|
||||
}
|
||||
|
||||
// Return true if the user has set application preferences,
|
||||
// and this job is not for a selected app
|
||||
//
|
||||
bool app_not_selected(WORKUNIT& wu) {
|
||||
unsigned int i;
|
||||
|
||||
if (g_wreq->preferred_apps.size() == 0) return false;
|
||||
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
||||
if (wu.appid == g_wreq->preferred_apps[i].appid) {
|
||||
g_wreq->preferred_apps[i].work_available = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// see how much RAM we can use on this machine
|
||||
//
|
||||
static inline void get_mem_sizes() {
|
||||
g_wreq->ram = g_reply->host.m_nbytes;
|
||||
if (g_wreq->ram <= 0) g_wreq->ram = DEFAULT_RAM_SIZE;
|
||||
g_wreq->usable_ram = g_wreq->ram;
|
||||
double busy_frac = g_request->global_prefs.ram_max_used_busy_frac;
|
||||
double idle_frac = g_request->global_prefs.ram_max_used_idle_frac;
|
||||
double frac = 1;
|
||||
if (busy_frac>0 && idle_frac>0) {
|
||||
frac = std::max(busy_frac, idle_frac);
|
||||
if (frac > 1) frac = 1;
|
||||
g_wreq->usable_ram *= frac;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int check_memory(WORKUNIT& wu) {
|
||||
double diff = wu.rsc_memory_bound - g_wreq->usable_ram;
|
||||
if (diff > 0) {
|
||||
char message[256];
|
||||
sprintf(message,
|
||||
"%s needs %0.2f MB RAM but only %0.2f MB is available for use.",
|
||||
find_user_friendly_name(wu.appid),
|
||||
wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA
|
||||
);
|
||||
add_no_work_message(message);
|
||||
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [WU#%d %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n",
|
||||
wu.id, wu.name, wu.rsc_memory_bound/MEGA,
|
||||
g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA
|
||||
);
|
||||
}
|
||||
g_wreq->mem.set_insufficient(wu.rsc_memory_bound);
|
||||
g_reply->set_delay(DELAY_NO_WORK_TEMP);
|
||||
return INFEASIBLE_MEM;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int check_disk(WORKUNIT& wu) {
|
||||
double diff = wu.rsc_disk_bound - g_wreq->disk_available;
|
||||
if (diff > 0) {
|
||||
char message[256];
|
||||
sprintf(message,
|
||||
"%s needs %0.2fMB more disk space. You currently have %0.2f MB available and it needs %0.2f MB.",
|
||||
find_user_friendly_name(wu.appid),
|
||||
diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA
|
||||
);
|
||||
add_no_work_message(message);
|
||||
|
||||
g_wreq->disk.set_insufficient(diff);
|
||||
return INFEASIBLE_DISK;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int check_bandwidth(WORKUNIT& wu) {
|
||||
if (wu.rsc_bandwidth_bound == 0) return 0;
|
||||
|
||||
// if n_bwdown is zero, the host has never downloaded anything,
|
||||
// so skip this check
|
||||
//
|
||||
if (g_reply->host.n_bwdown == 0) return 0;
|
||||
|
||||
double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown;
|
||||
if (diff > 0) {
|
||||
char message[256];
|
||||
sprintf(message,
|
||||
"%s requires %0.2f KB/sec download bandwidth. Your computer has been measured at %0.2f KB/sec.",
|
||||
find_user_friendly_name(wu.appid),
|
||||
wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO
|
||||
);
|
||||
add_no_work_message(message);
|
||||
|
||||
g_wreq->bandwidth.set_insufficient(diff);
|
||||
return INFEASIBLE_BANDWIDTH;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Determine if the app is "hard",
|
||||
// and we should send it only to high-end hosts.
|
||||
// Currently this is specified by setting weight=-1;
|
||||
// this is a kludge for SETI@home/Astropulse.
|
||||
//
|
||||
static inline bool hard_app(APP& app) {
|
||||
return (app.weight == -1);
|
||||
}
|
||||
|
||||
static inline double get_estimated_delay(BEST_APP_VERSION& bav) {
|
||||
int pt = bav.host_usage.proc_type;
|
||||
if (pt == PROC_TYPE_CPU) {
|
||||
return g_request->cpu_estimated_delay;
|
||||
}
|
||||
COPROC* cp = g_request->coprocs.type_to_coproc(pt);
|
||||
return cp->estimated_delay;
|
||||
}
|
||||
|
||||
static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) {
|
||||
int pt = bav.host_usage.proc_type;
|
||||
if (pt == PROC_TYPE_CPU) {
|
||||
|
@ -634,239 +519,6 @@ static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) {
|
|||
}
|
||||
}
|
||||
|
||||
// return the delay bound to use for this job/host.
|
||||
// Actually, return two: optimistic (lower) and pessimistic (higher).
|
||||
// If the deadline check with the optimistic bound fails,
|
||||
// try the pessimistic bound.
|
||||
// TODO: clean up this mess
|
||||
//
|
||||
static void get_delay_bound_range(
|
||||
WORKUNIT& wu,
|
||||
int res_server_state, int res_priority, double res_report_deadline,
|
||||
BEST_APP_VERSION& bav,
|
||||
double& opt, double& pess
|
||||
) {
|
||||
if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
|
||||
double now = dtime();
|
||||
if (res_report_deadline < now) {
|
||||
// if original deadline has passed, return zeros
|
||||
// This will skip deadline check.
|
||||
opt = pess = 0;
|
||||
return;
|
||||
}
|
||||
opt = res_report_deadline - now;
|
||||
pess = wu.delay_bound;
|
||||
} else {
|
||||
opt = pess = wu.delay_bound;
|
||||
|
||||
// If the workunit needs reliable and is being sent to a reliable host,
|
||||
// then shorten the delay bound by the percent specified
|
||||
//
|
||||
if (config.reliable_on_priority && res_priority >= config.reliable_on_priority && config.reliable_reduced_delay_bound > 0.01
|
||||
) {
|
||||
opt = wu.delay_bound*config.reliable_reduced_delay_bound;
|
||||
double est_wallclock_duration = estimate_duration(wu, bav);
|
||||
|
||||
// Check to see how reasonable this reduced time is.
|
||||
// Increase it to twice the estimated delay bound
|
||||
// if all the following apply:
|
||||
//
|
||||
// 1) Twice the estimate is longer then the reduced delay bound
|
||||
// 2) Twice the estimate is less then the original delay bound
|
||||
// 3) Twice the estimate is less then the twice the reduced delay bound
|
||||
if (est_wallclock_duration*2 > opt
|
||||
&& est_wallclock_duration*2 < wu.delay_bound
|
||||
&& est_wallclock_duration*2 < wu.delay_bound*config.reliable_reduced_delay_bound*2
|
||||
) {
|
||||
opt = est_wallclock_duration*2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// return 0 if the job, with the given delay bound,
|
||||
// will complete by its deadline, and won't cause other jobs to miss deadlines.
|
||||
//
|
||||
static inline int check_deadline(
|
||||
WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav
|
||||
) {
|
||||
if (config.ignore_delay_bound) return 0;
|
||||
|
||||
// skip delay check if host currently doesn't have any work
|
||||
// and it's not a hard app.
|
||||
// (i.e. everyone gets one result, no matter how slow they are)
|
||||
//
|
||||
if (get_estimated_delay(bav) == 0 && !hard_app(app)) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] est delay 0, skipping deadline check\n"
|
||||
);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// if it's a hard app, don't send it to a host with no credit
|
||||
//
|
||||
if (hard_app(app) && g_reply->host.total_credit == 0) {
|
||||
return INFEASIBLE_CPU;
|
||||
}
|
||||
|
||||
// do EDF simulation if possible; else use cruder approximation
|
||||
//
|
||||
if (config.workload_sim && g_request->have_other_results_list) {
|
||||
double est_dur = estimate_duration(wu, bav);
|
||||
if (g_reply->wreq.edf_reject_test(est_dur, wu.delay_bound)) {
|
||||
return INFEASIBLE_WORKLOAD;
|
||||
}
|
||||
IP_RESULT candidate("", wu.delay_bound, est_dur);
|
||||
strcpy(candidate.name, wu.name);
|
||||
if (check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) {
|
||||
// it passed the feasibility test,
|
||||
// but don't add it to the workload yet;
|
||||
// wait until we commit to sending it
|
||||
} else {
|
||||
g_reply->wreq.edf_reject(est_dur, wu.delay_bound);
|
||||
g_reply->wreq.speed.set_insufficient(0);
|
||||
return INFEASIBLE_WORKLOAD;
|
||||
}
|
||||
} else {
|
||||
double ewd = estimate_duration(wu, bav);
|
||||
if (hard_app(app)) ewd *= 1.3;
|
||||
double est_report_delay = get_estimated_delay(bav) + ewd;
|
||||
double diff = est_report_delay - wu.delay_bound;
|
||||
if (diff > 0) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [WU#%d] deadline miss %d > %d\n",
|
||||
wu.id, (int)est_report_delay, wu.delay_bound
|
||||
);
|
||||
}
|
||||
g_reply->wreq.speed.set_insufficient(diff);
|
||||
return INFEASIBLE_CPU;
|
||||
} else {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [WU#%d] meets deadline: %.2f + %.2f < %d\n",
|
||||
wu.id, get_estimated_delay(bav), ewd, wu.delay_bound
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fast checks (no DB access) to see if the job can be sent to the host.
|
||||
// Reasons why not include:
|
||||
// 1) the host doesn't have enough memory;
|
||||
// 2) the host doesn't have enough disk space;
|
||||
// 3) based on CPU speed, resource share and estimated delay,
|
||||
// the host probably won't get the result done within the delay bound
|
||||
// 4) app isn't in user's "approved apps" list
|
||||
//
|
||||
// If the job is feasible, return 0 and fill in wu.delay_bound
|
||||
// with the delay bound we've decided to use.
|
||||
//
|
||||
int wu_is_infeasible_fast(
|
||||
WORKUNIT& wu,
|
||||
int res_server_state, int res_priority, double res_report_deadline,
|
||||
APP& app, BEST_APP_VERSION& bav
|
||||
) {
|
||||
int retval;
|
||||
|
||||
// project-specific check
|
||||
//
|
||||
if (wu_is_infeasible_custom(wu, app, bav)) {
|
||||
return INFEASIBLE_CUSTOM;
|
||||
}
|
||||
|
||||
if (config.user_filter) {
|
||||
if (wu.batch && wu.batch != g_reply->user.id) {
|
||||
return INFEASIBLE_USER_FILTER;
|
||||
}
|
||||
}
|
||||
|
||||
// homogeneous redundancy: can't send if app uses HR and
|
||||
// 1) host is of unknown HR class, or
|
||||
// 2) WU is already committed to different HR class
|
||||
//
|
||||
if (app_hr_type(app)) {
|
||||
if (hr_unknown_class(g_reply->host, app_hr_type(app))) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] host is of unknown class in HR type %d\n",
|
||||
g_reply->host.id, wu.id, wu.name, app_hr_type(app)
|
||||
);
|
||||
}
|
||||
return INFEASIBLE_HR;
|
||||
}
|
||||
if (already_sent_to_different_hr_class(wu, app)) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] failed quick HR check: WU is class %d, host is class %d\n",
|
||||
g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app))
|
||||
);
|
||||
}
|
||||
return INFEASIBLE_HR;
|
||||
}
|
||||
}
|
||||
|
||||
// homogeneous app version
|
||||
//
|
||||
if (app.homogeneous_app_version) {
|
||||
int avid = wu.app_version_id;
|
||||
if (avid && bav.avp->id != avid) {
|
||||
if (config.debug_send) {
|
||||
log_messages.printf(MSG_NORMAL,
|
||||
"[send] [HOST#%d] [WU#%d %s] failed homogeneous app version check: %d %d\n",
|
||||
g_reply->host.id, wu.id, wu.name, avid, bav.avp->id
|
||||
);
|
||||
}
|
||||
return INFEASIBLE_HAV;
|
||||
}
|
||||
}
|
||||
|
||||
if (config.one_result_per_user_per_wu || config.one_result_per_host_per_wu) {
|
||||
if (wu_already_in_reply(wu)) {
|
||||
return INFEASIBLE_DUP;
|
||||
}
|
||||
}
|
||||
|
||||
retval = check_memory(wu);
|
||||
if (retval) return retval;
|
||||
retval = check_disk(wu);
|
||||
if (retval) return retval;
|
||||
retval = check_bandwidth(wu);
|
||||
if (retval) return retval;
|
||||
|
||||
if (app.non_cpu_intensive) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// do deadline check last because EDF sim uses some CPU
|
||||
//
|
||||
double opt, pess;
|
||||
get_delay_bound_range(
|
||||
wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess
|
||||
);
|
||||
wu.delay_bound = (int)opt;
|
||||
if (opt == 0) {
|
||||
// this is a resend; skip deadline check
|
||||
return 0;
|
||||
}
|
||||
retval = check_deadline(wu, app, bav);
|
||||
if (retval && (opt != pess)) {
|
||||
wu.delay_bound = (int)pess;
|
||||
retval = check_deadline(wu, app, bav);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// return true if the client has a sticky file used by this job
|
||||
//
|
||||
bool host_has_job_file(WORKUNIT&) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// insert "text" right after "after" in the given buffer
|
||||
//
|
||||
static int insert_after(char* buffer, const char* after, const char* text) {
|
||||
|
|
|
@ -42,29 +42,6 @@ inline bool is_anonymous(PLATFORM* platform) {
|
|||
|
||||
extern bool app_core_compatible(WORK_REQ& wreq, APP_VERSION& av);
|
||||
|
||||
// values returned by wu_is_infeasible()
|
||||
//
|
||||
#define INFEASIBLE_MEM 1
|
||||
#define INFEASIBLE_DISK 2
|
||||
#define INFEASIBLE_CPU 3
|
||||
#define INFEASIBLE_WORK_BUF 4
|
||||
#define INFEASIBLE_APP_SETTING 5
|
||||
#define INFEASIBLE_WORKLOAD 6
|
||||
#define INFEASIBLE_DUP 7
|
||||
#define INFEASIBLE_HR 8
|
||||
#define INFEASIBLE_BANDWIDTH 9
|
||||
#define INFEASIBLE_CUSTOM 10
|
||||
#define INFEASIBLE_USER_FILTER 11
|
||||
#define INFEASIBLE_HAV 12
|
||||
|
||||
extern int wu_is_infeasible_fast(
|
||||
WORKUNIT&,
|
||||
int res_server_state, int res_priority, double res_report_deadline,
|
||||
APP&, BEST_APP_VERSION&
|
||||
);
|
||||
extern int slow_check(WU_RESULT&, APP*, BEST_APP_VERSION*);
|
||||
extern bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu);
|
||||
|
||||
extern double max_allowable_disk();
|
||||
|
||||
extern bool wu_already_in_reply(WORKUNIT& wu);
|
||||
|
@ -75,7 +52,7 @@ extern int update_wu_on_send(WORKUNIT wu, time_t x, APP&, BEST_APP_VERSION&);
|
|||
|
||||
extern void lock_sema();
|
||||
extern void unlock_sema();
|
||||
extern const char* infeasible_string(int);
|
||||
extern const char* find_user_friendly_name(int appid);
|
||||
extern bool app_not_selected(WORKUNIT&);
|
||||
extern bool work_needed(bool);
|
||||
extern void send_work_setup();
|
||||
|
|
Loading…
Reference in New Issue