2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2005-08-04 03:50:04 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2008-08-06 18:36:30 +00:00
|
|
|
// Copyright (C) 2008 University of California
|
2005-08-04 03:50:04 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2005-08-04 03:50:04 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2005-08-04 03:50:04 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2005-08-04 03:50:04 +00:00
|
|
|
|
|
|
|
// scheduler code related to sending work
|
|
|
|
|
2008-02-27 23:26:38 +00:00
|
|
|
#include <cstdlib>
|
|
|
|
#include <string>
|
|
|
|
#include <cstring>
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2005-11-21 18:34:44 +00:00
|
|
|
#include "config.h"
|
2010-11-08 17:51:57 +00:00
|
|
|
|
2009-08-10 04:49:02 +00:00
|
|
|
#include "sched_main.h"
|
|
|
|
#include "sched_types.h"
|
2005-08-04 03:50:04 +00:00
|
|
|
#include "sched_shmem.h"
|
|
|
|
#include "sched_hr.h"
|
|
|
|
#include "sched_config.h"
|
|
|
|
#include "sched_util.h"
|
|
|
|
#include "sched_msgs.h"
|
|
|
|
#include "sched_send.h"
|
2009-03-19 16:35:35 +00:00
|
|
|
#include "sched_version.h"
|
2010-03-03 19:29:23 +00:00
|
|
|
#ifdef _USING_FCGI_
|
|
|
|
#include "boinc_fcgi.h"
|
|
|
|
#endif
|
2005-08-04 03:58:00 +00:00
|
|
|
|
|
|
|
#include "sched_array.h"
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2010-03-03 19:29:23 +00:00
|
|
|
// do fast checks on this job, i.e. ones that don't require DB access
|
|
|
|
// if any check fails, return false
|
|
|
|
//
|
|
|
|
static bool quick_check(
|
2011-10-26 16:51:10 +00:00
|
|
|
WU_RESULT& wu_result,
|
|
|
|
WORKUNIT& wu, // a mutable copy of wu_result.workunit.
|
|
|
|
// We may modify its delay_bound and rsc_fpops_est
|
|
|
|
BEST_APP_VERSION* &bavp,
|
2012-09-01 04:58:12 +00:00
|
|
|
APP* app,
|
|
|
|
int& last_retval
|
2010-03-03 19:29:23 +00:00
|
|
|
) {
|
|
|
|
int retval;
|
|
|
|
|
2010-06-16 22:07:19 +00:00
|
|
|
g_wreq->no_jobs_available = false;
|
|
|
|
|
2010-03-04 04:16:00 +00:00
|
|
|
// If we're looking for beta jobs and this isn't one, skip it
|
|
|
|
//
|
2010-03-03 19:29:23 +00:00
|
|
|
if (g_wreq->beta_only) {
|
|
|
|
if (!app->beta) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[array_detail] job is not from beta app; skipping\n"
|
|
|
|
);
|
2012-05-18 18:13:04 +00:00
|
|
|
}
|
2010-03-03 19:29:23 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [HOST#%d] beta work found: [RESULT#%d]\n",
|
|
|
|
g_reply->host.id, wu_result.resultid
|
|
|
|
);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (app->beta) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[array_detail] job is from beta app; skipping\n"
|
|
|
|
);
|
2012-05-18 18:13:04 +00:00
|
|
|
}
|
2010-03-03 19:29:23 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2011-03-04 19:40:59 +00:00
|
|
|
|
2012-08-31 06:11:06 +00:00
|
|
|
// Are we scanning for need_reliable results?
|
|
|
|
// skip this check the app is beta
|
|
|
|
// (beta apps don't use the reliable mechanism)
|
2010-03-03 19:29:23 +00:00
|
|
|
//
|
|
|
|
if (!app->beta) {
|
|
|
|
if (g_wreq->reliable_only && (!wu_result.need_reliable)) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[array_detail] job doesn't need reliable host; skipping\n"
|
|
|
|
);
|
2012-05-18 18:13:04 +00:00
|
|
|
}
|
2010-03-03 19:29:23 +00:00
|
|
|
return false;
|
|
|
|
} else if (!g_wreq->reliable_only && wu_result.need_reliable) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[array_detail] job needs reliable host; skipping\n"
|
|
|
|
);
|
2012-05-18 18:13:04 +00:00
|
|
|
}
|
2010-03-03 19:29:23 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2011-03-04 19:40:59 +00:00
|
|
|
|
2010-03-03 19:29:23 +00:00
|
|
|
// don't send if we are looking for infeasible results
|
|
|
|
// and the result is not infeasible
|
|
|
|
//
|
|
|
|
if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[array_detail] job is not infeasible; skipping\n"
|
|
|
|
);
|
2012-05-18 18:13:04 +00:00
|
|
|
}
|
2010-03-03 19:29:23 +00:00
|
|
|
return false;
|
|
|
|
}
|
2011-03-04 19:40:59 +00:00
|
|
|
|
2012-08-27 17:00:43 +00:00
|
|
|
// locality sched lite check.
|
2012-09-13 17:38:55 +00:00
|
|
|
// Allow non-LSL jobs; otherwise we could starve them
|
|
|
|
// NOTE: THIS NEGATES THE OTHER SCHED POLICIES (reliable, etc.).
|
|
|
|
// Need to think of some way of combining them.
|
2012-08-27 17:00:43 +00:00
|
|
|
//
|
|
|
|
if (g_wreq->locality_sched_lite) {
|
2012-09-13 17:38:55 +00:00
|
|
|
// skip this job if host has sticky files
|
|
|
|
// but none of them is used by this job.
|
|
|
|
// TODO: it should really be "host has sticky files for this app".
|
|
|
|
// However, we don't have a way of making that association.
|
|
|
|
// Could add something based on filename
|
|
|
|
//
|
|
|
|
if (app->locality_scheduling == LOCALITY_SCHED_LITE
|
|
|
|
&& g_request->file_infos.size()
|
|
|
|
) {
|
2012-08-29 03:09:10 +00:00
|
|
|
int n = nfiles_on_host(wu_result.workunit);
|
2012-11-07 19:13:46 +00:00
|
|
|
if (config.debug_locality_lite) {
|
2012-08-29 03:09:10 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 19:13:46 +00:00
|
|
|
"[loc_lite] job %s has %d files on this host\n",
|
2012-08-29 03:09:10 +00:00
|
|
|
wu_result.workunit.name, n
|
|
|
|
);
|
|
|
|
}
|
|
|
|
if (n == 0) {
|
2012-08-27 17:00:43 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-04 19:40:59 +00:00
|
|
|
// Find the app and best app_version for this host.
|
|
|
|
//
|
|
|
|
bavp = get_app_version(wu, true, g_wreq->reliable_only);
|
|
|
|
if (!bavp) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
2011-03-04 19:40:59 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-08 07:43:43 +00:00
|
|
|
"[array_detail] No app version for job; skipping\n"
|
2011-03-04 19:40:59 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check app filter if needed.
|
|
|
|
// Do this AFTER get_app_version(), otherwise we could send
|
|
|
|
// a misleading message to user
|
2010-03-03 19:29:23 +00:00
|
|
|
//
|
|
|
|
if (g_wreq->user_apps_only &&
|
|
|
|
(!g_wreq->beta_only || config.distinct_beta_apps)
|
|
|
|
) {
|
|
|
|
if (app_not_selected(wu)) {
|
|
|
|
g_wreq->no_allowed_apps_available = true;
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
2010-03-03 19:29:23 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-08 07:43:43 +00:00
|
|
|
"[array_detail] [USER#%d] [WU#%d] user doesn't want work for app %s\n",
|
2010-03-03 19:29:23 +00:00
|
|
|
g_reply->user.id, wu.id, app->name
|
|
|
|
);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-26 16:51:10 +00:00
|
|
|
// Check whether we can send this job.
|
|
|
|
// This may modify wu.delay_bound and wu.rsc_fpops_est
|
2010-03-03 19:29:23 +00:00
|
|
|
//
|
|
|
|
retval = wu_is_infeasible_fast(
|
|
|
|
wu,
|
|
|
|
wu_result.res_server_state, wu_result.res_priority,
|
|
|
|
wu_result.res_report_deadline,
|
|
|
|
*app, *bavp
|
|
|
|
);
|
|
|
|
if (retval) {
|
|
|
|
if (retval != last_retval && config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
|
|
|
|
g_reply->host.id, wu.id, wu.name, infeasible_string(retval)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
last_retval = retval;
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[array_detail] is_infeasible_fast() failed; skipping\n"
|
|
|
|
);
|
2010-03-03 19:29:23 +00:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-10-26 16:51:10 +00:00
|
|
|
// Do checks that require DB access for whether we can send this job,
|
2012-06-29 06:53:48 +00:00
|
|
|
// and return:
|
|
|
|
// 0 if OK to send
|
|
|
|
// 1 if can't send to this host
|
|
|
|
// 2 if can't send to ANY host
|
2010-03-03 19:29:23 +00:00
|
|
|
//
|
2012-06-29 06:53:48 +00:00
|
|
|
static int slow_check(
|
2011-10-26 16:51:10 +00:00
|
|
|
WU_RESULT& wu_result, // the job cache entry.
|
|
|
|
// We may refresh its hr_class and app_version_id fields.
|
|
|
|
APP* app,
|
|
|
|
BEST_APP_VERSION* bavp // the app version to be used
|
2011-10-26 07:15:22 +00:00
|
|
|
) {
|
2010-03-03 19:29:23 +00:00
|
|
|
int n, retval;
|
|
|
|
DB_RESULT result;
|
|
|
|
char buf[256];
|
2011-10-26 16:51:10 +00:00
|
|
|
WORKUNIT& wu = wu_result.workunit;
|
2010-03-03 19:29:23 +00:00
|
|
|
|
|
|
|
// Don't send if we've already sent a result of this WU to this user.
|
|
|
|
//
|
|
|
|
if (config.one_result_per_user_per_wu) {
|
|
|
|
sprintf(buf,
|
2011-10-26 16:51:10 +00:00
|
|
|
"where workunitid=%d and userid=%d", wu.id, g_reply->user.id
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
|
|
|
retval = result.count(n, buf);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"send_work: can't get result count (%s)\n", boincerror(retval)
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2010-03-03 19:29:23 +00:00
|
|
|
} else {
|
|
|
|
if (n>0) {
|
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [USER#%d] already has %d result(s) for [WU#%d]\n",
|
2011-10-26 16:51:10 +00:00
|
|
|
g_reply->user.id, n, wu.id
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
|
|
|
}
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2010-03-03 19:29:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (config.one_result_per_host_per_wu) {
|
2011-10-26 16:51:10 +00:00
|
|
|
// Don't send if we've already sent a result of this WU to this host.
|
|
|
|
// We only have to check this if we don't send one result per user.
|
2010-03-03 19:29:23 +00:00
|
|
|
//
|
|
|
|
sprintf(buf,
|
2011-10-26 16:51:10 +00:00
|
|
|
"where workunitid=%d and hostid=%d", wu.id, g_reply->host.id
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
|
|
|
retval = result.count(n, buf);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"send_work: can't get result count (%s)\n", boincerror(retval)
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2010-03-03 19:29:23 +00:00
|
|
|
} else {
|
|
|
|
if (n>0) {
|
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [HOST#%d] already has %d result(s) for [WU#%d]\n",
|
2011-10-26 16:51:10 +00:00
|
|
|
g_reply->host.id, n, wu.id
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
|
|
|
}
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2010-03-03 19:29:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-26 07:15:22 +00:00
|
|
|
// Checks that require looking up the WU.
|
|
|
|
// Lump these together so we only do 1 lookup
|
|
|
|
//
|
|
|
|
if (app_hr_type(*app) || app->homogeneous_app_version) {
|
|
|
|
DB_WORKUNIT db_wu;
|
2011-10-26 16:51:10 +00:00
|
|
|
db_wu.id = wu.id;
|
2012-06-29 06:53:48 +00:00
|
|
|
int vals[3];
|
|
|
|
retval = db_wu.get_field_ints(
|
|
|
|
"hr_class, app_version_id, error_mask", 3, vals
|
|
|
|
);
|
2011-10-26 07:15:22 +00:00
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"can't get fields for [WU#%d]: %s\n", db_wu.id, boincerror(retval)
|
|
|
|
);
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2010-03-03 19:29:23 +00:00
|
|
|
}
|
2012-06-29 06:53:48 +00:00
|
|
|
|
|
|
|
// check wu.error_mask
|
|
|
|
//
|
|
|
|
if (vals[2] != 0) {
|
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
|
2011-10-26 07:15:22 +00:00
|
|
|
if (app_hr_type(*app)) {
|
2011-10-26 16:51:10 +00:00
|
|
|
wu.hr_class = vals[0];
|
|
|
|
if (already_sent_to_different_hr_class(wu, *app)) {
|
2011-10-26 07:15:22 +00:00
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [HOST#%d] [WU#%d %s] is assigned to different HR class\n",
|
|
|
|
g_reply->host.id, wu.id, wu.name
|
|
|
|
);
|
|
|
|
}
|
|
|
|
// Mark the workunit as infeasible.
|
|
|
|
// This ensures that jobs already assigned to an HR class
|
|
|
|
// are processed first.
|
|
|
|
//
|
|
|
|
wu_result.infeasible_count++;
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2011-10-26 07:15:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (app->homogeneous_app_version) {
|
|
|
|
int wu_avid = vals[1];
|
2011-10-26 16:51:10 +00:00
|
|
|
wu.app_version_id = wu_avid;
|
2011-10-26 07:15:22 +00:00
|
|
|
if (wu_avid && wu_avid != bavp->avp->id) {
|
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [HOST#%d] [WU#%d %s] is assigned to different app version\n",
|
|
|
|
g_reply->host.id, wu.id, wu.name
|
|
|
|
);
|
|
|
|
}
|
|
|
|
wu_result.infeasible_count++;
|
2012-06-29 06:53:48 +00:00
|
|
|
return 1;
|
2011-10-26 07:15:22 +00:00
|
|
|
}
|
|
|
|
}
|
2010-03-03 19:29:23 +00:00
|
|
|
}
|
2012-06-29 06:53:48 +00:00
|
|
|
return 0;
|
2010-03-03 19:29:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Check for pathological conditions that mean
|
|
|
|
// result is not sendable at all.
|
|
|
|
//
|
|
|
|
static bool result_still_sendable(DB_RESULT& result, WORKUNIT& wu) {
|
|
|
|
int retval = result.lookup_id(result.id);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"[RESULT#%d] result.lookup_id() failed: %s\n",
|
|
|
|
result.id, boincerror(retval)
|
2010-03-03 19:29:23 +00:00
|
|
|
);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[RESULT#%d] expected to be unsent; instead, state is %d\n",
|
|
|
|
result.id, result.server_state
|
|
|
|
);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (result.workunitid != wu.id) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"[RESULT#%d] wrong WU ID: wanted %d, got %d\n",
|
|
|
|
result.id, wu.id, result.workunitid
|
|
|
|
);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2005-08-04 03:50:04 +00:00
|
|
|
|
|
|
|
// Make a pass through the wu/results array, sending work.
|
2008-12-22 00:10:02 +00:00
|
|
|
// The choice of jobs is limited by flags in g_wreq, as follows:
|
|
|
|
// infeasible_only:
|
|
|
|
// send only results that were previously infeasible for some host
|
2011-03-04 19:40:59 +00:00
|
|
|
// reliable_only:
|
2012-08-31 06:11:06 +00:00
|
|
|
// send only jobs with "need_reliable" set (e.g. retries)
|
|
|
|
// and send them only w/ app versions that are "reliable" for this host
|
2008-12-22 00:10:02 +00:00
|
|
|
// user_apps_only:
|
|
|
|
// Send only jobs for apps selected by user
|
|
|
|
// beta_only:
|
|
|
|
// Send only jobs for beta-test apps
|
2012-08-31 06:11:06 +00:00
|
|
|
// locality_sched_lite:
|
|
|
|
// For apps that use locality sched Lite,
|
|
|
|
// send only jobs for which the host already has at least 1 file
|
2008-12-22 00:10:02 +00:00
|
|
|
//
|
2009-06-01 22:15:14 +00:00
|
|
|
// Return true if no more work is needed.
|
2005-08-04 03:50:04 +00:00
|
|
|
//
|
2009-06-01 22:15:14 +00:00
|
|
|
static bool scan_work_array() {
|
2012-02-22 22:13:08 +00:00
|
|
|
int i, j, rnd_off, last_retval=0;;
|
2005-08-04 03:50:04 +00:00
|
|
|
APP* app;
|
2010-03-03 19:29:23 +00:00
|
|
|
BEST_APP_VERSION* bavp;
|
2009-06-01 22:15:14 +00:00
|
|
|
bool no_more_needed = false;
|
2011-06-06 03:40:42 +00:00
|
|
|
SCHED_DB_RESULT result;
|
2009-01-23 22:52:35 +00:00
|
|
|
|
2005-08-04 03:50:04 +00:00
|
|
|
lock_sema();
|
2011-03-04 19:40:59 +00:00
|
|
|
|
2008-02-22 22:21:00 +00:00
|
|
|
rnd_off = rand() % ssp->max_wu_results;
|
|
|
|
for (j=0; j<ssp->max_wu_results; j++) {
|
|
|
|
i = (j+rnd_off) % ssp->max_wu_results;
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2008-02-22 22:21:00 +00:00
|
|
|
WU_RESULT& wu_result = ssp->wu_results[i];
|
2011-10-26 16:51:10 +00:00
|
|
|
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
2012-05-18 18:13:04 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-08 07:43:43 +00:00
|
|
|
"[array_detail] scanning slot %d\n", i
|
2012-05-18 18:13:04 +00:00
|
|
|
);
|
|
|
|
}
|
2012-08-29 18:08:15 +00:00
|
|
|
|
2012-09-01 04:58:12 +00:00
|
|
|
if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-10-26 16:51:10 +00:00
|
|
|
// make a copy of the WORKUNIT part,
|
|
|
|
// which we can modify without affecting the cache
|
|
|
|
//
|
2010-03-03 19:29:23 +00:00
|
|
|
WORKUNIT wu = wu_result.workunit;
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2012-09-01 04:58:12 +00:00
|
|
|
app = ssp->lookup_app(wu_result.workunit.appid);
|
|
|
|
if (app == NULL) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"[WU#%d] no app\n",
|
|
|
|
wu_result.workunit.id
|
|
|
|
);
|
|
|
|
continue; // this should never happen
|
2012-08-29 18:08:15 +00:00
|
|
|
}
|
|
|
|
|
2012-09-01 04:58:12 +00:00
|
|
|
if (app->non_cpu_intensive) continue;
|
|
|
|
|
2011-10-26 16:51:10 +00:00
|
|
|
// do fast (non-DB) checks.
|
|
|
|
// This may modify wu.rsc_fpops_est
|
2005-08-04 03:50:04 +00:00
|
|
|
//
|
2010-03-03 19:29:23 +00:00
|
|
|
if (!quick_check(wu_result, wu, bavp, app, last_retval)) {
|
2012-11-08 07:43:43 +00:00
|
|
|
if (config.debug_array_detail) {
|
2012-05-18 18:13:04 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-08 07:43:43 +00:00
|
|
|
"[array_detail] slot %d failed quick check\n", i
|
2012-05-18 18:13:04 +00:00
|
|
|
);
|
|
|
|
}
|
2005-08-04 03:50:04 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2007-06-14 18:02:00 +00:00
|
|
|
// mark wu_result as checked out and release semaphore.
|
2005-08-04 03:50:04 +00:00
|
|
|
// from here on in this loop, don't continue on failure;
|
|
|
|
// instead, goto dont_send (so that we reacquire semaphore)
|
2007-06-14 18:02:00 +00:00
|
|
|
//
|
|
|
|
// Note: without the semaphore we don't have mutual exclusion;
|
|
|
|
// ideally we should use a transaction from now until when
|
|
|
|
// we commit to sending the results.
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2006-01-19 21:46:25 +00:00
|
|
|
wu_result.state = g_pid;
|
2005-08-04 03:50:04 +00:00
|
|
|
unlock_sema();
|
|
|
|
|
2012-06-29 06:53:48 +00:00
|
|
|
switch (slow_check(wu_result, app, bavp)) {
|
|
|
|
case 1:
|
2010-03-03 19:29:23 +00:00
|
|
|
// if we couldn't send the result to this host,
|
|
|
|
// set its state back to PRESENT
|
2007-03-11 03:15:17 +00:00
|
|
|
//
|
2010-03-03 19:29:23 +00:00
|
|
|
wu_result.state = WR_STATE_PRESENT;
|
2012-06-29 06:53:48 +00:00
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
// can't send this job to any host
|
|
|
|
//
|
|
|
|
wu_result.state = WR_STATE_EMPTY;
|
|
|
|
break;
|
|
|
|
default:
|
2011-10-26 16:51:10 +00:00
|
|
|
// slow_check() refreshes fields of wu_result.workunit;
|
|
|
|
// update our copy too
|
|
|
|
//
|
|
|
|
wu.hr_class = wu_result.workunit.hr_class;
|
|
|
|
wu.app_version_id = wu_result.workunit.app_version_id;
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2010-03-03 19:29:23 +00:00
|
|
|
// mark slot as empty AFTER we've copied out of it
|
|
|
|
// (since otherwise feeder might overwrite it)
|
|
|
|
//
|
|
|
|
wu_result.state = WR_STATE_EMPTY;
|
2005-08-04 03:50:04 +00:00
|
|
|
|
2010-03-03 19:29:23 +00:00
|
|
|
// reread result from DB, make sure it's still unsent
|
|
|
|
// TODO: from here to end of add_result_to_reply()
|
|
|
|
// (which updates the DB record) should be a transaction
|
|
|
|
//
|
2011-10-26 16:51:10 +00:00
|
|
|
result.id = wu_result.resultid;
|
2010-03-03 19:29:23 +00:00
|
|
|
if (result_still_sendable(result, wu)) {
|
2012-02-22 22:13:08 +00:00
|
|
|
add_result_to_reply(result, wu, bavp, false);
|
2010-03-03 19:29:23 +00:00
|
|
|
|
|
|
|
// add_result_to_reply() fails only in pathological cases -
|
|
|
|
// e.g. we couldn't update the DB record or modify XML fields.
|
|
|
|
// If this happens, don't replace the record in the array
|
|
|
|
// (we can't anyway, since we marked the entry as "empty").
|
|
|
|
// The feeder will eventually pick it up again,
|
|
|
|
// and hopefully the problem won't happen twice.
|
|
|
|
}
|
2012-06-29 06:53:48 +00:00
|
|
|
break;
|
2005-08-04 03:50:04 +00:00
|
|
|
}
|
|
|
|
lock_sema();
|
2009-06-01 22:15:14 +00:00
|
|
|
if (!work_needed(false)) {
|
|
|
|
no_more_needed = true;
|
|
|
|
break;
|
|
|
|
}
|
2005-08-04 03:50:04 +00:00
|
|
|
}
|
|
|
|
unlock_sema();
|
2009-06-01 22:15:14 +00:00
|
|
|
return no_more_needed;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Send work by scanning the job array multiple times,
|
|
|
|
// with different selection criteria on each scan.
|
|
|
|
// This has been superceded by send_work_matchmaker()
|
|
|
|
//
|
|
|
|
void send_work_old() {
|
|
|
|
g_wreq->beta_only = false;
|
|
|
|
g_wreq->user_apps_only = true;
|
|
|
|
g_wreq->infeasible_only = false;
|
|
|
|
|
|
|
|
// give top priority to results that require a 'reliable host'
|
|
|
|
//
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
if (g_wreq->has_reliable_version) {
|
2009-06-01 22:15:14 +00:00
|
|
|
g_wreq->reliable_only = true;
|
2012-11-07 21:50:41 +00:00
|
|
|
if (config.debug_array) {
|
2012-08-29 03:09:10 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 21:50:41 +00:00
|
|
|
"[array] scanning for jobs that need reliable host\n"
|
2012-08-29 03:09:10 +00:00
|
|
|
);
|
|
|
|
}
|
2009-06-01 22:15:14 +00:00
|
|
|
if (scan_work_array()) return;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
g_wreq->reliable_only = false;
|
|
|
|
g_wreq->best_app_versions.clear();
|
2012-11-07 21:50:41 +00:00
|
|
|
} else {
|
|
|
|
if (config.debug_array) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-08 07:43:43 +00:00
|
|
|
"[array] host has no reliable app versions; skipping scan\n"
|
2012-11-07 21:50:41 +00:00
|
|
|
);
|
|
|
|
}
|
2009-06-01 22:15:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// give 2nd priority to results for a beta app
|
|
|
|
// (projects should load beta work with care,
|
|
|
|
// otherwise your users won't get production work done!
|
|
|
|
//
|
|
|
|
if (g_wreq->allow_beta_work) {
|
|
|
|
g_wreq->beta_only = true;
|
2012-11-07 21:50:41 +00:00
|
|
|
if (config.debug_array) {
|
2009-06-01 22:15:14 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 21:50:41 +00:00
|
|
|
"[array] host will accept beta jobs. Scanning for them.\n"
|
2009-06-01 22:15:14 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
if (scan_work_array()) return;
|
2012-08-27 17:00:43 +00:00
|
|
|
g_wreq->beta_only = false;
|
2009-06-01 22:15:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// give next priority to results that were infeasible for some other host
|
|
|
|
//
|
|
|
|
g_wreq->infeasible_only = true;
|
2012-11-07 21:50:41 +00:00
|
|
|
if (config.debug_array) {
|
2012-08-29 03:09:10 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 21:50:41 +00:00
|
|
|
"[array] Scanning for jobs that were infeasible for another host.\n"
|
2012-08-29 03:09:10 +00:00
|
|
|
);
|
|
|
|
}
|
2012-08-27 17:00:43 +00:00
|
|
|
if (scan_work_array()) return;
|
|
|
|
g_wreq->infeasible_only = false;
|
2009-06-01 22:15:14 +00:00
|
|
|
|
2012-08-27 17:00:43 +00:00
|
|
|
// if some app uses locality sched lite,
|
|
|
|
// make a pass accepting only jobs for which the client has a file
|
|
|
|
//
|
2012-08-29 03:09:10 +00:00
|
|
|
if (ssp->locality_sched_lite) {
|
2012-11-07 21:50:41 +00:00
|
|
|
if (config.debug_array) {
|
2012-08-29 03:09:10 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 21:50:41 +00:00
|
|
|
"[array] Scanning for locality sched Lite jobs.\n"
|
2012-08-29 03:09:10 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
g_wreq->locality_sched_lite = true;
|
|
|
|
if (scan_work_array()) return;
|
|
|
|
g_wreq->locality_sched_lite = false;
|
|
|
|
}
|
2012-08-27 17:00:43 +00:00
|
|
|
|
|
|
|
// end of high-priority cases. Now do general scan.
|
2010-03-03 19:29:23 +00:00
|
|
|
//
|
2012-11-07 21:50:41 +00:00
|
|
|
if (config.debug_array) {
|
2012-08-29 03:09:10 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 21:50:41 +00:00
|
|
|
"[array] Scanning: general case.\n"
|
2012-08-29 03:09:10 +00:00
|
|
|
);
|
|
|
|
}
|
2009-06-01 22:15:14 +00:00
|
|
|
if (scan_work_array()) return;
|
|
|
|
|
|
|
|
// If user has selected apps but will accept any,
|
|
|
|
// and we haven't found any jobs for selected apps, try others
|
|
|
|
//
|
|
|
|
if (!g_wreq->njobs_sent && g_wreq->allow_non_preferred_apps ) {
|
|
|
|
g_wreq->user_apps_only = false;
|
|
|
|
preferred_app_message_index = g_wreq->no_work_messages.size();
|
2012-11-07 21:50:41 +00:00
|
|
|
if (config.debug_array) {
|
2009-06-01 22:15:14 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-11-07 21:50:41 +00:00
|
|
|
"[array] scanning for jobs from non-preferred applications\n"
|
2009-06-01 22:15:14 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
scan_work_array();
|
|
|
|
}
|
2005-08-04 03:50:04 +00:00
|
|
|
}
|
|
|
|
|
2012-09-01 04:58:12 +00:00
|
|
|
|
|
|
|
// try to send a job for the given app
|
|
|
|
//
|
|
|
|
int send_job_for_app(APP& app) {
|
|
|
|
int retval = 0;
|
|
|
|
BEST_APP_VERSION* bavp;
|
|
|
|
SCHED_DB_RESULT result;
|
|
|
|
|
|
|
|
lock_sema();
|
|
|
|
for (int i=0; i<ssp->max_wu_results; i++) {
|
|
|
|
WU_RESULT& wu_result = ssp->wu_results[i];
|
|
|
|
if (wu_result.state != WR_STATE_PRESENT) continue;
|
|
|
|
WORKUNIT wu = wu_result.workunit;
|
|
|
|
if (wu.appid != app.id) continue;
|
|
|
|
if (!quick_check(wu_result, wu, bavp, &app, retval)) {
|
2012-09-08 04:05:50 +00:00
|
|
|
// All jobs for a given NCI app are identical.
|
|
|
|
// If we can't send one, we can't send any.
|
|
|
|
//
|
2012-09-01 04:58:12 +00:00
|
|
|
unlock_sema();
|
2012-09-08 04:05:50 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"quick_check() failed for NCI job\n"
|
|
|
|
);
|
2012-09-01 04:58:12 +00:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
wu_result.state = g_pid;
|
|
|
|
unlock_sema();
|
|
|
|
result.id = wu_result.resultid;
|
2012-09-06 23:44:03 +00:00
|
|
|
wu_result.state = WR_STATE_EMPTY;
|
2012-09-01 04:58:12 +00:00
|
|
|
if (result_still_sendable(result, wu)) {
|
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"Sending non-CPU-intensive job: %s\n", wu.name
|
|
|
|
);
|
|
|
|
}
|
|
|
|
add_result_to_reply(result, wu, bavp, false);
|
2012-09-06 23:44:03 +00:00
|
|
|
return 0;
|
2012-09-01 04:58:12 +00:00
|
|
|
}
|
2012-09-08 04:05:50 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"NCI job was not still sendable\n"
|
|
|
|
);
|
2012-09-01 04:58:12 +00:00
|
|
|
lock_sema();
|
|
|
|
}
|
2012-09-08 04:05:50 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"no sendable NCI jobs for %s\n", app.user_friendly_name
|
|
|
|
);
|
2012-09-01 04:58:12 +00:00
|
|
|
unlock_sema();
|
2012-09-06 23:44:03 +00:00
|
|
|
return 1;
|
2012-09-01 04:58:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// try to send jobs for non-CPU-intensive (NCI) apps
|
|
|
|
// for which the host doesn't have a job in progress
|
|
|
|
//
|
|
|
|
int send_nci() {
|
|
|
|
int retval;
|
|
|
|
vector<APP> nci_apps;
|
|
|
|
char buf[1024];
|
|
|
|
|
|
|
|
// make a vector of NCI apps
|
|
|
|
//
|
|
|
|
for (int i=0; i<ssp->napps; i++) {
|
|
|
|
APP& app = ssp->apps[i];
|
|
|
|
if (!app.non_cpu_intensive) continue;
|
|
|
|
app.have_job = false;
|
|
|
|
nci_apps.push_back(app);
|
|
|
|
}
|
|
|
|
|
|
|
|
// scan through the list of in-progress jobs,
|
|
|
|
// flagging the associated apps as having jobs
|
|
|
|
//
|
|
|
|
for (unsigned int i=0; i<g_request->other_results.size(); i++) {
|
|
|
|
DB_RESULT r;
|
|
|
|
OTHER_RESULT &ores = g_request->other_results[i];
|
|
|
|
sprintf(buf, "where name='%s'", ores.name);
|
|
|
|
retval = r.lookup(buf);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_NORMAL, "No such result: %s\n", ores.name);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
APP* app = ssp->lookup_app(r.appid);
|
|
|
|
app->have_job = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// For each NCI app w/o a job, try to send one
|
|
|
|
//
|
|
|
|
for (unsigned int i=0; i<nci_apps.size(); i++) {
|
|
|
|
APP& app = nci_apps[i];
|
|
|
|
if (app.have_job) continue;
|
2012-09-06 23:44:03 +00:00
|
|
|
retval = send_job_for_app(app);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"failed to send job for NCI app %s\n", app.user_friendly_name
|
|
|
|
);
|
|
|
|
}
|
2012-09-01 04:58:12 +00:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-09-13 09:01:56 +00:00
|
|
|
// Version-control identification string for this file.
// NOTE(review): "$Id$" looks like an unexpanded RCS/SVN keyword -
// confirm whether keyword expansion is still in use.
const char *BOINC_RCSID_d9f764fd14="$Id$";
|