2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2005-01-20 23:22:22 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2008-08-06 18:36:30 +00:00
|
|
|
// Copyright (C) 2008 University of California
|
2004-07-09 21:17:42 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2004-07-09 21:17:42 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2005-01-20 23:22:22 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
2004-04-04 01:59:47 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2004-04-04 01:59:47 +00:00
|
|
|
|
2009-06-01 22:15:14 +00:00
|
|
|
// scheduler code related to sending jobs.
|
|
|
|
// NOTE: there should be nothing here specific to particular
|
|
|
|
// scheduling policies (array scan, matchmaking, locality)
|
2005-02-10 06:21:58 +00:00
|
|
|
|
2005-11-21 18:34:44 +00:00
|
|
|
#include "config.h"
|
2004-04-04 01:59:47 +00:00
|
|
|
#include <vector>
|
2008-02-25 18:05:04 +00:00
|
|
|
#include <list>
|
2004-04-04 01:59:47 +00:00
|
|
|
#include <string>
|
2004-06-24 08:49:37 +00:00
|
|
|
#include <ctime>
|
|
|
|
#include <cstdio>
|
2008-02-27 23:26:38 +00:00
|
|
|
#include <cstring>
|
2005-02-02 18:13:00 +00:00
|
|
|
#include <stdlib.h>
|
2009-03-03 00:12:55 +00:00
|
|
|
#include <sys/time.h>
|
2005-02-10 06:21:58 +00:00
|
|
|
#include <unistd.h>
|
|
|
|
|
2004-04-04 01:59:47 +00:00
|
|
|
#include "error_numbers.h"
|
2005-02-10 20:31:11 +00:00
|
|
|
#include "parse.h"
|
2006-07-11 21:56:53 +00:00
|
|
|
#include "util.h"
|
2007-02-21 16:26:51 +00:00
|
|
|
#include "str_util.h"
|
2009-01-13 23:06:02 +00:00
|
|
|
#include "synch.h"
|
2005-02-10 20:31:11 +00:00
|
|
|
|
2010-03-29 22:28:20 +00:00
|
|
|
#include "credit.h"
|
2012-08-26 21:24:18 +00:00
|
|
|
#include "hr.h"
|
2005-08-04 03:50:04 +00:00
|
|
|
#include "sched_array.h"
|
2012-08-26 21:24:18 +00:00
|
|
|
#include "sched_assign.h"
|
|
|
|
#include "sched_config.h"
|
|
|
|
#include "sched_customize.h"
|
2007-05-31 18:14:45 +00:00
|
|
|
#include "sched_hr.h"
|
2004-09-10 21:02:11 +00:00
|
|
|
#include "sched_locality.h"
|
2012-08-26 21:24:18 +00:00
|
|
|
#include "sched_main.h"
|
|
|
|
#include "sched_msgs.h"
|
|
|
|
#include "sched_shmem.h"
|
|
|
|
#include "sched_score.h"
|
2005-02-10 20:31:11 +00:00
|
|
|
#include "sched_timezone.h"
|
2012-08-26 21:24:18 +00:00
|
|
|
#include "sched_types.h"
|
|
|
|
#include "sched_util.h"
|
2010-04-08 23:14:47 +00:00
|
|
|
#include "sched_version.h"
|
2004-04-04 01:59:47 +00:00
|
|
|
|
2007-05-31 18:14:45 +00:00
|
|
|
#include "sched_send.h"
|
2005-02-10 06:21:58 +00:00
|
|
|
|
2004-07-03 21:38:22 +00:00
|
|
|
#ifdef _USING_FCGI_
|
2008-09-09 19:10:42 +00:00
|
|
|
#include "boinc_fcgi.h"
|
2004-07-03 21:38:22 +00:00
|
|
|
#endif
|
|
|
|
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
// if host sends us an impossible RAM size, use this instead
|
|
|
|
//
|
|
|
|
const double DEFAULT_RAM_SIZE = 64000000;
|
2008-02-27 22:26:37 +00:00
|
|
|
|
2008-04-26 23:34:38 +00:00
|
|
|
int preferred_app_message_index=0;
|
|
|
|
|
2012-08-27 17:00:43 +00:00
|
|
|
static inline bool file_present_on_host(const char* name) {
|
|
|
|
for (unsigned i=0; i<g_request->file_infos.size(); i++) {
|
|
|
|
FILE_INFO& fi = g_request->file_infos[i];
|
|
|
|
if (!strstr(name, fi.name)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-08-26 21:24:18 +00:00
|
|
|
// return the number of sticky files present on host, used by job
|
|
|
|
//
|
|
|
|
// Count the sticky files referenced by this job's XML doc
// that are already present on the host (per the request message).
//
int nfiles_on_host(WORKUNIT& wu) {
    MIOFILE mf;
    mf.init_buf_read(wu.xml_doc);
    XML_PARSER xp(&mf);

    int count = 0;
    while (!xp.get_tag()) {
        if (!xp.match_tag("file_info")) continue;
        FILE_INFO fi;
        if (fi.parse(xp)) continue;     // skip malformed entries
        if (!fi.sticky) continue;       // only sticky files matter
        if (file_present_on_host(fi.name)) {
            count++;
        }
    }
    return count;
}
|
|
|
|
|
2012-08-27 17:00:43 +00:00
|
|
|
// we're going to send the client this job,
|
|
|
|
// and the app uses locality scheduling lite.
|
|
|
|
// Add the job's sticky files to the list of files present on host.
|
|
|
|
//
|
|
|
|
// we're going to send the client this job,
// and the app uses locality scheduling lite.
// Add the job's sticky files to the list of files present on host.
//
void add_job_files_to_host(WORKUNIT& wu) {
    MIOFILE mf;
    mf.init_buf_read(wu.xml_doc);
    XML_PARSER xp(&mf);
    while (!xp.get_tag()) {
        if (!xp.match_tag("file_info")) continue;
        FILE_INFO fi;
        if (fi.parse(xp)) continue;             // skip malformed entries
        if (!fi.sticky) continue;               // only sticky files are tracked
        if (file_present_on_host(fi.name)) continue;  // already listed
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] Adding file %s to host file list\n", fi.name
            );
        }
        g_request->file_infos.push_back(fi);
    }
}
|
|
|
|
|
2009-01-10 00:43:33 +00:00
|
|
|
const double MIN_REQ_SECS = 0;
|
|
|
|
const double MAX_REQ_SECS = (28*SECONDS_IN_DAY);
|
2008-02-21 20:10:10 +00:00
|
|
|
|
2010-04-08 18:27:27 +00:00
|
|
|
// get limits on:
|
|
|
|
// # jobs per day
|
|
|
|
// # jobs per RPC
|
|
|
|
// # jobs in progress
|
2009-06-22 21:11:19 +00:00
|
|
|
//
|
|
|
|
// get limits on:
// # jobs per day
// # jobs per RPC
// # jobs in progress
//
// Computes effective_ncpus/effective_ngpus from the request,
// then derives max_jobs_per_rpc and resets the in-progress limits.
//
void WORK_REQ::get_job_limits() {
    int n;

    // effective CPU count: physical count, scaled by the user's
    // max_ncpus_pct pref, capped by project config and MAX_CPUS
    //
    n = g_reply->host.p_ncpus;
    if (g_request->global_prefs.max_ncpus_pct && g_request->global_prefs.max_ncpus_pct < 100) {
        n = (int)((n*g_request->global_prefs.max_ncpus_pct)/100.);
    }
    if (n > config.max_ncpus) n = config.max_ncpus;
    if (n < 1) n = 1;
    if (n > MAX_CPUS) n = MAX_CPUS;
    effective_ncpus = n;

    // effective GPU count: total coproc devices, capped at MAX_GPUS
    //
    n = g_request->coprocs.ndevs();
    if (n > MAX_GPUS) n = MAX_GPUS;
    effective_ngpus = n;

    // per-RPC job limit scales with device count
    // (GPUs weighted by config.gpu_multiplier)
    //
    int mult = effective_ncpus + config.gpu_multiplier * effective_ngpus;
    if (config.non_cpu_intensive) {
        // NCI projects run at most one job per device type
        mult = 1;
        effective_ncpus = 1;
        if (effective_ngpus) effective_ngpus = 1;
    }

    if (config.max_wus_to_send) {
        g_wreq->max_jobs_per_rpc = mult * config.max_wus_to_send;
    } else {
        g_wreq->max_jobs_per_rpc = 999999;  // effectively unlimited
    }

    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] effective ncpus %d ngpus %d\n",
            effective_ncpus, effective_ngpus
        );
    }
    config.max_jobs_in_progress.reset(effective_ncpus, effective_ngpus);
}
|
|
|
|
|
2013-04-09 19:19:00 +00:00
|
|
|
const char* find_user_friendly_name(int appid) {
|
2009-05-06 21:52:50 +00:00
|
|
|
APP* app = ssp->lookup_app(appid);
|
|
|
|
if (app) return app->user_friendly_name;
|
2008-03-07 21:13:01 +00:00
|
|
|
return "deprecated application";
|
|
|
|
}
|
|
|
|
|
2013-04-09 19:19:00 +00:00
|
|
|
// Maintain this host-app-version's daily quota state:
// initialize max_jobs_per_day if unset, and reset the
// per-day job counter when a new day has started.
//
static void update_quota(DB_HOST_APP_VERSION& hav) {
    // first time we've seen this HAV: seed its daily quota
    //
    if (config.daily_result_quota && hav.max_jobs_per_day == 0) {
        hav.max_jobs_per_day = config.daily_result_quota;
        if (config.debug_quota) {
            log_messages.printf(MSG_NORMAL,
                "[quota] [HAV#%d] Initializing max_results_day to %d\n",
                hav.app_version_id,
                config.daily_result_quota
            );
        }
    }

    // day rolled over since the last RPC: clear today's job count
    //
    if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) {
        if (config.debug_quota) {
            log_messages.printf(MSG_NORMAL,
                "[quota] [HOST#%d] [HAV#%d] Resetting n_jobs_today\n",
                g_reply->host.id, hav.app_version_id
            );
        }
        hav.n_jobs_today = 0;
    }
}
|
|
|
|
|
|
|
|
// see how much RAM we can use on this machine
|
|
|
|
//
|
|
|
|
static inline void get_mem_sizes() {
|
|
|
|
g_wreq->ram = g_reply->host.m_nbytes;
|
|
|
|
if (g_wreq->ram <= 0) g_wreq->ram = DEFAULT_RAM_SIZE;
|
|
|
|
g_wreq->usable_ram = g_wreq->ram;
|
|
|
|
double busy_frac = g_request->global_prefs.ram_max_used_busy_frac;
|
|
|
|
double idle_frac = g_request->global_prefs.ram_max_used_idle_frac;
|
|
|
|
double frac = 1;
|
|
|
|
if (busy_frac>0 && idle_frac>0) {
|
|
|
|
frac = std::max(busy_frac, idle_frac);
|
|
|
|
if (frac > 1) frac = 1;
|
|
|
|
g_wreq->usable_ram *= frac;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decide whether or not this app version is 'reliable'
|
|
|
|
// An app version is reliable if the following conditions are true
|
|
|
|
// (for those that are set in the config file)
|
|
|
|
// 1) The host average turnaround is less than a threshold
|
|
|
|
// 2) consecutive_valid is above a threshold
|
|
|
|
// 3) The host results per day is equal to the max value
|
|
|
|
//
|
|
|
|
// Decide whether or not this app version is 'reliable'
// An app version is reliable if the following conditions are true
// (for those that are set in the config file)
// 1) The host average turnaround is less than a threshold
// 2) consecutive_valid is above a threshold
// 3) The host results per day is equal to the max value
//
void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) {
    // condition 1: average turnaround below threshold
    // (only if we have enough samples and the threshold is configured)
    //
    if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) {

        if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n",
                    hav.app_version_id,
                    hav.turnaround.get_avg()/3600,
                    config.reliable_max_avg_turnaround*multiplier/3600
                );
            }
            hav.reliable = false;
            return;
        }
    }
    // condition 2: enough consecutive valid results
    //
    if (hav.consecutive_valid < CONS_VALID_RELIABLE) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] [AV#%d] not reliable; cons valid %d < %d\n",
                hav.app_version_id,
                hav.consecutive_valid, CONS_VALID_RELIABLE
            );
        }
        hav.reliable = false;
        return;
    }
    // condition 3: daily quota must not have been reduced below the max
    // (a reduced quota indicates past errors/timeouts)
    //
    if (config.daily_result_quota) {
        if (hav.max_jobs_per_day < config.daily_result_quota) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [AV#%d] not reliable; max_jobs_per_day %d>%d\n",
                    hav.app_version_id,
                    hav.max_jobs_per_day,
                    config.daily_result_quota
                );
            }
            hav.reliable = false;
            return;
        }
    }
    // all configured conditions passed
    //
    hav.reliable = true;
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] [HOST#%d] app version %d is reliable\n",
            g_reply->host.id, hav.app_version_id
        );
    }
    g_wreq->has_reliable_version = true;
}
|
|
|
|
|
|
|
|
// decide whether do unreplicated jobs with this app version
|
|
|
|
//
|
|
|
|
// decide whether do unreplicated jobs with this app version
//
// Below the consecutive-valid threshold: never trusted.
// Above it: trusted with probability 1 - 1/consecutive_valid,
// so trust increases with the host's validation streak.
//
static void set_trust(DB_HOST_APP_VERSION& hav) {
    hav.trusted = false;
    if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] set_trust: cons valid %d < %d, don't use single replication\n",
                hav.consecutive_valid, CONS_VALID_UNREPLICATED
            );
        }
        return;
    }
    double threshold = 1./hav.consecutive_valid;
    hav.trusted = (drand() > threshold);
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] set_trust: random choice for cons valid %d: %s\n",
            hav.consecutive_valid, hav.trusted?"yes":"no"
        );
    }
}
|
|
|
|
|
|
|
|
static void get_reliability_and_trust() {
|
|
|
|
// Platforms other than Windows, Linux and Intel Macs need a
|
|
|
|
// larger set of computers to be marked reliable
|
|
|
|
//
|
|
|
|
double multiplier = 1.0;
|
|
|
|
if (strstr(g_reply->host.os_name,"Windows")
|
|
|
|
|| strstr(g_reply->host.os_name,"Linux")
|
|
|
|
|| (strstr(g_reply->host.os_name,"Darwin")
|
|
|
|
&& !(strstr(g_reply->host.p_vendor,"Power Macintosh"))
|
|
|
|
)) {
|
|
|
|
multiplier = 1.0;
|
|
|
|
} else {
|
|
|
|
multiplier = 1.8;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
|
|
|
|
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
|
|
|
|
get_reliability_version(hav, multiplier);
|
|
|
|
set_trust(hav);
|
|
|
|
}
|
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2005-10-04 21:44:58 +00:00
|
|
|
// Compute the max additional disk usage we can impose on the host.
|
|
|
|
// Depending on the client version, it can either send us
|
|
|
|
// - d_total and d_free (pre 4 oct 2005)
|
|
|
|
// - the above plus d_boinc_used_total and d_boinc_used_project
|
2004-04-04 01:59:47 +00:00
|
|
|
//
|
2008-12-16 16:29:54 +00:00
|
|
|
double max_allowable_disk() {
|
|
|
|
HOST host = g_request->host;
|
|
|
|
GLOBAL_PREFS prefs = g_request->global_prefs;
|
2004-08-14 00:37:38 +00:00
|
|
|
double x1, x2, x3, x;
|
|
|
|
|
2005-12-17 04:53:48 +00:00
|
|
|
// defaults are from config.xml
|
|
|
|
// if not there these are used:
|
|
|
|
// -default_max_used_gb= 100
|
|
|
|
// -default_max_used_pct = 50
|
|
|
|
// -default_min_free_gb = .001
|
2004-08-14 00:37:38 +00:00
|
|
|
//
|
2006-10-06 18:52:50 +00:00
|
|
|
if (prefs.disk_max_used_gb == 0) {
|
2005-12-17 04:53:48 +00:00
|
|
|
prefs.disk_max_used_gb = config.default_disk_max_used_gb;
|
2006-10-06 18:52:50 +00:00
|
|
|
}
|
|
|
|
if (prefs.disk_max_used_pct == 0) {
|
2005-12-17 04:53:48 +00:00
|
|
|
prefs.disk_max_used_pct = config.default_disk_max_used_pct;
|
2006-10-06 18:52:50 +00:00
|
|
|
}
|
|
|
|
if (prefs.disk_min_free_gb < config.default_disk_min_free_gb) {
|
2005-12-17 04:53:48 +00:00
|
|
|
prefs.disk_min_free_gb = config.default_disk_min_free_gb;
|
2006-10-06 18:52:50 +00:00
|
|
|
}
|
2004-08-14 00:37:38 +00:00
|
|
|
|
|
|
|
// no defaults for total/free disk space (host.d_total, d_free)
|
2005-10-04 21:44:58 +00:00
|
|
|
// if they're zero, client will get no work.
|
2004-08-14 00:37:38 +00:00
|
|
|
//
|
|
|
|
|
2005-10-04 21:44:58 +00:00
|
|
|
if (host.d_boinc_used_total) {
|
|
|
|
// The post 4 oct 2005 case.
|
|
|
|
// Compute the max allowable additional disk usage based on prefs
|
|
|
|
//
|
2008-11-01 23:13:55 +00:00
|
|
|
x1 = prefs.disk_max_used_gb*GIGA - host.d_boinc_used_total;
|
2005-10-04 21:44:58 +00:00
|
|
|
x2 = host.d_total*prefs.disk_max_used_pct/100.
|
|
|
|
- host.d_boinc_used_total;
|
2008-11-01 23:13:55 +00:00
|
|
|
x3 = host.d_free - prefs.disk_min_free_gb*GIGA; // may be negative
|
2009-02-26 00:23:23 +00:00
|
|
|
x = std::min(x1, std::min(x2, x3));
|
2005-01-31 19:34:43 +00:00
|
|
|
|
2005-10-04 21:44:58 +00:00
|
|
|
// see which bound is the most stringent
|
|
|
|
//
|
|
|
|
if (x==x1) {
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->disk_limits.max_used = x;
|
2005-10-04 21:44:58 +00:00
|
|
|
} else if (x==x2) {
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->disk_limits.max_frac = x;
|
2005-10-04 21:44:58 +00:00
|
|
|
} else {
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->disk_limits.min_free = x;
|
2005-10-04 21:44:58 +00:00
|
|
|
}
|
2005-02-06 21:26:21 +00:00
|
|
|
} else {
|
2005-10-04 21:44:58 +00:00
|
|
|
// here we don't know how much space BOINC is using.
|
|
|
|
// so we're kinda screwed.
|
|
|
|
// All we can do is assume that BOINC is using zero space.
|
|
|
|
// We can't honor the max_used for max_used_pct preferences.
|
|
|
|
// We can only honor the min_free pref.
|
|
|
|
//
|
2008-11-01 23:13:55 +00:00
|
|
|
x = host.d_free - prefs.disk_min_free_gb*GIGA; // may be negative
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->disk_limits.min_free = x;
|
2005-10-04 21:44:58 +00:00
|
|
|
x1 = x2 = x3 = 0;
|
2005-02-10 20:31:11 +00:00
|
|
|
}
|
2005-01-31 19:34:43 +00:00
|
|
|
|
2004-08-14 00:37:38 +00:00
|
|
|
if (x < 0) {
|
2008-04-26 23:34:38 +00:00
|
|
|
if (config.debug_send) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-01-27 18:57:27 +00:00
|
|
|
"[send] No disk space available: disk_max_used_gb %.2fGB disk_max_used_pct %.2f disk_min_free_gb %.2fGB\n",
|
|
|
|
prefs.disk_max_used_gb/GIGA,
|
|
|
|
prefs.disk_max_used_pct,
|
|
|
|
prefs.disk_min_free_gb/GIGA
|
2008-04-26 23:34:38 +00:00
|
|
|
);
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-01-27 18:57:27 +00:00
|
|
|
"[send] No disk space available: host.d_total %.2fGB host.d_free %.2fGB host.d_boinc_used_total %.2fGB\n",
|
|
|
|
host.d_total/GIGA,
|
|
|
|
host.d_free/GIGA,
|
|
|
|
host.d_boinc_used_total/GIGA
|
2008-04-26 23:34:38 +00:00
|
|
|
);
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-01-27 18:57:27 +00:00
|
|
|
"[send] No disk space available: x1 %.2fGB x2 %.2fGB x3 %.2fGB x %.2fGB\n",
|
|
|
|
x1/GIGA, x2/GIGA, x3/GIGA, x/GIGA
|
2008-04-26 23:34:38 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
g_wreq->disk.set_insufficient(-x);
|
2008-11-26 20:37:11 +00:00
|
|
|
x = 0;
|
2004-08-14 00:37:38 +00:00
|
|
|
}
|
|
|
|
return x;
|
2004-04-04 01:59:47 +00:00
|
|
|
}
|
|
|
|
|
2008-12-15 21:14:32 +00:00
|
|
|
// Raw duration estimate for this job on this app version:
// FLOP estimate divided by projected FLOPS.
// A missing/invalid FLOP estimate falls back to 1e12.
//
static double estimate_duration_unscaled(WORKUNIT& wu, BEST_APP_VERSION& bav) {
    double flops = wu.rsc_fpops_est;
    if (flops <= 0) flops = 1e12;
    return flops/bav.host_usage.projected_flops;
}
|
2004-04-04 01:59:47 +00:00
|
|
|
|
2009-03-05 23:08:53 +00:00
|
|
|
static inline void get_running_frac() {
|
2008-12-18 18:19:42 +00:00
|
|
|
double rf;
|
2010-01-13 17:28:59 +00:00
|
|
|
if (g_request->core_client_version<=41900) {
|
2008-12-18 18:19:42 +00:00
|
|
|
rf = g_reply->host.on_frac;
|
|
|
|
} else {
|
|
|
|
rf = g_reply->host.active_frac * g_reply->host.on_frac;
|
|
|
|
}
|
|
|
|
|
2010-04-08 23:14:47 +00:00
|
|
|
// clamp running_frac to a reasonable range
|
2008-12-18 18:19:42 +00:00
|
|
|
//
|
|
|
|
if (rf > 1) {
|
2009-01-29 20:42:45 +00:00
|
|
|
if (config.debug_send) {
|
2010-06-29 03:20:19 +00:00
|
|
|
log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 1\n", rf);
|
2009-01-29 20:42:45 +00:00
|
|
|
}
|
2008-12-18 18:19:42 +00:00
|
|
|
rf = 1;
|
|
|
|
} else if (rf < .1) {
|
2009-01-29 20:42:45 +00:00
|
|
|
if (config.debug_send) {
|
2010-06-29 03:20:19 +00:00
|
|
|
log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 0.1\n", rf);
|
2009-01-29 20:42:45 +00:00
|
|
|
}
|
2008-12-18 18:19:42 +00:00
|
|
|
rf = .1;
|
|
|
|
}
|
2009-01-29 20:42:45 +00:00
|
|
|
g_wreq->running_frac = rf;
|
|
|
|
}
|
|
|
|
|
|
|
|
// estimate the amount of real time to complete this WU,
|
|
|
|
// taking into account active_frac etc.
|
|
|
|
// Note: don't factor in resource_share_fraction.
|
2010-04-08 18:27:27 +00:00
|
|
|
// The core client doesn't necessarily round-robin across all projects.
|
2009-01-29 20:42:45 +00:00
|
|
|
//
|
|
|
|
// estimate the amount of real time to complete this WU,
// taking into account active_frac etc.
// Note: don't factor in resource_share_fraction.
// The core client doesn't necessarily round-robin across all projects.
//
double estimate_duration(WORKUNIT& wu, BEST_APP_VERSION& bav) {
    double unscaled = estimate_duration_unscaled(wu, bav);
    double scaled = unscaled / g_wreq->running_frac;
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] est. duration for WU %d: unscaled %.2f scaled %.2f\n",
            wu.id, unscaled, scaled
        );
    }
    return scaled;
}
|
|
|
|
|
2013-04-20 19:25:55 +00:00
|
|
|
// Parse user's project prferences.
|
2011-11-09 07:41:49 +00:00
|
|
|
// TODO: use XML_PARSER
|
|
|
|
//
|
2008-12-18 21:25:51 +00:00
|
|
|
// Parse user's project prferences.
// TODO: use XML_PARSER
//
// Extracts the venue-specific portion of the user's project prefs,
// then scans it for preferred apps and the various "no_*" device flags,
// storing the results in g_wreq.
//
static void get_prefs_info() {
    char buf[8096];
    std::string str;
    unsigned int pos = 0;
    int temp_int=0;
    bool flag;

    // get the prefs XML for this host's venue
    //
    extract_venue(g_reply->user.project_prefs, g_reply->host.venue, buf, sizeof(buf));
    str = buf;

    // scan user's project prefs for elements of the form <app_id>N</app_id>,
    // indicating the apps they want to run.
    //
    g_wreq->preferred_apps.clear();
    while (parse_int(str.substr(pos,str.length()-pos).c_str(), "<app_id>", temp_int)) {
        APP_INFO ai;
        ai.appid = temp_int;
        ai.work_available = false;
        g_wreq->preferred_apps.push_back(ai);

        // advance past this occurrence so the next iteration
        // finds the following <app_id> tag
        //
        pos = str.find("<app_id>", pos) + 1;
    }
    if (parse_bool(buf,"allow_non_preferred_apps", flag)) {
        g_wreq->allow_non_preferred_apps = flag;
    }
    if (parse_bool(buf,"allow_beta_work", flag)) {
        g_wreq->allow_beta_work = flag;
    }
    if (parse_bool(buf,"no_gpus", flag)) {
        // deprecated, but need to handle
        // (proc type 0 is the CPU; all others are GPU types)
        //
        if (flag) {
            for (int i=1; i<NPROC_TYPES; i++) {
                g_wreq->dont_use_proc_type[i] = true;
            }
        }
    }
    if (parse_bool(buf,"no_cpu", flag)) {
        g_wreq->dont_use_proc_type[PROC_TYPE_CPU] = flag;
    }
    if (parse_bool(buf,"no_cuda", flag)) {
        g_wreq->dont_use_proc_type[PROC_TYPE_NVIDIA_GPU] = flag;
    }
    if (parse_bool(buf,"no_ati", flag)) {
        g_wreq->dont_use_proc_type[PROC_TYPE_AMD_GPU] = flag;
    }
    if (parse_bool(buf,"no_intel_gpu", flag)) {
        g_wreq->dont_use_proc_type[PROC_TYPE_INTEL_GPU] = flag;
    }
}
|
|
|
|
|
2010-06-25 22:00:09 +00:00
|
|
|
void update_n_jobs_today() {
|
|
|
|
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
|
|
|
|
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
|
|
|
|
update_quota(hav);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-01-30 21:25:24 +00:00
|
|
|
// Account a just-sent job's estimated duration against the host's
// per-resource estimated-delay bookkeeping in the request.
// CPU jobs add to cpu_estimated_delay, weighted by the fraction of
// CPUs they use; GPU jobs add to the matching coproc's estimated_delay,
// weighted by the fraction of that GPU type they use.
//
static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) {
    int pt = bav.host_usage.proc_type;
    if (pt == PROC_TYPE_CPU) {
        g_request->cpu_estimated_delay += dt*bav.host_usage.avg_ncpus/g_request->host.p_ncpus;
    } else {
        // NOTE(review): cp is dereferenced without a NULL check;
        // presumably type_to_coproc() always succeeds for a proc type
        // taken from a selected app version — confirm.
        COPROC* cp = g_request->coprocs.type_to_coproc(pt);
        cp->estimated_delay += dt*bav.host_usage.gpu_usage/cp->count;
    }
}
|
|
|
|
|
2004-04-04 01:59:47 +00:00
|
|
|
// insert "text" right after "after" in the given buffer
|
|
|
|
//
|
2010-04-08 18:27:27 +00:00
|
|
|
static int insert_after(char* buffer, const char* after, const char* text) {
|
2004-04-04 01:59:47 +00:00
|
|
|
char* p;
|
2008-03-31 16:19:45 +00:00
|
|
|
char temp[BLOB_SIZE];
|
2004-04-04 01:59:47 +00:00
|
|
|
|
2010-07-22 22:57:15 +00:00
|
|
|
if (strlen(buffer) + strlen(text) >= BLOB_SIZE-1) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-07-08 18:02:07 +00:00
|
|
|
"insert_after: overflow: %d %d\n",
|
|
|
|
(int)strlen(buffer),
|
|
|
|
(int)strlen(text)
|
2005-09-26 23:28:48 +00:00
|
|
|
);
|
2004-04-04 01:59:47 +00:00
|
|
|
return ERR_BUFFER_OVERFLOW;
|
|
|
|
}
|
|
|
|
p = strstr(buffer, after);
|
|
|
|
if (!p) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2005-09-26 23:28:48 +00:00
|
|
|
"insert_after: %s not found in %s\n", after, buffer
|
|
|
|
);
|
2010-11-08 17:51:57 +00:00
|
|
|
return ERR_XML_PARSE;
|
2004-04-04 01:59:47 +00:00
|
|
|
}
|
|
|
|
p += strlen(after);
|
|
|
|
strcpy(temp, p);
|
|
|
|
strcpy(p, text);
|
|
|
|
strcat(p, temp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-02-06 21:26:21 +00:00
|
|
|
// add elements to WU's xml_doc,
|
|
|
|
// in preparation for sending it to a client
|
2004-04-04 01:59:47 +00:00
|
|
|
//
|
2010-04-08 18:27:27 +00:00
|
|
|
// add elements to WU's xml_doc,
// in preparation for sending it to a client
//
// Inserts resource bounds, the WU name, and the app name
// right after the opening <workunit> tag.
// Returns the result of insert_after() (0 on success).
//
static int insert_wu_tags(WORKUNIT& wu, APP& app) {
    char buf[BLOB_SIZE];

    sprintf(buf,
        "    <rsc_fpops_est>%f</rsc_fpops_est>\n"
        "    <rsc_fpops_bound>%f</rsc_fpops_bound>\n"
        "    <rsc_memory_bound>%f</rsc_memory_bound>\n"
        "    <rsc_disk_bound>%f</rsc_disk_bound>\n"
        "    <name>%s</name>\n"
        "    <app_name>%s</app_name>\n",
        wu.rsc_fpops_est,
        wu.rsc_fpops_bound,
        wu.rsc_memory_bound,
        wu.rsc_disk_bound,
        wu.name,
        app.name
    );
    return insert_after(wu.xml_doc, "<workunit>\n", buf);
}
|
|
|
|
|
2010-04-08 18:27:27 +00:00
|
|
|
// Add the given workunit, app, and app version to a reply.
|
2004-04-04 01:59:47 +00:00
|
|
|
//
|
2010-04-08 18:27:27 +00:00
|
|
|
// Add the given workunit, app, and app version to a reply.
//
// The app/app_version are inserted only if not already present;
// the WU copy gets its XML augmented (and optionally rewritten for
// timezone-based download URLs) before insertion.
// Returns 0 on success, or the error from insert_wu_tags().
//
static int add_wu_to_reply(
    WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp
) {
    int retval;
    WORKUNIT wu2, wu3;

    APP_VERSION* avp = bavp->avp;

    // add the app, app_version, and workunit to the reply,
    // but only if they aren't already there
    //
    // avp is NULL for anonymous-platform requests
    // (the client supplies its own app versions)
    //
    if (avp) {
        // work on a copy so timezone rewriting doesn't
        // modify the shared-memory app version
        //
        APP_VERSION av2=*avp, *avp2=&av2;

        if (strlen(config.replace_download_url_by_timezone)) {
            process_av_timezone(avp, av2);
        }

        g_reply->insert_app_unique(*app);
        av2.bavp = bavp;
        g_reply->insert_app_version_unique(*avp2);
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] Sending app_version %s %d %d %s; projected %.2f GFLOPS\n",
                app->name,
                avp2->platformid, avp2->version_num, avp2->plan_class,
                bavp->host_usage.projected_flops/1e9
            );
        }
    }

    // modify the WU's xml_doc; add <name>, <rsc_*> etc.
    //
    wu2 = wu;       // make copy since we're going to modify its XML field

    // adjust FPOPS figures for anonymous platform
    //
    if (bavp->cavp) {
        wu2.rsc_fpops_est *= bavp->cavp->rsc_fpops_scale;
        wu2.rsc_fpops_bound *= bavp->cavp->rsc_fpops_scale;
    }
    retval = insert_wu_tags(wu2, *app);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "insert_wu_tags failed: %s\n", boincerror(retval)
        );
        return retval;
    }
    wu3 = wu2;
    if (strlen(config.replace_download_url_by_timezone)) {
        process_wu_timezone(wu2, wu3);
    }

    g_reply->insert_workunit_unique(wu3);

    // switch to tighter policy for estimating delay
    //
    return 0;
}
|
|
|
|
|
2010-04-08 18:27:27 +00:00
|
|
|
// add <name> tags to result's xml_doc_in
|
|
|
|
//
|
|
|
|
// add <name> tags to result's xml_doc_in
//
// Inserts the result name and the parent WU name right after
// the opening <result> tag.  Returns 0 on success.
//
static int insert_name_tags(RESULT& result, WORKUNIT const& wu) {
    char buf[256];

    sprintf(buf, "<name>%s</name>\n", result.name);
    int retval = insert_after(result.xml_doc_in, "<result>\n", buf);
    if (retval) return retval;

    sprintf(buf, "<wu_name>%s</wu_name>\n", wu.name);
    return insert_after(result.xml_doc_in, "<result>\n", buf);
}
|
|
|
|
|
2010-04-08 18:27:27 +00:00
|
|
|
// Insert the result's report deadline into its xml_doc_in,
// right after the opening <result> tag.  Returns 0 on success.
//
static int insert_deadline_tag(RESULT& result) {
    char buf[256];
    sprintf(buf, "<report_deadline>%d</report_deadline>\n", result.report_deadline);
    return insert_after(result.xml_doc_in, "<result>\n", buf);
}
|
|
|
|
|
2011-10-26 07:15:22 +00:00
|
|
|
// update workunit fields when send an instance of it:
|
2011-06-06 03:40:42 +00:00
|
|
|
// - transition time
|
|
|
|
// - app_version_id, if app uses homogeneous app version
|
2011-10-26 07:15:22 +00:00
|
|
|
// - hr_class, if we're using HR
|
|
|
|
//
|
2011-11-03 06:46:05 +00:00
|
|
|
// In the latter two cases, the update is conditional on the field
|
|
|
|
// fields either being zero or the desired value.
|
|
|
|
// Some other scheduler instance might have updated it since we read the WU,
|
|
|
|
// and the transitioner might have set it to zero.
|
2011-06-06 03:40:42 +00:00
|
|
|
//
|
|
|
|
int update_wu_on_send(WORKUNIT wu, time_t x, APP& app, BEST_APP_VERSION& bav) {
    DB_WORKUNIT dbwu;
    char buf[256], buf2[256], where_clause[256];
    int retval;

    dbwu.id = wu.id;

    // move the WU's transition time forward to x if it's currently later.
    // SQL note: can't use min() here
    //
    sprintf(buf,
        "transition_time=if(transition_time<%d, transition_time, %d)",
        (int)x, (int)x
    );
    strcpy(where_clause, "");
    if (app.homogeneous_app_version) {
        // pin the WU to this app version; the WHERE clause makes the
        // update conditional so we lose the race (below) if another
        // scheduler already pinned it to a different version
        //
        sprintf(buf2, ", app_version_id=%d", bav.avp->id);
        strcat(buf, buf2);
        sprintf(where_clause,
            "(app_version_id=0 or app_version_id=%d)", bav.avp->id
        );
    }
    if (app_hr_type(app)) {
        // homogeneous redundancy: pin the WU to this host's HR class,
        // again conditionally (field may have been set by another
        // scheduler, or cleared by the transitioner)
        //
        int host_hr_class = hr_class(g_request->host, app_hr_type(app));
        sprintf(buf2, ", hr_class=%d", host_hr_class);
        strcat(buf, buf2);
        if (strlen(where_clause)) {
            // combine with the app-version condition above
            strcat(where_clause, " and ");
        }
        sprintf(buf2, "(hr_class=0 or hr_class=%d)", host_hr_class);
        strcat(where_clause, buf2);
    }
    // NULL where-clause => unconditional update (transition time only)
    retval = dbwu.update_field(buf, strlen(where_clause)?where_clause:NULL);
    if (retval) return retval;
    if (boinc_db.affected_rows() != 1) {
        // the conditional update matched no row:
        // some other scheduler got there first with a different
        // app version or HR class; caller must not send this WU
        //
        return ERR_DB_NOT_FOUND;
    }
    return 0;
}
|
|
|
|
|
|
|
|
// return true iff a result for same WU is already being sent
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
bool wu_already_in_reply(WORKUNIT& wu) {
|
2004-04-04 01:59:47 +00:00
|
|
|
unsigned int i;
|
2008-12-16 16:29:54 +00:00
|
|
|
for (i=0; i<g_reply->results.size(); i++) {
|
|
|
|
if (wu.id == g_reply->results[i].workunitid) {
|
2004-04-04 01:59:47 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// acquire the global scheduler semaphore
// (serializes access to shared memory across scheduler instances)
//
void lock_sema() {
    lock_semaphore(sema_key);
}
|
|
|
|
|
|
|
|
// release the global scheduler semaphore acquired by lock_sema()
//
void unlock_sema() {
    unlock_semaphore(sema_key);
}
|
|
|
|
|
2012-06-22 07:35:54 +00:00
|
|
|
static inline bool have_apps(int pt) {
|
2011-10-03 23:43:53 +00:00
|
|
|
if (g_wreq->anonymous_platform) {
|
2012-06-25 23:09:45 +00:00
|
|
|
return g_wreq->client_has_apps_for_proc_type[pt];
|
2011-10-03 23:43:53 +00:00
|
|
|
} else {
|
2012-06-22 07:35:54 +00:00
|
|
|
return ssp->have_apps_for_proc_type[pt];
|
2011-10-03 23:43:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-02-06 21:26:21 +00:00
|
|
|
// return true if additional work is needed,
|
|
|
|
// and there's disk space left,
|
|
|
|
// and we haven't exceeded result per RPC limit,
|
|
|
|
// and we haven't exceeded results per day limit
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
// Return true if we should keep scanning for jobs to send:
// the client still wants work, in-progress limits aren't exceeded,
// and we haven't hit the per-RPC job limit.
// Side effect: sets the various g_wreq "limit exceeded" flags
// used later when composing user messages.
//
bool work_needed(bool locality_sched) {
    if (locality_sched) {
        // if we've failed to send a result because of a transient condition,
        // return false to preserve invariant
        //
        if (g_wreq->disk.insufficient || g_wreq->speed.insufficient || g_wreq->mem.insufficient || g_wreq->no_allowed_apps_available) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] stopping work search - locality condition\n"
                );
            }
            return false;
        }
    }

    // see if we've reached limits on in-progress jobs
    //
    bool some_type_allowed = false;

    // check GPU limit
    //
    if (config.max_jobs_in_progress.exceeded(NULL, true)) {
        if (config.debug_quota) {
            log_messages.printf(MSG_NORMAL,
                "[quota] reached limit on GPU jobs in progress\n"
            );
            config.max_jobs_in_progress.print_log();
        }
        // stop asking for GPU work in this request
        g_wreq->clear_gpu_req();
        if (g_wreq->effective_ngpus) {
            g_wreq->max_jobs_on_host_gpu_exceeded = true;
        }
    } else {
        some_type_allowed = true;
    }

    // check CPU limit
    //
    if (config.max_jobs_in_progress.exceeded(NULL, false)) {
        if (config.debug_quota) {
            log_messages.printf(MSG_NORMAL,
                "[quota] reached limit on CPU jobs in progress\n"
            );
            config.max_jobs_in_progress.print_log();
        }
        // stop asking for CPU work in this request
        g_wreq->clear_cpu_req();
        g_wreq->max_jobs_on_host_cpu_exceeded = true;
    } else {
        some_type_allowed = true;
    }

    // neither CPU nor GPU jobs can be sent: give up now
    //
    if (!some_type_allowed) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] in-progress job limit exceeded\n"
            );
        }
        g_wreq->max_jobs_on_host_exceeded = true;
        return false;
    }

    // see if we've reached max jobs per RPC
    //
    if (g_wreq->njobs_sent >= g_wreq->max_jobs_per_rpc) {
        if (config.debug_quota) {
            log_messages.printf(MSG_NORMAL,
                "[quota] stopping work search - njobs %d >= max_jobs_per_rpc %d\n",
                g_wreq->njobs_sent, g_wreq->max_jobs_per_rpc
            );
        }
        return false;
    }

#if 0
    if (config.debug_send) {
        char buf[256], buf2[256];
        strcpy(buf, "");
        for (int i=1; i<NPROC_TYPES; i++) {
            sprintf(buf2, " %s (%.2f, %.2f)",
                proc_type_name(i),
                g_wreq->req_secs[i],
                g_wreq->req_instances[i]
            );
            strcat(buf, buf2);
        }
        log_messages.printf(MSG_NORMAL,
            "[send] work_needed: spec req %d sec to fill %.2f; %s\n",
            g_wreq->rsc_spec_request,
            g_wreq->seconds_to_fill,
            buf
        );
    }
#endif
    if (g_wreq->rsc_spec_request) {
        // resource-specific request (newer clients):
        // keep going if any requested processor type still needs work
        // and we have apps for it
        //
        for (int i=0; i<NPROC_TYPES; i++) {
            if (g_wreq->need_proc_type(i) && have_apps(i)) {
                return true;
            }
        }
    } else {
        // old-style request: a single seconds-of-work figure
        //
        if (g_wreq->seconds_to_fill > 0) {
            return true;
        }
    }
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL, "[send] don't need more work\n");
    }
    return false;
}
|
|
|
|
|
2010-03-29 22:28:20 +00:00
|
|
|
// return the app version ID, or -2/-3/-4 if anonymous platform
|
|
|
|
//
|
|
|
|
inline static int get_app_version_id(BEST_APP_VERSION* bavp) {
|
|
|
|
if (bavp->avp) {
|
|
|
|
return bavp->avp->id;
|
|
|
|
} else {
|
2010-04-08 23:14:47 +00:00
|
|
|
return bavp->cavp->host_usage.resource_type();
|
2010-03-29 22:28:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-03-03 16:38:54 +00:00
|
|
|
// Prepare a result for sending to the requesting host:
// fill in host/user/deadline fields, update the WU and result DB records,
// add the WU and result to the scheduler reply,
// and do per-request accounting (disk, requested seconds, quotas).
// Return 0 on success; nonzero (e.g. ERR_DB_NOT_FOUND if another
// scheduler instance claimed the job first) means the result was not added.
//
int add_result_to_reply(
    SCHED_DB_RESULT& result,
    WORKUNIT& wu,
    BEST_APP_VERSION* bavp,
    bool locality_scheduling
) {
    int retval;
    bool resent_result = false;
    APP* app = ssp->lookup_app(wu.appid);

    // fill in the result fields the client needs
    //
    result.hostid = g_reply->host.id;
    result.userid = g_reply->user.id;
    result.sent_time = time(0);
    result.report_deadline = result.sent_time + wu.delay_bound;
    result.flops_estimate = bavp->host_usage.peak_flops;
    result.app_version_id = get_app_version_id(bavp);

    // update WU DB record.
    // This can fail in normal operation
    // (other scheduler already updated hr_class or app_version_id)
    // so do it before updating the result.
    //
    retval = update_wu_on_send(
        wu, result.report_deadline + config.report_grace_period, *app, *bavp
    );
    if (retval == ERR_DB_NOT_FOUND) {
        log_messages.printf(MSG_NORMAL,
            "add_result_to_reply: WU already sent to other HR class or app version\n"
        );
        return retval;
    } else if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: WU update failed: %d\n",
            retval
        );
        return retval;
    }

    // update result DB record.
    // This can also fail in normal operation.
    // In this case, in principle we should undo
    // the changes we just made to the WU (or use a transaction)
    // but I don't think it actually matters.
    //
    int old_server_state = result.server_state;

    if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
        // We're sending this result for the first time
        //
        result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
    } else {
        // Result was already sent to this host but was lost,
        // so we're resending it.
        //
        resent_result = true;

        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] [RESULT#%u] [HOST#%d] (resend lost work)\n",
                result.id, g_reply->host.id
            );
        }
    }
    // conditional DB update; fails with ERR_DB_NOT_FOUND if another
    // scheduler instance already sent this result elsewhere
    //
    retval = result.mark_as_sent(old_server_state, config.report_grace_period);
    if (retval == ERR_DB_NOT_FOUND) {
        log_messages.printf(MSG_CRITICAL,
            "[RESULT#%u] [HOST#%d]: CAN'T SEND, already sent to another host\n",
            result.id, g_reply->host.id
        );
    } else if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't update result: %s\n", boincerror(retval)
        );
    }
    if (retval) return retval;

    // done with DB updates.
    //

    retval = add_wu_to_reply(wu, *g_reply, app, bavp);
    if (retval) return retval;

    // Adjust available disk space.
    // In the locality scheduling locality case,
    // reduce the available space by less than the workunit rsc_disk_bound,
    // if the host already has the file or the file was not already sent.
    //
    if (!locality_scheduling || decrement_disk_space_locality(wu)) {
        g_wreq->disk_available -= wu.rsc_disk_bound;
    }

    double est_dur = estimate_duration(wu, *bavp);
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] [HOST#%d] sending [RESULT#%u %s] (est. dur. %.2f seconds)\n",
            g_reply->host.id, result.id, result.name, est_dur
        );
    }

    // The following overwrites the result's xml_doc field.
    // But that's OK cuz we're done with DB updates
    //
    retval = insert_name_tags(result, wu);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't insert name tags: %d\n",
            retval
        );
        return retval;
    }
    retval = insert_deadline_tag(result);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't insert deadline tag: %s\n", boincerror(retval)
        );
        return retval;
    }
    result.bav = *bavp;
    g_reply->insert_result(result);

    // debit this job's estimated duration against what the client asked for
    //
    if (g_wreq->rsc_spec_request) {
        int pt = bavp->host_usage.proc_type;
        if (pt == PROC_TYPE_CPU) {
            g_wreq->req_secs[PROC_TYPE_CPU] -= est_dur;
            g_wreq->req_instances[PROC_TYPE_CPU] -= bavp->host_usage.avg_ncpus;
        } else {
            g_wreq->req_secs[pt] -= est_dur;
            g_wreq->req_instances[pt] -= bavp->host_usage.gpu_usage;
        }
    } else {
        g_wreq->seconds_to_fill -= est_dur;
    }
    update_estimated_delay(*bavp, est_dur);
    g_wreq->njobs_sent++;
    config.max_jobs_in_progress.register_job(app, bavp->host_usage.uses_gpu());

    // count against the (host, app version) daily quota,
    // but only for first-time sends (not resends of lost results)
    //
    if (!resent_result) {
        DB_HOST_APP_VERSION* havp = bavp->host_app_version();
        if (havp) {
            havp->n_jobs_today++;
        }
    }

    // add this result to workload for simulation
    //
    if (config.workload_sim && g_request->have_other_results_list) {
        IP_RESULT ipr ("", time(0)+wu.delay_bound, est_dur);
        g_request->ip_results.push_back(ipr);
    }

    // mark job as done if debugging flag is set;
    // this is used by sched_driver.C (performance testing)
    //
    if (mark_jobs_done) {
        DB_WORKUNIT dbwu;
        char buf[256];
        sprintf(buf,
            "server_state=%d outcome=%d",
            RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_SUCCESS
        );
        result.update_field(buf);

        dbwu.id = wu.id;
        // NOTE(review): %ld assumes time_t fits in long - TODO confirm on
        // all supported platforms
        sprintf(buf, "transition_time=%ld", time(0));
        dbwu.update_field(buf);

    }

    // If we're sending an unreplicated job to an untrusted host,
    // mark it as replicated
    //
    if (wu.target_nresults == 1 && app->target_nresults > 1) {
        if (bavp->trusted) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [WU#%u] using trusted app version, not replicating\n", wu.id
                );
            }
        } else {
            DB_WORKUNIT dbwu;
            char buf[256];
            sprintf(buf,
                "target_nresults=%d, min_quorum=%d, transition_time=%ld",
                app->target_nresults, app->target_nresults, time(0)
            );
            dbwu.id = wu.id;
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [WU#%u] sending to untrusted host, replicating\n", wu.id
                );
            }
            retval = dbwu.update_field(buf);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "WU update failed: %s", boincerror(retval)
                );
            }
        }
    }

    // if the app uses locality scheduling lite,
    // add the job's files to the list of those on host
    //
    if (app->locality_scheduling == LOCALITY_SCHED_LITE) {
        add_job_files_to_host(wu);
    }

    return 0;
}
|
|
|
|
|
2010-06-16 22:07:19 +00:00
|
|
|
// Send high-priority messages about things the user can change easily
|
|
|
|
// (namely the driver version)
|
|
|
|
// and low-priority messages about things that can't easily be changed,
|
|
|
|
// but which may be interfering with getting tasks or latest apps
|
2008-04-26 23:34:38 +00:00
|
|
|
//
|
2011-10-03 23:43:53 +00:00
|
|
|
// Send messages about one GPU type's suitability:
// a low-priority message if the GPU has too little RAM,
// otherwise (if a driver version was reported) a "notice"-priority
// message if the driver is below the required minimum,
// or a low-priority message if below the recommended version.
// ram is in bytes; version==0 means driver version unknown (skip checks).
//
static void send_gpu_property_messages(
    GPU_REQUIREMENTS& req, double ram, int version, const char* rsc_name
) {
    char buf[256];
    if (ram < req.min_ram) {
        sprintf(buf,
            "A minimum of %d MB (preferably %d MB) of video RAM is needed to process tasks using your computer's %s",
            (int) (req.min_ram/MEGA),
            (int) (req.opt_ram/MEGA),
            rsc_name
        );
        g_reply->insert_message(buf, "low");
    } else {
        if (version) {
            if (version < req.min_driver_version) {
                // driver too old to run any tasks: high-visibility notice
                sprintf(buf,
                    "%s: %s",
                    rsc_name,
                    _("Upgrade to the latest driver to process tasks using your computer's GPU")
                );
                g_reply->insert_message(buf, "notice");
            } else if (version < req.opt_driver_version) {
                // driver works but excludes some apps: low priority
                sprintf(buf,
                    "%s: %s",
                    rsc_name,
                    _("Upgrade to the latest driver to use all of this project's GPU applications")
                );
                g_reply->insert_message(buf, "low");
            }
        }
    }
}
|
|
|
|
|
2011-10-03 23:43:53 +00:00
|
|
|
// send messages complaining about lack of GPU or the properties of GPUs
|
2010-06-16 22:07:19 +00:00
|
|
|
//
|
2011-10-03 23:43:53 +00:00
|
|
|
// send messages complaining about lack of GPU or the properties of GPUs
//
void send_gpu_messages() {
    // Mac client with GPU but too-old client
    //
    if (g_request->coprocs.nvidia.count
        && ssp->have_apps_for_proc_type[PROC_TYPE_NVIDIA_GPU]
        && strstr(g_request->host.os_name, "Darwin")
        && g_request->core_client_version < 61028
    ) {
        g_reply->insert_message(
            _("A newer version of BOINC is needed to use your NVIDIA GPU; please upgrade to the current version"),
            "notice"
        );
    }

    // GPU-only project, client lacks GPU
    //
    bool usable_gpu = false;
    // proc type 0 is CPU; scan the GPU types only
    for (int i=1; i<NPROC_TYPES; i++) {
        COPROC* cp = g_request->coprocs.type_to_coproc(i);
        if (ssp->have_apps_for_proc_type[i] && cp->count) {
            usable_gpu = true;
            break;
        }
    }
    if (!ssp->have_apps_for_proc_type[PROC_TYPE_CPU] && !usable_gpu) {
        // build an "X or Y" list of the GPU types the project supports
        //
        char buf[256];
        strcpy(buf, "");
        for (int i=1; i<NPROC_TYPES; i++) {
            if (ssp->have_apps_for_proc_type[i]) {
                if (strlen(buf)) {
                    strcat(buf, " or ");
                }
                strcat(buf, proc_type_name(i));
            }
        }
        char msg[1024];
        sprintf(msg,
            _("An %s GPU is required to run tasks for this project"),
            buf
        );
        g_reply->insert_message(msg, "notice");
    }

    // per-GPU-type RAM/driver-version advice
    //
    if (g_request->coprocs.nvidia.count && ssp->have_apps_for_proc_type[PROC_TYPE_NVIDIA_GPU]) {
        send_gpu_property_messages(gpu_requirements[PROC_TYPE_NVIDIA_GPU],
            g_request->coprocs.nvidia.prop.totalGlobalMem,
            g_request->coprocs.nvidia.display_driver_version,
            proc_type_name(PROC_TYPE_NVIDIA_GPU)
        );
    }
    if (g_request->coprocs.ati.count && ssp->have_apps_for_proc_type[PROC_TYPE_AMD_GPU]) {
        send_gpu_property_messages(gpu_requirements[PROC_TYPE_AMD_GPU],
            g_request->coprocs.ati.attribs.localRAM*MEGA,
            g_request->coprocs.ati.version_num,
            proc_type_name(PROC_TYPE_AMD_GPU)
        );
    }
    if (g_request->coprocs.intel_gpu.count && ssp->have_apps_for_proc_type[PROC_TYPE_INTEL_GPU]) {
        send_gpu_property_messages(gpu_requirements[PROC_TYPE_INTEL_GPU],
            g_request->coprocs.intel_gpu.opencl_prop.global_mem_size,
            // no driver version reported for Intel GPUs: skip version checks
            0,
            proc_type_name(PROC_TYPE_INTEL_GPU)
        );
    }

}
|
|
|
|
|
|
|
|
// send messages to user about why jobs were or weren't sent,
|
|
|
|
// recommendations for GPU driver upgrades, etc.
|
|
|
|
//
|
|
|
|
static void send_user_messages() {
|
|
|
|
char buf[512];
|
|
|
|
unsigned int i;
|
|
|
|
int j;
|
2010-06-16 22:07:19 +00:00
|
|
|
|
2011-10-03 23:43:53 +00:00
|
|
|
// GPU messages aren't relevant if anonymous platform
|
|
|
|
//
|
|
|
|
if (!g_wreq->anonymous_platform) {
|
|
|
|
send_gpu_messages();
|
|
|
|
}
|
2010-06-16 22:07:19 +00:00
|
|
|
|
2008-08-14 22:06:51 +00:00
|
|
|
// If work was sent from apps the user did not select, explain.
|
|
|
|
// NOTE: this will have to be done differently with matchmaker scheduling
|
2008-03-07 21:13:01 +00:00
|
|
|
//
|
2010-01-05 11:03:10 +00:00
|
|
|
if (!config.locality_scheduling && !config.locality_scheduler_fraction && !config.matchmaker) {
|
2009-06-01 22:15:14 +00:00
|
|
|
if (g_wreq->njobs_sent && !g_wreq->user_apps_only) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2011-03-04 19:40:59 +00:00
|
|
|
"No tasks are available for the applications you have selected",
|
2010-06-08 18:56:53 +00:00
|
|
|
"low"
|
2008-08-14 22:06:51 +00:00
|
|
|
);
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2008-08-14 22:06:51 +00:00
|
|
|
// Inform the user about applications with no work
|
|
|
|
//
|
2008-12-18 21:25:51 +00:00
|
|
|
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
|
|
|
if (!g_wreq->preferred_apps[i].work_available) {
|
|
|
|
APP* app = ssp->lookup_app(g_wreq->preferred_apps[i].appid);
|
2008-08-14 22:06:51 +00:00
|
|
|
// don't write message if the app is deprecated
|
|
|
|
//
|
|
|
|
if (app) {
|
|
|
|
char explanation[256];
|
|
|
|
sprintf(explanation,
|
2011-03-04 19:40:59 +00:00
|
|
|
"No tasks are available for %s",
|
2008-12-18 21:25:51 +00:00
|
|
|
find_user_friendly_name(g_wreq->preferred_apps[i].appid)
|
2008-08-14 22:06:51 +00:00
|
|
|
);
|
2010-06-08 18:56:53 +00:00
|
|
|
g_reply->insert_message( explanation, "low");
|
2008-08-14 22:06:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2008-08-14 22:06:51 +00:00
|
|
|
// Tell the user about applications they didn't qualify for
|
|
|
|
//
|
|
|
|
for (j=0; j<preferred_app_message_index; j++){
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->insert_message(g_wreq->no_work_messages.at(j));
|
2008-08-14 22:06:51 +00:00
|
|
|
}
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2011-03-04 19:40:59 +00:00
|
|
|
"Your preferences allow tasks from applications other than those selected",
|
2010-01-18 04:52:58 +00:00
|
|
|
"low"
|
2009-02-26 03:03:35 +00:00
|
|
|
);
|
|
|
|
g_reply->insert_message(
|
2011-03-04 19:40:59 +00:00
|
|
|
"Sending tasks from other applications", "low"
|
2008-08-14 22:06:51 +00:00
|
|
|
);
|
2008-03-07 21:13:01 +00:00
|
|
|
}
|
2008-08-14 22:06:51 +00:00
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2007-08-16 17:33:41 +00:00
|
|
|
// if client asked for work and we're not sending any, explain why
|
|
|
|
//
|
2012-11-02 22:38:54 +00:00
|
|
|
if (g_wreq->njobs_sent == 0 && g_request->work_req_seconds) {
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->set_delay(DELAY_NO_WORK_TEMP);
|
2011-03-04 19:40:59 +00:00
|
|
|
g_reply->insert_message("No tasks sent", "low");
|
2009-01-20 21:31:13 +00:00
|
|
|
|
|
|
|
// Tell the user about applications with no work
|
|
|
|
//
|
2008-12-18 21:25:51 +00:00
|
|
|
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
2010-06-08 18:56:53 +00:00
|
|
|
if (!g_wreq->preferred_apps[i].work_available) {
|
|
|
|
APP* app = ssp->lookup_app(g_wreq->preferred_apps[i].appid);
|
|
|
|
// don't write message if the app is deprecated
|
|
|
|
if (app != NULL) {
|
2011-03-04 19:40:59 +00:00
|
|
|
sprintf(buf, "No tasks are available for %s",
|
2010-06-08 18:56:53 +00:00
|
|
|
find_user_friendly_name(
|
|
|
|
g_wreq->preferred_apps[i].appid
|
|
|
|
)
|
2008-10-27 21:23:07 +00:00
|
|
|
);
|
2010-06-08 18:56:53 +00:00
|
|
|
g_reply->insert_message(buf, "low");
|
|
|
|
}
|
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
}
|
2009-01-20 21:31:13 +00:00
|
|
|
|
2008-12-16 16:29:54 +00:00
|
|
|
for (i=0; i<g_wreq->no_work_messages.size(); i++){
|
2009-05-06 21:52:50 +00:00
|
|
|
g_reply->insert_message(g_wreq->no_work_messages.at(i));
|
2008-03-07 21:13:01 +00:00
|
|
|
}
|
2011-08-27 02:54:39 +00:00
|
|
|
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->no_allowed_apps_available) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2011-03-04 19:40:59 +00:00
|
|
|
_("No tasks are available for the applications you have selected."),
|
|
|
|
"low"
|
2006-05-02 22:17:09 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->speed.insufficient) {
|
2010-01-13 17:28:59 +00:00
|
|
|
if (g_request->core_client_version>41900) {
|
2010-06-08 18:56:53 +00:00
|
|
|
sprintf(buf,
|
2010-06-16 22:07:19 +00:00
|
|
|
"Tasks won't finish in time: BOINC runs %.1f%% of the time; computation is enabled %.1f%% of that",
|
2010-05-03 17:20:44 +00:00
|
|
|
100*g_reply->host.on_frac, 100*g_reply->host.active_frac
|
2005-03-19 18:24:24 +00:00
|
|
|
);
|
2007-08-16 17:33:41 +00:00
|
|
|
} else {
|
2010-06-08 18:56:53 +00:00
|
|
|
sprintf(buf,
|
2010-06-16 22:07:19 +00:00
|
|
|
"Tasks won't finish in time: Computer available %.1f%% of the time",
|
2010-05-03 17:20:44 +00:00
|
|
|
100*g_reply->host.on_frac
|
2005-03-19 18:24:24 +00:00
|
|
|
);
|
|
|
|
}
|
2010-06-08 18:56:53 +00:00
|
|
|
g_reply->insert_message(buf, "low");
|
2004-04-04 01:59:47 +00:00
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->hr_reject_temp) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2010-06-08 18:56:53 +00:00
|
|
|
"Tasks are committed to other platforms",
|
|
|
|
"low"
|
2004-05-18 18:33:01 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->hr_reject_perm) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2010-07-13 02:49:35 +00:00
|
|
|
_("Your computer type is not supported by this project"),
|
2010-06-29 03:23:13 +00:00
|
|
|
"notice"
|
2005-11-30 22:52:23 +00:00
|
|
|
);
|
|
|
|
}
|
2009-01-20 21:31:13 +00:00
|
|
|
if (g_wreq->outdated_client) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2010-07-13 02:49:35 +00:00
|
|
|
_("Newer BOINC version required; please install current version"),
|
2010-06-29 03:23:13 +00:00
|
|
|
"notice"
|
2004-04-04 01:59:47 +00:00
|
|
|
);
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->set_delay(DELAY_NO_WORK_PERM);
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2011-03-04 19:40:59 +00:00
|
|
|
"Not sending tasks because newer client version required\n"
|
2004-04-04 01:59:47 +00:00
|
|
|
);
|
|
|
|
}
|
2012-06-22 07:35:54 +00:00
|
|
|
for (i=0; i<NPROC_TYPES; i++) {
|
|
|
|
if (g_wreq->dont_use_proc_type[i] && ssp->have_apps_for_proc_type[i]) {
|
|
|
|
sprintf(buf,
|
|
|
|
_("Tasks for %s are available, but your preferences are set to not accept them"),
|
|
|
|
proc_type_name(i)
|
|
|
|
);
|
|
|
|
g_reply->insert_message(buf, "low");
|
|
|
|
}
|
2009-03-18 21:14:44 +00:00
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = quota_exceeded_version();
|
|
|
|
if (havp) {
|
2011-08-30 21:34:27 +00:00
|
|
|
sprintf(buf, "This computer has finished a daily quota of %d tasks",
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
havp->max_jobs_per_day
|
2009-01-20 00:54:16 +00:00
|
|
|
);
|
2010-06-08 18:56:53 +00:00
|
|
|
g_reply->insert_message(buf, "low");
|
2010-06-25 18:54:37 +00:00
|
|
|
if (config.debug_quota) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[quota] Daily quota %d exceeded for app version %d\n",
|
|
|
|
havp->max_jobs_per_day, havp->app_version_id
|
|
|
|
);
|
|
|
|
}
|
2010-06-03 04:59:27 +00:00
|
|
|
g_reply->set_delay(DELAY_NO_WORK_CACHE);
|
2004-05-27 18:13:00 +00:00
|
|
|
}
|
2010-06-16 22:07:19 +00:00
|
|
|
if (g_wreq->max_jobs_on_host_exceeded
|
|
|
|
|| g_wreq->max_jobs_on_host_cpu_exceeded
|
|
|
|
|| g_wreq->max_jobs_on_host_gpu_exceeded
|
|
|
|
) {
|
|
|
|
sprintf(buf, "This computer has reached a limit on tasks in progress");
|
2010-06-08 18:56:53 +00:00
|
|
|
g_reply->insert_message(buf, "low");
|
2009-06-01 22:15:14 +00:00
|
|
|
g_reply->set_delay(DELAY_NO_WORK_CACHE);
|
2008-05-23 16:13:30 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-01-10 00:43:33 +00:00
|
|
|
static double clamp_req_sec(double x) {
|
|
|
|
if (x < MIN_REQ_SECS) return MIN_REQ_SECS;
|
|
|
|
if (x > MAX_REQ_SECS) return MAX_REQ_SECS;
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
2010-06-01 19:56:54 +00:00
|
|
|
// prepare to send jobs, both resent and new;
// decipher request type, fill in WORK_REQ
//
// Populates g_wreq (the WORK_REQ for this RPC) from g_request:
// clamps requested seconds, decides anonymous-platform status,
// computes host-app-version attributes and preference info,
// registers already-in-progress results against job limits,
// and (if debug flags are on) logs a summary of the request.
// NOTE: call order matters — e.g. get_reliability_and_trust() and
// get_prefs_info() must run before per-resource request handling.
//
void send_work_setup() {
    unsigned int i;

    // clamp the client's requested seconds into the allowed range
    g_wreq->seconds_to_fill = clamp_req_sec(g_request->work_req_seconds);
    g_wreq->req_secs[PROC_TYPE_CPU] = clamp_req_sec(g_request->cpu_req_secs);
    g_wreq->req_instances[PROC_TYPE_CPU] = g_request->cpu_req_instances;
    // anonymous platform is signaled via the first platform name
    g_wreq->anonymous_platform = is_anonymous(g_request->platforms.list[0]);

    // decide on attributes of HOST_APP_VERSIONS
    //
    get_reliability_and_trust();

    // parse project preferences (e.g. no GPUs)
    //
    get_prefs_info();

    if (g_wreq->anonymous_platform) {
        estimate_flops_anon_platform();

        // record which processor types the client's own app versions cover
        //
        for (i=0; i<NPROC_TYPES; i++) {
            g_wreq->client_has_apps_for_proc_type[i] = false;
        }
        for (i=0; i<g_request->client_app_versions.size(); i++) {
            CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
            int pt = cav.host_usage.proc_type;
            g_wreq->client_has_apps_for_proc_type[pt] = true;
        }
    }

    // index 0 is the CPU; GPU types start at 1
    for (i=1; i<NPROC_TYPES; i++) {
        gpu_requirements[i].clear();
    }

    g_wreq->disk_available = max_allowable_disk();
    get_mem_sizes();
    get_running_frac();
    g_wreq->get_job_limits();

    // do sanity checking on GPU scheduling parameters
    //
    for (i=1; i<NPROC_TYPES; i++) {
        COPROC* cp = g_request->coprocs.type_to_coproc(i);
        if (cp->count) {
            g_wreq->req_secs[i] = clamp_req_sec(cp->req_secs);
            g_wreq->req_instances[i] = cp->req_instances;
            // old clients may not report a per-GPU delay; fall back to CPU's
            if (cp->estimated_delay < 0) {
                cp->estimated_delay = g_request->cpu_estimated_delay;
            }
        }
    }

    // did the client make a per-resource ("spec") request at all?
    //
    g_wreq->rsc_spec_request = false;
    for (i=0; i<NPROC_TYPES; i++) {
        if (g_wreq->req_secs[i]) {
            g_wreq->rsc_spec_request = true;
            break;
        }
    }

    // count results already on the host against in-progress job limits;
    // figure out per-result whether it uses a GPU
    //
    for (i=0; i<g_request->other_results.size(); i++) {
        OTHER_RESULT& r = g_request->other_results[i];
        APP* app = NULL;
        bool uses_gpu = false;
        bool have_cav = false;
        if (r.app_version >= 0
            && r.app_version < (int)g_request->client_app_versions.size()
        ) {
            CLIENT_APP_VERSION& cav = g_request->client_app_versions[r.app_version];
            app = cav.app;
            if (app) {
                have_cav = true;
                uses_gpu = cav.host_usage.uses_gpu();
            }
        }
        // no matching client app version: fall back to the plan class
        if (!have_cav) {
            if (r.have_plan_class && app_plan_uses_gpu(r.plan_class)) {
                uses_gpu = true;
            }
        }
        config.max_jobs_in_progress.register_job(app, uses_gpu);
    }

    // print details of request to log
    //
    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] max jobs per RPC: %d\n", g_wreq->max_jobs_per_rpc
        );
        config.max_jobs_in_progress.print_log();
    }
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] %s matchmaker scheduling; %s EDF sim\n",
            config.matchmaker?"Using":"Not using",
            config.workload_sim?"Using":"Not using"
        );
        log_messages.printf(MSG_NORMAL,
            "[send] CPU: req %.2f sec, %.2f instances; est delay %.2f\n",
            g_wreq->req_secs[PROC_TYPE_CPU],
            g_wreq->req_instances[PROC_TYPE_CPU],
            g_request->cpu_estimated_delay
        );
        for (i=1; i<NPROC_TYPES; i++) {
            COPROC* cp = g_request->coprocs.type_to_coproc(i);
            if (cp->count) {
                log_messages.printf(MSG_NORMAL,
                    "[send] %s: req %.2f sec, %.2f instances; est delay %.2f\n",
                    proc_type_name(i),
                    g_wreq->req_secs[i],
                    g_wreq->req_instances[i],
                    cp->estimated_delay
                );
            }
        }
        log_messages.printf(MSG_NORMAL,
            "[send] work_req_seconds: %.2f secs\n",
            g_wreq->seconds_to_fill
        );
        log_messages.printf(MSG_NORMAL,
            "[send] available disk %.2f GB, work_buf_min %d\n",
            g_wreq->disk_available/GIGA,
            (int)g_request->global_prefs.work_buf_min()
        );
        log_messages.printf(MSG_NORMAL,
            "[send] active_frac %f on_frac %f\n",
            g_reply->host.active_frac,
            g_reply->host.on_frac
        );
        if (g_wreq->anonymous_platform) {
            log_messages.printf(MSG_NORMAL,
                "[send] Anonymous platform app versions:\n"
            );
            for (i=0; i<g_request->client_app_versions.size(); i++) {
                CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
                char buf[256];
                strcpy(buf, "");
                int pt = cav.host_usage.proc_type;
                // non-CPU proc type: describe the GPU usage too
                if (pt) {
                    sprintf(buf, " %.2f %s GPU",
                        cav.host_usage.gpu_usage,
                        proc_type_name(pt)
                    );
                }

                log_messages.printf(MSG_NORMAL,
                    " app: %s version %d cpus %.2f%s flops %fG\n",
                    cav.app_name,
                    cav.version_num,
                    cav.host_usage.avg_ncpus,
                    buf,
                    cav.host_usage.projected_flops/1e9
                );
            }
        }
#if 0
        log_messages.printf(MSG_NORMAL,
            "[send] p_vm_extensions_disabled: %s\n",
            g_request->host.p_vm_extensions_disabled?"yes":"no"
        );
#endif
        log_messages.printf(MSG_NORMAL,
            "[send] CPU features: %s\n", g_request->host.p_features
        );
    }
}
|
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// If a record is not in DB, create it.
|
2010-04-08 23:14:47 +00:00
|
|
|
//
|
2011-06-06 03:40:42 +00:00
|
|
|
int update_host_app_versions(vector<SCHED_DB_RESULT>& results, int hostid) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
vector<DB_HOST_APP_VERSION> new_havs;
|
2010-04-08 23:14:47 +00:00
|
|
|
unsigned int i, j;
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
for (i=0; i<results.size(); i++) {
|
|
|
|
RESULT& r = results[i];
|
|
|
|
int gavid = generalized_app_version_id(r.app_version_id, r.appid);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
|
2010-04-21 19:33:20 +00:00
|
|
|
if (!havp) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
bool found = false;
|
|
|
|
for (j=0; j<new_havs.size(); j++) {
|
|
|
|
DB_HOST_APP_VERSION& hav = new_havs[j];
|
|
|
|
if (hav.app_version_id == gavid) {
|
|
|
|
found = true;
|
2011-06-06 04:10:59 +00:00
|
|
|
hav.n_jobs_today++;
|
2010-04-08 23:14:47 +00:00
|
|
|
}
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
if (!found) {
|
|
|
|
DB_HOST_APP_VERSION hav;
|
2010-04-15 21:58:44 +00:00
|
|
|
hav.clear();
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
hav.host_id = hostid;
|
|
|
|
hav.app_version_id = gavid;
|
2011-06-06 04:10:59 +00:00
|
|
|
hav.n_jobs_today = 1;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
new_havs.push_back(hav);
|
|
|
|
}
|
2010-04-08 23:14:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// create new records
|
2010-04-08 23:14:47 +00:00
|
|
|
//
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
for (i=0; i<new_havs.size(); i++) {
|
|
|
|
DB_HOST_APP_VERSION& hav = new_havs[i];
|
|
|
|
|
|
|
|
retval = hav.insert();
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"hav.insert(): %s\n", boincerror(retval)
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
);
|
2010-04-08 23:14:47 +00:00
|
|
|
} else {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
if (config.debug_credit) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[credit] created host_app_version record (%d, %d)\n",
|
|
|
|
hav.host_id, hav.app_version_id
|
2010-04-08 23:14:47 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-05 23:08:53 +00:00
|
|
|
void send_work() {
|
2010-03-29 22:28:20 +00:00
|
|
|
int retval;
|
|
|
|
|
2010-06-16 22:07:19 +00:00
|
|
|
g_wreq->no_jobs_available = true;
|
|
|
|
|
2009-03-05 23:08:53 +00:00
|
|
|
if (all_apps_use_hr && hr_unknown_platform(g_request->host)) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"Not sending work because unknown HR class\n"
|
|
|
|
);
|
|
|
|
g_wreq->hr_reject_perm = true;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-04-26 23:34:38 +00:00
|
|
|
if (config.enable_assignment) {
|
2008-12-19 18:14:02 +00:00
|
|
|
if (send_assigned_jobs()) {
|
2008-04-26 23:34:38 +00:00
|
|
|
if (config.debug_assignment) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[assign] [HOST#%d] sent assigned jobs\n", g_reply->host.id
|
2008-04-26 23:34:38 +00:00
|
|
|
);
|
|
|
|
}
|
2010-04-08 18:27:27 +00:00
|
|
|
goto done;
|
2008-04-26 23:34:38 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-01-30 22:39:13 +00:00
|
|
|
if (config.enable_assignment_multi) {
|
|
|
|
if (send_assigned_jobs_multi()) {
|
|
|
|
if (config.debug_assignment) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[assign] [HOST#%d] sent assigned jobs\n", g_reply->host.id
|
|
|
|
);
|
|
|
|
}
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-16 16:29:54 +00:00
|
|
|
if (config.workload_sim && g_request->have_other_results_list) {
|
2008-04-26 23:34:38 +00:00
|
|
|
init_ip_results(
|
2009-06-22 21:11:19 +00:00
|
|
|
g_request->global_prefs.work_buf_min(),
|
|
|
|
g_wreq->effective_ncpus, g_request->ip_results
|
2008-04-26 23:34:38 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2012-09-01 04:58:12 +00:00
|
|
|
// send non-CPU-intensive jobs if needed
|
|
|
|
//
|
2012-09-19 23:21:04 +00:00
|
|
|
if (ssp->have_nci_app && g_request->work_req_seconds > 0) {
|
2012-09-01 04:58:12 +00:00
|
|
|
send_nci();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!work_needed(false)) {
|
|
|
|
send_user_messages();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-03-03 00:12:55 +00:00
|
|
|
if (config.locality_scheduler_fraction > 0) {
|
|
|
|
if (drand() < config.locality_scheduler_fraction) {
|
2010-04-06 20:32:02 +00:00
|
|
|
if (config.debug_locality) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[mixed] sending locality work first\n"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
send_work_locality();
|
|
|
|
if (config.debug_locality) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[mixed] sending non-locality work second\n"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
send_work_old();
|
2009-03-03 00:12:55 +00:00
|
|
|
} else {
|
2010-04-06 20:32:02 +00:00
|
|
|
if (config.debug_locality) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[mixed] sending non-locality work first\n"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
send_work_old();
|
|
|
|
if (config.debug_locality) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[mixed] sending locality work second\n"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
send_work_locality();
|
2009-03-03 00:12:55 +00:00
|
|
|
}
|
|
|
|
} else if (config.locality_scheduling) {
|
2008-12-19 18:14:02 +00:00
|
|
|
send_work_locality();
|
2008-05-23 16:13:30 +00:00
|
|
|
} else if (config.matchmaker) {
|
2013-04-09 18:10:50 +00:00
|
|
|
send_work_score();
|
2008-05-23 16:13:30 +00:00
|
|
|
} else {
|
2008-12-16 16:29:54 +00:00
|
|
|
send_work_old();
|
2008-04-26 23:34:38 +00:00
|
|
|
}
|
|
|
|
|
2010-04-08 18:27:27 +00:00
|
|
|
done:
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
retval = update_host_app_versions(g_reply->results, g_reply->host.id);
|
2010-03-29 22:28:20 +00:00
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"update_host_app_versions() failed: %s\n", boincerror(retval)
|
2010-03-29 22:28:20 +00:00
|
|
|
);
|
|
|
|
}
|
2010-06-16 22:07:19 +00:00
|
|
|
send_user_messages();
|
2008-04-26 23:34:38 +00:00
|
|
|
}
|
|
|
|
|
2005-01-02 18:29:53 +00:00
|
|
|
const char *BOINC_RCSID_32dcd335e7 = "$Id$";
|