// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

// scheduler code related to sending jobs.
// NOTE: there should be nothing here specific to particular
// scheduling policies (array scan, matchmaking, locality)

#include "config.h"
#include <vector>
#include <list>
#include <string>
#include <ctime>
#include <cstdio>
#include <cstring>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

#include "error_numbers.h"
#include "parse.h"
#include "util.h"
#include "str_util.h"
#include "synch.h"

#include "credit.h"
#include "sched_types.h"
#include "sched_shmem.h"
#include "sched_config.h"
#include "sched_util.h"
#include "sched_main.h"
#include "sched_array.h"
#include "sched_msgs.h"
#include "sched_hr.h"
#include "hr.h"
#include "sched_locality.h"
#include "sched_timezone.h"
#include "sched_assign.h"
#include "sched_customize.h"
#include "sched_version.h"

#include "sched_send.h"

#ifdef _USING_FCGI_
#include "boinc_fcgi.h"
#endif
// if host sends us an impossible RAM size, use this instead
//
const double DEFAULT_RAM_SIZE = 64000000;

void send_work_matchmaker();

int preferred_app_message_index = 0;
const char* infeasible_string(int code) {
    switch (code) {
    case INFEASIBLE_MEM: return "Not enough memory";
    case INFEASIBLE_DISK: return "Not enough disk";
    case INFEASIBLE_CPU: return "CPU too slow";
    case INFEASIBLE_APP_SETTING: return "App not selected";
    case INFEASIBLE_WORKLOAD: return "Existing workload";
    case INFEASIBLE_DUP: return "Already in reply";
    case INFEASIBLE_HR: return "Homogeneous redundancy";
    case INFEASIBLE_BANDWIDTH: return "Download bandwidth too low";
    }
    return "Unknown";
}
const double MIN_REQ_SECS = 0;
const double MAX_REQ_SECS = (28*SECONDS_IN_DAY);

const int MAX_GPUS = 8;
    // don't believe clients who claim they have more GPUs than this

// get limits on:
// # jobs per day
// # jobs per RPC
// # jobs in progress
//
void WORK_REQ::get_job_limits() {
    int n;
    n = g_reply->host.p_ncpus;
    if (g_request->global_prefs.max_ncpus_pct && g_request->global_prefs.max_ncpus_pct < 100) {
        n = (int)((n*g_request->global_prefs.max_ncpus_pct)/100.);
    }
    if (n > config.max_ncpus) n = config.max_ncpus;
    if (n < 1) n = 1;
    effective_ncpus = n;

    n = g_request->coprocs.cuda.count + g_request->coprocs.ati.count;
    if (n > MAX_GPUS) n = MAX_GPUS;
    effective_ngpus = n;

    int mult = effective_ncpus + config.gpu_multiplier * effective_ngpus;
    if (config.non_cpu_intensive) {
        mult = 1;
        effective_ncpus = 1;
        if (effective_ngpus) effective_ngpus = 1;
    }

    if (config.max_wus_to_send) {
        g_wreq->max_jobs_per_rpc = mult * config.max_wus_to_send;
    } else {
        g_wreq->max_jobs_per_rpc = 999999;
    }

    config.max_jobs_in_progress.reset(g_reply->host, g_request->coprocs);
}
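
// Illustrative example (hypothetical numbers): a host reporting 8 CPUs
// with max_ncpus_pct = 50 gets effective_ncpus = 4; one GPU with
// config.gpu_multiplier = 8 gives mult = 4 + 8*1 = 12, so with
// <max_wus_to_send>2</max_wus_to_send> in config.xml a single RPC
// can return at most 24 jobs.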
static const char* find_user_friendly_name(int appid) {
    APP* app = ssp->lookup_app(appid);
    if (app) return app->user_friendly_name;
    return "deprecated application";
}
// Compute the max additional disk usage we can impose on the host.
// Depending on the client version, it can either send us
// - d_total and d_free (pre 4 oct 2005)
// - the above plus d_boinc_used_total and d_boinc_used_project
//
double max_allowable_disk() {
    HOST host = g_request->host;
    GLOBAL_PREFS prefs = g_request->global_prefs;
    double x1, x2, x3, x;

    // defaults are from config.xml;
    // if not there, these are used:
    // - default_max_used_gb = 100
    // - default_max_used_pct = 50
    // - default_min_free_gb = .001
    //
    if (prefs.disk_max_used_gb == 0) {
        prefs.disk_max_used_gb = config.default_disk_max_used_gb;
    }
    if (prefs.disk_max_used_pct == 0) {
        prefs.disk_max_used_pct = config.default_disk_max_used_pct;
    }
    if (prefs.disk_min_free_gb < config.default_disk_min_free_gb) {
        prefs.disk_min_free_gb = config.default_disk_min_free_gb;
    }

    // no defaults for total/free disk space (host.d_total, d_free);
    // if they're zero, the client will get no work.
    //

    if (host.d_boinc_used_total) {
        // The post 4 oct 2005 case.
        // Compute the max allowable additional disk usage based on prefs
        //
        x1 = prefs.disk_max_used_gb*GIGA - host.d_boinc_used_total;
        x2 = host.d_total*prefs.disk_max_used_pct/100.
            - host.d_boinc_used_total;
        x3 = host.d_free - prefs.disk_min_free_gb*GIGA;     // may be negative
        x = std::min(x1, std::min(x2, x3));

        // see which bound is the most stringent
        //
        if (x==x1) {
            g_reply->disk_limits.max_used = x;
        } else if (x==x2) {
            g_reply->disk_limits.max_frac = x;
        } else {
            g_reply->disk_limits.min_free = x;
        }
    } else {
        // here we don't know how much space BOINC is using,
        // so we're kinda screwed.
        // All we can do is assume that BOINC is using zero space.
        // We can't honor the max_used or max_used_pct preferences.
        // We can only honor the min_free pref.
        //
        x = host.d_free - prefs.disk_min_free_gb*GIGA;      // may be negative
        g_reply->disk_limits.min_free = x;
        x1 = x2 = x3 = 0;
    }

    if (x < 0) {
        if (config.debug_send) {
            // note: disk_max_used_gb and disk_min_free_gb are already in GB,
            // so they're printed directly (dividing by GIGA was a bug)
            //
            log_messages.printf(MSG_NORMAL,
                "[send] No disk space available: disk_max_used_gb %.2fGB disk_max_used_pct %.2f disk_min_free_gb %.2fGB\n",
                prefs.disk_max_used_gb,
                prefs.disk_max_used_pct,
                prefs.disk_min_free_gb
            );
            log_messages.printf(MSG_NORMAL,
                "[send] No disk space available: host.d_total %.2fGB host.d_free %.2fGB host.d_boinc_used_total %.2fGB\n",
                host.d_total/GIGA,
                host.d_free/GIGA,
                host.d_boinc_used_total/GIGA
            );
            log_messages.printf(MSG_NORMAL,
                "[send] No disk space available: x1 %.2fGB x2 %.2fGB x3 %.2fGB x %.2fGB\n",
                x1/GIGA, x2/GIGA, x3/GIGA, x/GIGA
            );
        }
        g_wreq->disk.set_insufficient(-x);
        x = 0;
    }
    return x;
}
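
// Illustrative example (hypothetical numbers): with prefs
// disk_max_used_gb = 100, disk_max_used_pct = 50, disk_min_free_gb = 1,
// a host with d_total = 400 GB, d_free = 10 GB and
// d_boinc_used_total = 20 GB gives
//   x1 = 100 GB - 20 GB        = 80 GB
//   x2 = 400 GB * 0.50 - 20 GB = 180 GB
//   x3 = 10 GB - 1 GB          = 9 GB
// so x = 9 GB: the min_free preference is the binding constraint.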
static double estimate_duration_unscaled(WORKUNIT& wu, BEST_APP_VERSION& bav) {
    double rsc_fpops_est = wu.rsc_fpops_est;
    if (rsc_fpops_est <= 0) rsc_fpops_est = 1e12;
    return rsc_fpops_est/bav.host_usage.projected_flops;
}
static inline void get_running_frac() {
    double rf;
    if (g_request->core_client_version<=41900) {
        rf = g_reply->host.on_frac;
    } else {
        rf = g_reply->host.active_frac * g_reply->host.on_frac;
    }

    // clamp running_frac to a reasonable range
    //
    if (rf > 1) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL, "running_frac=%f; setting to 1\n", rf);
        }
        rf = 1;
    } else if (rf < .1) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL, "running_frac=%f; setting to 0.1\n", rf);
        }
        rf = .1;
    }
    g_wreq->running_frac = rf;
}
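
// Illustrative example (hypothetical numbers): a post-4.19 client
// reporting on_frac = 0.8 (powered on 80% of the time) and
// active_frac = 0.9 (allowed to compute 90% of that) yields
// rf = 0.72, which lies within the [0.1, 1] clamp and is used as-is.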
// estimate the amount of real time to complete this WU,
// taking into account active_frac etc.
// Note: don't factor in resource_share_fraction.
// The core client doesn't necessarily round-robin across all projects.
//
double estimate_duration(WORKUNIT& wu, BEST_APP_VERSION& bav) {
    double edu = estimate_duration_unscaled(wu, bav);
    double ed = edu/g_wreq->running_frac;
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] est. duration for WU %d: unscaled %.2f scaled %.2f\n",
            wu.id, edu, ed
        );
    }
    return ed;
}
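
// Illustrative example (hypothetical numbers): a WU with
// rsc_fpops_est = 3.6e13 on an app version with
// projected_flops = 1e10 has an unscaled estimate of 3600 s;
// with running_frac = 0.5 the scaled wall-clock estimate is 7200 s.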
static void get_prefs_info() {
    char buf[8096];
    std::string str;
    unsigned int pos = 0;
    int temp_int;
    bool flag;

    extract_venue(g_reply->user.project_prefs, g_reply->host.venue, buf);
    str = buf;

    // scan user's project prefs for elements of the form <app_id>N</app_id>,
    // indicating the apps they want to run.
    //
    g_wreq->preferred_apps.clear();
    while (parse_int(str.substr(pos,str.length()-pos).c_str(), "<app_id>", temp_int)) {
        APP_INFO ai;
        ai.appid = temp_int;
        ai.work_available = false;
        g_wreq->preferred_apps.push_back(ai);

        pos = str.find("<app_id>", pos) + 1;
    }
    if (parse_bool(buf,"allow_non_preferred_apps", flag)) {
        g_wreq->allow_non_preferred_apps = flag;
    }
    if (parse_bool(buf,"allow_beta_work", flag)) {
        g_wreq->allow_beta_work = flag;
    }
    if (parse_bool(buf,"no_gpus", flag)) {
        // deprecated, but need to handle
        if (flag) {
            g_wreq->no_cuda = true;
            g_wreq->no_ati = true;
        }
    }
    if (parse_bool(buf,"no_cpu", flag)) {
        g_wreq->no_cpu = flag;
    }
    if (parse_bool(buf,"no_cuda", flag)) {
        g_wreq->no_cuda = flag;
    }
    if (parse_bool(buf,"no_ati", flag)) {
        g_wreq->no_ati = flag;
    }
}
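
// Illustrative input (hypothetical): if the user's project prefs contain
//   <app_id>12</app_id><app_id>15</app_id>
//   <allow_beta_work>1</allow_beta_work>
//   <no_cuda>1</no_cuda>
// then preferred_apps ends up holding appids {12, 15}
// (each with work_available initially false),
// allow_beta_work is true, and no_cuda is true.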
// Decide whether or not this app version is 'reliable'.
// An app version is reliable if the following conditions are true
// (for those that are set in the config file):
// 1) the host average turnaround is less than a threshold
// 2) consecutive_valid is above a threshold
// 3) the host results per day is equal to the max value
//
void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) {
    if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) {
        if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n",
                    hav.app_version_id,
                    hav.turnaround.get_avg()/3600,
                    config.reliable_max_avg_turnaround*multiplier/3600
                );
            }
            hav.reliable = false;
            return;
        }
    }
    if (hav.consecutive_valid < CONS_VALID_RELIABLE) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] [AV#%d] not reliable; cons valid %d < %d\n",
                hav.app_version_id,
                hav.consecutive_valid, CONS_VALID_RELIABLE
            );
        }
        hav.reliable = false;
        return;
    }
    if (config.daily_result_quota) {
        if (hav.max_jobs_per_day < config.daily_result_quota) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [AV#%d] not reliable; max_jobs_per_day %d < %d\n",
                    hav.app_version_id,
                    hav.max_jobs_per_day,
                    config.daily_result_quota
                );
            }
            hav.reliable = false;
            return;
        }
    }
    hav.reliable = true;
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] [HOST#%d] app version %d is reliable\n",
            g_reply->host.id, hav.app_version_id
        );
    }
    g_wreq->has_reliable_version = true;
}
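
// Illustrative example (hypothetical numbers): with
// reliable_max_avg_turnaround = 86400 (one day), multiplier = 1.0,
// and CONS_VALID_RELIABLE = 10, a (host, app version) with an average
// turnaround of 12 hours, 15 consecutive valid results, and a
// max_jobs_per_day still at its configured maximum is marked reliable.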
// decide whether to do unreplicated jobs with this app version
//
static void set_trust(DB_HOST_APP_VERSION& hav) {
    hav.trusted = false;
    if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) {
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] set_trust: cons valid %d < %d, don't use single replication\n",
                hav.consecutive_valid, CONS_VALID_UNREPLICATED
            );
        }
        return;
    }
    double x = 1./hav.consecutive_valid;
    if (drand() > x) hav.trusted = true;
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] set_trust: random choice for cons valid %d: %s\n",
            hav.consecutive_valid, hav.trusted?"yes":"no"
        );
    }
}
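
// Illustrative example: once consecutive_valid reaches
// CONS_VALID_UNREPLICATED, trust is granted with probability
// 1 - 1/consecutive_valid; e.g. at consecutive_valid = 20 a job is
// sent unreplicated ~95% of the time, so occasional replicated jobs
// still spot-check the host.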
static void update_quota(DB_HOST_APP_VERSION& hav) {
    if (config.daily_result_quota) {
        if (hav.max_jobs_per_day == 0 || hav.max_jobs_per_day > config.daily_result_quota) {
            hav.max_jobs_per_day = config.daily_result_quota;
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [HAV#%d] Initializing max_jobs_per_day to %d\n",
                    hav.app_version_id,
                    config.daily_result_quota
                );
            }
        }
    }

    if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) {
        log_messages.printf(MSG_DEBUG,
            "[HOST#%d] [HAV#%d] Resetting njobs_today\n",
            g_reply->host.id, hav.app_version_id
        );
        hav.n_jobs_today = 0;
    }
}
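
// Illustrative example (hypothetical config): with
// <daily_result_quota>100</daily_result_quota>, a new
// host_app_version starts at max_jobs_per_day = 100; the back end
// later adjusts it per outcome (raised on valid results, lowered on
// errors and timeouts), and n_jobs_today is zeroed on the first RPC
// of each new day.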
static void get_reliability_and_trust() {
    // Platforms other than Windows, Linux and Intel Macs need a
    // larger set of computers to be marked reliable
    //
    double multiplier = 1.0;
    if (strstr(g_reply->host.os_name,"Windows")
        || strstr(g_reply->host.os_name,"Linux")
        || (strstr(g_reply->host.os_name,"Darwin")
            && !(strstr(g_reply->host.p_vendor,"Power Macintosh"))
    )) {
        multiplier = 1.0;
    } else {
        multiplier = 1.8;
    }

    for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
        DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
        get_reliability_version(hav, multiplier);
        set_trust(hav);
        update_quota(hav);
    }
}
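
// Illustrative example: a host running FreeBSD matches none of the
// strings above, so it gets multiplier = 1.8 and its average turnaround
// may be up to 1.8x the configured reliable_max_avg_turnaround while
// still qualifying as reliable.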
// Return true if the user has set application preferences,
// and this job is not for a selected app
//
bool app_not_selected(WORKUNIT& wu) {
    unsigned int i;

    if (g_wreq->preferred_apps.size() == 0) return false;
    for (i=0; i<g_wreq->preferred_apps.size(); i++) {
        if (wu.appid == g_wreq->preferred_apps[i].appid) {
            g_wreq->preferred_apps[i].work_available = true;
            return false;
        }
    }
    return true;
}
// see how much RAM we can use on this machine
//
static inline void get_mem_sizes() {
    g_wreq->ram = g_reply->host.m_nbytes;
    if (g_wreq->ram <= 0) g_wreq->ram = DEFAULT_RAM_SIZE;
    g_wreq->usable_ram = g_wreq->ram;
    double busy_frac = g_request->global_prefs.ram_max_used_busy_frac;
    double idle_frac = g_request->global_prefs.ram_max_used_idle_frac;
    double frac = 1;
    if (busy_frac>0 && idle_frac>0) {
        frac = std::max(busy_frac, idle_frac);
        if (frac > 1) frac = 1;
        g_wreq->usable_ram *= frac;
    }
}
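
// Illustrative example (hypothetical numbers): a host with 4 GB RAM,
// ram_max_used_busy_frac = 0.5 and ram_max_used_idle_frac = 0.9 gets
// frac = max(0.5, 0.9) = 0.9, i.e. usable_ram = 3.6 GB; the more
// permissive of the two prefs is taken as the bound.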
2008-12-16 16:29:54 +00:00
|
|
|
static inline int check_memory(WORKUNIT& wu) {
|
2009-01-29 20:42:45 +00:00
|
|
|
double diff = wu.rsc_memory_bound - g_wreq->usable_ram;
|
2007-08-16 17:33:41 +00:00
|
|
|
if (diff > 0) {
|
2008-03-07 21:13:01 +00:00
|
|
|
char message[256];
|
|
|
|
sprintf(message,
|
|
|
|
"%s needs %0.2f MB RAM but only %0.2f MB is available for use.",
|
|
|
|
find_user_friendly_name(wu.appid),
|
2009-01-29 20:42:45 +00:00
|
|
|
wu.rsc_memory_bound/MEGA, g_wreq->usable_ram/MEGA
|
2008-03-07 21:13:01 +00:00
|
|
|
);
|
2009-08-21 20:38:39 +00:00
|
|
|
add_no_work_message(message);
|
2009-06-01 22:15:14 +00:00
|
|
|
|
2008-04-26 23:34:38 +00:00
|
|
|
if (config.debug_send) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] [WU#%d %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n",
|
2008-04-26 23:34:38 +00:00
|
|
|
wu.id, wu.name, wu.rsc_memory_bound/MEGA,
|
2009-01-29 20:42:45 +00:00
|
|
|
g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA
|
2008-04-26 23:34:38 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
g_wreq->mem.set_insufficient(wu.rsc_memory_bound);
|
|
|
|
g_reply->set_delay(DELAY_NO_WORK_TEMP);
|
2007-05-30 17:25:51 +00:00
|
|
|
return INFEASIBLE_MEM;
|
2005-02-08 19:54:10 +00:00
|
|
|
}
|
2007-05-30 17:25:51 +00:00
|
|
|
return 0;
|
2007-05-09 17:45:18 +00:00
|
|
|
}

static inline int check_disk(WORKUNIT& wu) {
    double diff = wu.rsc_disk_bound - g_wreq->disk_available;
    if (diff > 0) {
        char message[256];
        sprintf(message,
            "%s needs %0.2f MB more disk space. You currently have %0.2f MB available and it needs %0.2f MB.",
            find_user_friendly_name(wu.appid),
            diff/MEGA, g_wreq->disk_available/MEGA, wu.rsc_disk_bound/MEGA
        );
        add_no_work_message(message);

        g_wreq->disk.set_insufficient(diff);
        return INFEASIBLE_DISK;
    }
    return 0;
}

static inline int check_bandwidth(WORKUNIT& wu) {
    if (wu.rsc_bandwidth_bound == 0) return 0;

    // if n_bwdown is zero, the host has never downloaded anything,
    // so skip this check
    //
    if (g_reply->host.n_bwdown == 0) return 0;

    double diff = wu.rsc_bandwidth_bound - g_reply->host.n_bwdown;
    if (diff > 0) {
        char message[256];
        sprintf(message,
            "%s requires %0.2f KB/sec download bandwidth. Your computer has been measured at %0.2f KB/sec.",
            find_user_friendly_name(wu.appid),
            wu.rsc_bandwidth_bound/KILO, g_reply->host.n_bwdown/KILO
        );
        add_no_work_message(message);

        g_wreq->bandwidth.set_insufficient(diff);
        return INFEASIBLE_BANDWIDTH;
    }
    return 0;
}

// Determine if the app is "hard",
// and we should send it only to high-end hosts.
// Currently this is specified by setting weight=-1;
// this is a kludge for SETI@home/Astropulse.
//
static inline bool hard_app(APP& app) {
    return (app.weight == -1);
}

static inline double get_estimated_delay(BEST_APP_VERSION& bav) {
    if (bav.host_usage.ncudas) {
        return g_request->coprocs.cuda.estimated_delay;
    } else if (bav.host_usage.natis) {
        return g_request->coprocs.ati.estimated_delay;
    } else {
        return g_request->cpu_estimated_delay;
    }
}

static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) {
    if (bav.host_usage.ncudas) {
        g_request->coprocs.cuda.estimated_delay += dt;
    } else if (bav.host_usage.natis) {
        g_request->coprocs.ati.estimated_delay += dt;
    } else {
        g_request->cpu_estimated_delay += dt;
    }
}

// return the delay bound to use for this job/host.
// Actually, return two: optimistic (lower) and pessimistic (higher).
// If the deadline check with the optimistic bound fails,
// try the pessimistic bound.
//
static void get_delay_bound_range(
    WORKUNIT& wu,
    int res_server_state, int res_priority, double res_report_deadline,
    BEST_APP_VERSION& bav,
    double& opt, double& pess
) {
    if (res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
        double now = dtime();
        if (res_report_deadline < now) {
            // if the original deadline has passed, return zeros.
            // This will skip the deadline check.
            //
            opt = pess = 0;
            return;
        }
        opt = res_report_deadline - now;
        pess = wu.delay_bound;
    } else {
        opt = pess = wu.delay_bound;

        // If the workunit needs a reliable host and is being sent to one,
        // shorten the delay bound by the specified factor
        //
        if (config.reliable_on_priority
            && res_priority >= config.reliable_on_priority
            && config.reliable_reduced_delay_bound > 0.01
        ) {
            opt = wu.delay_bound*config.reliable_reduced_delay_bound;
            double est_wallclock_duration = estimate_duration(wu, bav);

            // Check how reasonable this reduced delay bound is.
            // Increase it to twice the estimated duration
            // if all the following apply:
            //
            // 1) twice the estimate is longer than the reduced delay bound
            // 2) twice the estimate is less than the original delay bound
            // 3) twice the estimate is less than twice the reduced delay bound
            //
            if (est_wallclock_duration*2 > opt
                && est_wallclock_duration*2 < wu.delay_bound
                && est_wallclock_duration*2 < wu.delay_bound*config.reliable_reduced_delay_bound*2
            ) {
                opt = est_wallclock_duration*2;
            }
        }
    }
}
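
// Worked example (illustrative numbers only): delay_bound = 14 days and
// reliable_reduced_delay_bound = 0.5 give opt = 7 days, pess = 14 days.
// If estimate_duration() returns 4 days, then 2*est = 8 days, which is
// greater than opt (7), less than delay_bound (14), and less than
// 2*opt (14), so opt is raised to 8 days; pess keeps the full bound.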

// return 0 if the job, with the given delay bound,
// will complete by its deadline and won't cause other jobs to miss deadlines.
//
static inline int check_deadline(
    WORKUNIT& wu, APP& app, BEST_APP_VERSION& bav
) {
    if (config.ignore_delay_bound) return 0;

    // skip the delay check if the host currently doesn't have any work
    // and it's not a hard app
    // (i.e. everyone gets one result, no matter how slow they are)
    //
    if (get_estimated_delay(bav) == 0 && !hard_app(app)) return 0;

    // if it's a hard app, don't send it to a host with no credit
    //
    if (hard_app(app) && g_reply->host.total_credit == 0) {
        return INFEASIBLE_CPU;
    }

    // do EDF simulation if possible; else use a cruder approximation
    //
    if (config.workload_sim && g_request->have_other_results_list) {
        double est_dur = estimate_duration(wu, bav);
        if (g_reply->wreq.edf_reject_test(est_dur, wu.delay_bound)) {
            return INFEASIBLE_WORKLOAD;
        }
        IP_RESULT candidate("", wu.delay_bound, est_dur);
        strcpy(candidate.name, wu.name);
        if (check_candidate(candidate, g_wreq->effective_ncpus, g_request->ip_results)) {
            // it passed the feasibility test,
            // but don't add it to the workload yet;
            // wait until we commit to sending it
        } else {
            g_reply->wreq.edf_reject(est_dur, wu.delay_bound);
            g_reply->wreq.speed.set_insufficient(0);
            return INFEASIBLE_WORKLOAD;
        }
    } else {
        double ewd = estimate_duration(wu, bav);
        if (hard_app(app)) ewd *= 1.3;
        double est_completion_delay = get_estimated_delay(bav) + ewd;
        double est_report_delay = std::max(
            est_completion_delay,
            g_request->global_prefs.work_buf_min()
        );
        double diff = est_report_delay - wu.delay_bound;
        if (diff > 0) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [WU#%d] deadline miss %d > %d\n",
                    wu.id, (int)est_report_delay, wu.delay_bound
                );
            }
            g_reply->wreq.speed.set_insufficient(diff);
            return INFEASIBLE_CPU;
        } else {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [WU#%d] meets deadline: %.2f + %.2f < %d\n",
                    wu.id, get_estimated_delay(bav), ewd, wu.delay_bound
                );
            }
        }
    }
    return 0;
}
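
// The non-EDF branch above reduces to this test (sketch; ewd is the
// estimated wall-clock duration of the job, inflated 1.3x for hard apps):
//
//     est_report_delay = max(get_estimated_delay(bav) + ewd, work_buf_min)
//     infeasible iff est_report_delay > wu.delay_bound
//
// i.e. the job is rejected if, queued behind the host's existing work
// (or held until the host's next scheduled connect), it would be reported
// after its deadline.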

// Fast checks (no DB access) to see if the job can be sent to the host.
// Reasons why not include:
// 1) the host doesn't have enough memory;
// 2) the host doesn't have enough disk space;
// 3) based on CPU speed, resource share and estimated delay,
//    the host probably won't get the result done within the delay bound;
// 4) the app isn't in the user's "approved apps" list.
//
// If the job is feasible, return 0 and fill in wu.delay_bound
// with the delay bound we've decided to use.
//
int wu_is_infeasible_fast(
    WORKUNIT& wu,
    int res_server_state, int res_priority, double res_report_deadline,
    APP& app, BEST_APP_VERSION& bav
) {
    int retval;

    // project-specific check
    //
    if (wu_is_infeasible_custom(wu, app, bav)) {
        return INFEASIBLE_CUSTOM;
    }

    // homogeneous redundancy: can't send if the app uses HR and
    // 1) the host is of unknown HR class, or
    // 2) the WU was already sent to a host of a different HR class
    //
    if (app_hr_type(app)) {
        if (hr_unknown_class(g_reply->host, app_hr_type(app))) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [HOST#%d] [WU#%d %s] host is of unknown class in HR type %d\n",
                    g_reply->host.id, wu.id, wu.name, app_hr_type(app)
                );
            }
            return INFEASIBLE_HR;
        }
        if (already_sent_to_different_platform_quick(wu, app)) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [HOST#%d] [WU#%d %s] failed quick HR check: WU is class %d, host is class %d\n",
                    g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app))
                );
            }
            return INFEASIBLE_HR;
        }
    }

    if (config.one_result_per_user_per_wu || config.one_result_per_host_per_wu) {
        if (wu_already_in_reply(wu)) {
            return INFEASIBLE_DUP;
        }
    }

    retval = check_memory(wu);
    if (retval) return retval;
    retval = check_disk(wu);
    if (retval) return retval;
    retval = check_bandwidth(wu);
    if (retval) return retval;

    if (config.non_cpu_intensive) {
        return 0;
    }

    // do the deadline check last because EDF simulation uses some CPU
    //
    double opt, pess;
    get_delay_bound_range(
        wu, res_server_state, res_priority, res_report_deadline, bav, opt, pess
    );
    wu.delay_bound = (int)opt;
    if (opt == 0) {
        // this is a resend; skip the deadline check
        return 0;
    }
    retval = check_deadline(wu, app, bav);
    if (retval && (opt != pess)) {
        wu.delay_bound = (int)pess;
        retval = check_deadline(wu, app, bav);
    }
    return retval;
}
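
// Caller sketch (hypothetical; the real scan loops live elsewhere, e.g. the
// array and locality schedulers): the point of these checks being "fast" is
// that they run before any per-job DB access, so infeasible jobs can be
// discarded cheaply. The candidates name below is illustrative.
//
#if 0
for (unsigned int i=0; i<candidates.size(); i++) {
    WORKUNIT& wu = candidates[i].wu;
    RESULT& r = candidates[i].res;
    int retval = wu_is_infeasible_fast(
        wu, r.server_state, r.priority, r.report_deadline, app, bav
    );
    if (retval) continue;   // cheap rejection, no DB round trip
    // ... slow (DB) checks, then add_result_to_reply() ...
}
#endif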

// insert "text" right after "after" in the given buffer
//
static int insert_after(char* buffer, const char* after, const char* text) {
    char* p;
    char temp[BLOB_SIZE];

    if (strlen(buffer) + strlen(text) > BLOB_SIZE-1) {
        log_messages.printf(MSG_CRITICAL,
            "insert_after: overflow: %d %d\n",
            (int)strlen(buffer), (int)strlen(text)
        );
        return ERR_BUFFER_OVERFLOW;
    }
    p = strstr(buffer, after);
    if (!p) {
        log_messages.printf(MSG_CRITICAL,
            "insert_after: %s not found in %s\n", after, buffer
        );
        return ERR_NULL;
    }
    p += strlen(after);
    strcpy(temp, p);
    strcpy(p, text);
    strcat(p, temp);
    return 0;
}
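
// Usage sketch: with buffer = "<workunit>\n</workunit>\n",
// after = "<workunit>\n" and text = "<name>wu_1</name>\n", the buffer
// becomes "<workunit>\n<name>wu_1</name>\n</workunit>\n".
// The overflow and not-found cases are reported and returned as errors
// before the buffer is modified, so a failed call leaves it intact.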

// add elements to the WU's xml_doc,
// in preparation for sending it to a client
//
static int insert_wu_tags(WORKUNIT& wu, APP& app) {
    char buf[BLOB_SIZE];

    sprintf(buf,
        "    <rsc_fpops_est>%f</rsc_fpops_est>\n"
        "    <rsc_fpops_bound>%f</rsc_fpops_bound>\n"
        "    <rsc_memory_bound>%f</rsc_memory_bound>\n"
        "    <rsc_disk_bound>%f</rsc_disk_bound>\n"
        "    <name>%s</name>\n"
        "    <app_name>%s</app_name>\n",
        wu.rsc_fpops_est,
        wu.rsc_fpops_bound,
        wu.rsc_memory_bound,
        wu.rsc_disk_bound,
        wu.name,
        app.name
    );
    return insert_after(wu.xml_doc, "<workunit>\n", buf);
}
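
// For a workunit named "wu_1" of app "setiathome" the call above turns
// "<workunit>\n..." into roughly the following (illustrative values;
// %f prints six decimal places):
//
//   <workunit>
//       <rsc_fpops_est>1000000000000.000000</rsc_fpops_est>
//       <rsc_fpops_bound>10000000000000.000000</rsc_fpops_bound>
//       <rsc_memory_bound>100000000.000000</rsc_memory_bound>
//       <rsc_disk_bound>500000000.000000</rsc_disk_bound>
//       <name>wu_1</name>
//       <app_name>setiathome</app_name>
//       ...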

// Add the given workunit, app, and app version to a reply.
//
static int add_wu_to_reply(
    WORKUNIT& wu, SCHEDULER_REPLY& reply, APP* app, BEST_APP_VERSION* bavp
) {
    int retval;
    WORKUNIT wu2, wu3;

    APP_VERSION* avp = bavp->avp;

    // add the app, app_version, and workunit to the reply,
    // but only if they aren't already there
    //
    if (avp) {
        APP_VERSION av2=*avp, *avp2=&av2;

        if (strlen(config.replace_download_url_by_timezone)) {
            process_av_timezone(avp, av2);
        }

        g_reply->insert_app_unique(*app);
        av2.bavp = bavp;
        g_reply->insert_app_version_unique(*avp2);
        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] Sending app_version %s %d %d %s; projected %.2f GFLOPS\n",
                app->name,
                avp2->platformid, avp2->version_num, avp2->plan_class,
                bavp->host_usage.projected_flops/1e9
            );
        }
    }

    // modify the WU's xml_doc; add <name>, <rsc_*> etc.
    //
    wu2 = wu;    // make a copy, since we're going to modify its XML field

    // adjust FPOPS figures for anonymous platform
    //
    if (bavp->cavp) {
        wu2.rsc_fpops_est *= bavp->cavp->rsc_fpops_scale;
        wu2.rsc_fpops_bound *= bavp->cavp->rsc_fpops_scale;
    }
    retval = insert_wu_tags(wu2, *app);
    if (retval) {
        log_messages.printf(MSG_CRITICAL, "insert_wu_tags failed %d\n", retval);
        return retval;
    }
    wu3 = wu2;
    if (strlen(config.replace_download_url_by_timezone)) {
        process_wu_timezone(wu2, wu3);
    }

    g_reply->insert_workunit_unique(wu3);

    // switch to tighter policy for estimating delay
    //
    return 0;
}

// add <name> tags to the result's xml_doc_in
//
static int insert_name_tags(RESULT& result, WORKUNIT const& wu) {
    char buf[256];
    int retval;

    sprintf(buf, "<name>%s</name>\n", result.name);
    retval = insert_after(result.xml_doc_in, "<result>\n", buf);
    if (retval) return retval;
    sprintf(buf, "<wu_name>%s</wu_name>\n", wu.name);
    retval = insert_after(result.xml_doc_in, "<result>\n", buf);
    if (retval) return retval;
    return 0;
}

static int insert_deadline_tag(RESULT& result) {
    char buf[256];
    sprintf(buf, "<report_deadline>%d</report_deadline>\n", result.report_deadline);
    int retval = insert_after(result.xml_doc_in, "<result>\n", buf);
    if (retval) return retval;
    return 0;
}

int update_wu_transition_time(WORKUNIT wu, time_t x) {
    DB_WORKUNIT dbwu;
    char buf[256];

    dbwu.id = wu.id;

    // SQL note: can't use min() here
    //
    sprintf(buf,
        "transition_time=if(transition_time<%d, transition_time, %d)",
        (int)x, (int)x
    );
    return dbwu.update_field(buf);
}
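
// Example of the generated clause (illustrative): for x = 1270000000,
//
//   transition_time=if(transition_time<1270000000, transition_time, 1270000000)
//
// which is transition_time = min(transition_time, x). The if() form is
// needed because MySQL's MIN() is an aggregate over rows, not a scalar
// two-argument minimum.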

// return true iff a result for the same WU is already being sent
//
bool wu_already_in_reply(WORKUNIT& wu) {
    unsigned int i;
    for (i=0; i<g_reply->results.size(); i++) {
        if (wu.id == g_reply->results[i].workunitid) {
            return true;
        }
    }
    return false;
}

void lock_sema() {
    lock_semaphore(sema_key);
}

void unlock_sema() {
    unlock_semaphore(sema_key);
}
2005-02-06 21:26:21 +00:00
|
|
|
// return true if additional work is needed,
|
|
|
|
// and there's disk space left,
|
|
|
|
// and we haven't exceeded result per RPC limit,
|
|
|
|
// and we haven't exceeded results per day limit
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
bool work_needed(bool locality_sched) {
|
2005-02-09 20:06:15 +00:00
|
|
|
if (locality_sched) {
|
|
|
|
// if we've failed to send a result because of a transient condition,
|
|
|
|
// return false to preserve invariant
|
|
|
|
//
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->disk.insufficient || g_wreq->speed.insufficient || g_wreq->mem.insufficient || g_wreq->no_allowed_apps_available) {
|
2009-03-07 01:00:05 +00:00
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] stopping work search - locality condition\n"
|
|
|
|
);
|
|
|
|
}
|
2005-02-09 20:06:15 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2005-02-26 00:24:37 +00:00
|
|
|
|
2010-06-01 23:41:07 +00:00
|
|
|
// see if we've reached limits on in-progress jobs
|
|
|
|
//
|
|
|
|
bool some_type_allowed = false;
|
|
|
|
if (config.max_jobs_in_progress.exceeded(NULL, true)) {
|
|
|
|
g_wreq->clear_gpu_req();
|
|
|
|
if (g_wreq->effective_ngpus) {
|
|
|
|
g_wreq->max_jobs_on_host_gpu_exceeded = true;
|
2007-05-14 15:21:38 +00:00
|
|
|
}
|
2010-06-01 23:41:07 +00:00
|
|
|
} else {
|
|
|
|
some_type_allowed = true;
|
2007-05-14 15:21:38 +00:00
|
|
|
}
|
2010-06-01 23:41:07 +00:00
|
|
|
if (config.max_jobs_in_progress.exceeded(NULL, false)) {
|
2009-06-01 22:15:14 +00:00
|
|
|
g_wreq->clear_cpu_req();
|
|
|
|
g_wreq->max_jobs_on_host_cpu_exceeded = true;
|
2010-06-01 23:41:07 +00:00
|
|
|
} else {
|
|
|
|
some_type_allowed = true;
|
2009-06-01 22:15:14 +00:00
|
|
|
}
|
2010-06-01 23:41:07 +00:00
|
|
|
if (!some_type_allowed) {
|
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[send] in-progress job limit exceeded\n"
|
|
|
|
);
|
2009-06-22 21:11:19 +00:00
|
|
|
}
|
2010-06-01 23:41:07 +00:00
|
|
|
g_wreq->max_jobs_on_host_exceeded = true;
|
|
|
|
return false;
|
2009-06-01 22:15:14 +00:00
|
|
|
}
|
|
|
|
|
2010-06-01 23:41:07 +00:00
|
|
|
// see if we've reached max jobs per RPC
|
|
|
|
//
|
2009-06-01 22:15:14 +00:00
|
|
|
if (g_wreq->njobs_sent >= g_wreq->max_jobs_per_rpc) {
|
2009-03-07 01:00:05 +00:00
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-06-01 22:15:14 +00:00
|
|
|
"[send] stopping work search - njobs %d >= max_jobs_per_rpc %d\n",
|
|
|
|
g_wreq->njobs_sent, g_wreq->max_jobs_per_rpc
|
2009-03-07 01:00:05 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2009-01-10 00:43:33 +00:00
|
|
|
|
2009-01-23 22:52:35 +00:00
|
|
|
#if 0
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-08-17 17:07:38 +00:00
|
|
|
"work_needed: spec req %d sec to fill %.2f; CPU (%.2f, %.2f) CUDA (%.2f, %.2f) ATI(%.2f, %.2f)\n",
|
2009-01-23 22:52:35 +00:00
|
|
|
g_wreq->rsc_spec_request,
|
|
|
|
g_wreq->seconds_to_fill,
|
|
|
|
g_wreq->cpu_req_secs, g_wreq->cpu_req_instances,
|
2009-08-17 17:07:38 +00:00
|
|
|
g_wreq->cuda_req_secs, g_wreq->cuda_req_instances,
|
|
|
|
g_wreq->ati_req_secs, g_wreq->ati_req_instances
|
2009-01-23 22:52:35 +00:00
|
|
|
);
|
|
|
|
#endif
|
2009-01-10 00:43:33 +00:00
|
|
|
if (g_wreq->rsc_spec_request) {
|
2009-01-23 22:52:35 +00:00
|
|
|
if (g_wreq->need_cpu()) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (g_wreq->need_cuda()) {
|
|
|
|
return true;
|
|
|
|
}
|
2009-08-17 17:07:38 +00:00
|
|
|
if (g_wreq->need_ati()) {
|
|
|
|
return true;
|
|
|
|
}
|
2009-01-10 00:43:33 +00:00
|
|
|
} else {
|
2009-01-23 22:52:35 +00:00
|
|
|
if (g_wreq->seconds_to_fill > 0) {
|
|
|
|
return true;
|
|
|
|
}
|
2009-01-10 00:43:33 +00:00
|
|
|
}
|
2009-03-07 01:00:05 +00:00
|
|
|
if (config.debug_send) {
|
|
|
|
log_messages.printf(MSG_NORMAL, "[send] don't need more work\n");
|
|
|
|
}
|
2009-01-10 00:43:33 +00:00
|
|
|
return false;
|
2004-09-10 00:41:48 +00:00
|
|
|
}
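
// Caller sketch (hypothetical loop): the send policies keep handing out
// jobs only while work_needed() returns true; the per-job bookkeeping in
// add_result_to_reply() (decrementing req_secs, seconds_to_fill, etc.) is
// what eventually makes it return false. send_one_job() is a stand-in name.
//
#if 0
while (work_needed(false)) {
    if (!send_one_job()) break;     // no more feasible jobs
}
#endif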

// return the app version ID, or -2/-3/-4 if anonymous platform
//
inline static int get_app_version_id(BEST_APP_VERSION* bavp) {
    if (bavp->avp) {
        return bavp->avp->id;
    } else {
        return bavp->cavp->host_usage.resource_type();
    }
}
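
// The negative IDs come from HOST_USAGE::resource_type(), which encodes
// the resource an anonymous-platform version uses (by convention roughly
// -2 for CPU, -3 for NVIDIA, -4 for ATI; see the anonymous-platform
// constants in the server headers for the authoritative values).
// Positive values are real app_version table IDs.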

int add_result_to_reply(
    DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp,
    bool locality_scheduling
) {
    int retval;
    bool resent_result = false;
    APP* app = ssp->lookup_app(wu.appid);

    retval = add_wu_to_reply(wu, *g_reply, app, bavp);
    if (retval) return retval;

    // Adjust available disk space.
    // In the locality scheduling case,
    // reduce the available space by less than the workunit's rsc_disk_bound
    // if the host already has the file or the file was not already sent.
    //
    if (!locality_scheduling || decrement_disk_space_locality(wu)) {
        g_wreq->disk_available -= wu.rsc_disk_bound;
    }

    // update the result in the DB
    //
    result.hostid = g_reply->host.id;
    result.userid = g_reply->user.id;
    result.sent_time = time(0);
    result.report_deadline = result.sent_time + wu.delay_bound;
    result.flops_estimate = bavp->host_usage.peak_flops;
    result.app_version_id = get_app_version_id(bavp);
    int old_server_state = result.server_state;

    if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
        // We're sending this result for the first time
        //
        result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
    } else {
        // Result was already sent to this host but was lost,
        // so we're resending it.
        //
        resent_result = true;

        if (config.debug_send) {
            log_messages.printf(MSG_NORMAL,
                "[send] [RESULT#%d] [HOST#%d] (resend lost work)\n",
                result.id, g_reply->host.id
            );
        }
    }
    retval = result.mark_as_sent(old_server_state);
    if (retval == ERR_DB_NOT_FOUND) {
        log_messages.printf(MSG_CRITICAL,
            "[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
            result.id, g_reply->host.id
        );
    } else if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't update result: %d\n", retval
        );
    }
    if (retval) return retval;

    double est_dur = estimate_duration(wu, *bavp);
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[HOST#%d] Sending [RESULT#%d %s] (est. dur. %.2f seconds)\n",
            g_reply->host.id, result.id, result.name, est_dur
        );
    }

    retval = update_wu_transition_time(wu, result.report_deadline);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't update WU transition time: %d\n",
            retval
        );
        return retval;
    }

    // The following overwrites the result's xml_doc field.
    // But that's OK because we're done with DB updates.
    //
    retval = insert_name_tags(result, wu);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't insert name tags: %d\n",
            retval
        );
        return retval;
    }
    retval = insert_deadline_tag(result);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "add_result_to_reply: can't insert deadline tag: %d\n", retval
        );
        return retval;
    }
    result.bavp = bavp;
    g_reply->insert_result(result);
    if (g_wreq->rsc_spec_request) {
        if (bavp->host_usage.ncudas) {
            g_wreq->cuda_req_secs -= est_dur;
            g_wreq->cuda_req_instances -= bavp->host_usage.ncudas;
        } else if (bavp->host_usage.natis) {
            g_wreq->ati_req_secs -= est_dur;
            g_wreq->ati_req_instances -= bavp->host_usage.natis;
        } else {
            g_wreq->cpu_req_secs -= est_dur;
            g_wreq->cpu_req_instances -= bavp->host_usage.avg_ncpus;
        }
    } else {
        g_wreq->seconds_to_fill -= est_dur;
    }
    update_estimated_delay(*bavp, est_dur);
    g_wreq->njobs_sent++;
    config.max_jobs_in_progress.register_job(app, bavp->host_usage.uses_gpu());
    if (!resent_result) {
        DB_HOST_APP_VERSION* havp = bavp->host_app_version();
        if (havp) {
            havp->n_jobs_today++;
        }
    }

    // add this result to the workload for simulation
    //
    if (config.workload_sim && g_request->have_other_results_list) {
        IP_RESULT ipr("", time(0)+wu.delay_bound, est_dur);
        g_request->ip_results.push_back(ipr);
    }

    // mark the job as done if the debugging flag is set;
    // this is used by sched_driver.C (performance testing)
    //
    if (mark_jobs_done) {
        DB_WORKUNIT dbwu;
        char buf[256];
        sprintf(buf,
            "server_state=%d outcome=%d",
            RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_SUCCESS
        );
        result.update_field(buf);

        dbwu.id = wu.id;
        sprintf(buf, "transition_time=%ld", (long)time(0));
        dbwu.update_field(buf);
    }

    // If we're sending an unreplicated job to an untrusted host,
    // mark it as replicated
    //
    if (wu.target_nresults == 1 && app->target_nresults > 1) {
        if (bavp->trusted) {
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [WU#%d] using trusted app version, not replicating\n", wu.id
                );
            }
        } else {
            DB_WORKUNIT dbwu;
            char buf[256];
            sprintf(buf,
                "target_nresults=%d, min_quorum=%d, transition_time=%ld",
                app->target_nresults, app->target_nresults, (long)time(0)
            );
            dbwu.id = wu.id;
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] [WU#%d] sending to untrusted host, replicating\n", wu.id
                );
            }
            retval = dbwu.update_field(buf);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "WU update failed: %d\n", retval
                );
            }
        }
    }

    return 0;
}
|
|
|
|
|
2008-04-26 23:34:38 +00:00
|
|
|
// send messages to user about why jobs were or weren't sent
|
|
|
|
//
|
2008-12-16 16:29:54 +00:00
|
|
|
static void explain_to_user() {
|
2007-08-16 17:33:41 +00:00
|
|
|
char helpful[512];
|
2008-04-30 20:31:33 +00:00
|
|
|
unsigned int i;
|
- server code: at some point I made a global var "SCHED_CONFIG config",
mostly so that the parse function could assume
that everything was initially zero.
However, various back-end functions pass around SCHED_CONFIG&
as an argument (also named "config").
This creates a shadow, which is always bad.
Worse is the possibility that some projects have back-end programs
that have a SCHED_CONFIG variable that's automatic,
and therefore isn't zero initially,
and therefore isn't parsing correctly.
To fix this, I changed the 2 vectors in SCHED_CONFIG into pointers,
and have the parse routine zero the structure.
I was tempted to remove the SCHED_CONFIG& args to back-end functions,
but this would have broken some projects' code.
I did, however, change the name from config to config_loc
to avoid shadowing.
Also fixed various other compiler warnings.
svn path=/trunk/boinc/; revision=15541
2008-07-02 17:24:53 +00:00
|
|
|
int j;
|
2007-09-21 18:10:54 +00:00
|
|
|
|
2008-08-14 22:06:51 +00:00
|
|
|
// If work was sent from apps the user did not select, explain.
|
|
|
|
// NOTE: this will have to be done differently with matchmaker scheduling
|
2008-03-07 21:13:01 +00:00
|
|
|
//
|
2010-01-05 11:03:10 +00:00
|
|
|
if (!config.locality_scheduling && !config.locality_scheduler_fraction && !config.matchmaker) {
|
2009-06-01 22:15:14 +00:00
|
|
|
if (g_wreq->njobs_sent && !g_wreq->user_apps_only) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2009-08-21 19:14:15 +00:00
|
|
|
"No work can be sent for the applications you have selected",
|
|
|
|
"high"
|
2008-08-14 22:06:51 +00:00
|
|
|
);
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2008-08-14 22:06:51 +00:00
|
|
|
// Inform the user about applications with no work
|
|
|
|
//
|
2008-12-18 21:25:51 +00:00
|
|
|
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
|
|
|
if (!g_wreq->preferred_apps[i].work_available) {
|
|
|
|
APP* app = ssp->lookup_app(g_wreq->preferred_apps[i].appid);
|
2008-08-14 22:06:51 +00:00
|
|
|
// don't write message if the app is deprecated
|
|
|
|
//
|
|
|
|
if (app) {
|
|
|
|
char explanation[256];
|
|
|
|
sprintf(explanation,
|
|
|
|
"No work is available for %s",
|
2008-12-18 21:25:51 +00:00
|
|
|
find_user_friendly_name(g_wreq->preferred_apps[i].appid)
|
2008-08-14 22:06:51 +00:00
|
|
|
);
|
2009-08-21 19:14:15 +00:00
|
|
|
g_reply->insert_message( explanation, "high");
|
2008-08-14 22:06:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2008-08-14 22:06:51 +00:00
|
|
|
// Tell the user about applications they didn't qualify for
|
|
|
|
//
|
|
|
|
for (j=0; j<preferred_app_message_index; j++){
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->insert_message(g_wreq->no_work_messages.at(j));
|
2008-08-14 22:06:51 +00:00
|
|
|
}
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2010-01-18 04:52:58 +00:00
|
|
|
"Your preferences allow work from applications other than those selected",
|
|
|
|
"low"
|
2009-02-26 03:03:35 +00:00
|
|
|
);
|
|
|
|
g_reply->insert_message(
|
2010-01-18 04:52:58 +00:00
|
|
|
"Sending work from other applications", "low"
|
2008-08-14 22:06:51 +00:00
|
|
|
);
|
2008-03-07 21:13:01 +00:00
|
|
|
}
|
2008-08-14 22:06:51 +00:00
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
|
2007-08-16 17:33:41 +00:00
|
|
|
// if client asked for work and we're not sending any, explain why
|
|
|
|
//
|
2009-06-01 22:15:14 +00:00
|
|
|
if (g_wreq->njobs_sent == 0) {
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->set_delay(DELAY_NO_WORK_TEMP);
|
2009-08-21 19:14:15 +00:00
|
|
|
g_reply->insert_message("No work sent", "high");
|
2009-01-20 21:31:13 +00:00
|
|
|
|
|
|
|
// Tell the user about applications with no work
|
|
|
|
//
|
2008-12-18 21:25:51 +00:00
|
|
|
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
|
2009-05-06 21:52:50 +00:00
|
|
|
if (!g_wreq->preferred_apps[i].work_available) {
|
|
|
|
APP* app = ssp->lookup_app(g_wreq->preferred_apps[i].appid);
|
|
|
|
// don't write message if the app is deprecated
|
|
|
|
if (app != NULL) {
|
|
|
|
char explanation[256];
|
|
|
|
sprintf(explanation, "No work is available for %s",
|
2008-12-18 21:25:51 +00:00
|
|
|
find_user_friendly_name(g_wreq->preferred_apps[i].appid)
|
2008-10-27 21:23:07 +00:00
|
|
|
);
|
2009-08-21 19:14:15 +00:00
|
|
|
g_reply->insert_message(explanation, "high");
|
2009-05-06 21:52:50 +00:00
|
|
|
}
|
2010-01-18 04:52:58 +00:00
|
|
|
}
|
2008-03-07 21:13:01 +00:00
|
|
|
}
|
2009-01-20 21:31:13 +00:00
|
|
|
|
|
|
|
// Tell the user about applications they didn't qualify for
|
|
|
|
//
|
2008-12-16 16:29:54 +00:00
|
|
|
for (i=0; i<g_wreq->no_work_messages.size(); i++){
|
2009-05-06 21:52:50 +00:00
|
|
|
g_reply->insert_message(g_wreq->no_work_messages.at(i));
|
2008-03-07 21:13:01 +00:00
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->no_allowed_apps_available) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2010-01-18 04:52:58 +00:00
|
|
|
"No work available for the applications you have selected. Please check your preferences on the web site.",
|
2009-08-21 19:14:15 +00:00
|
|
|
"high"
|
2006-05-02 22:17:09 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->speed.insufficient) {
|
2010-01-13 17:28:59 +00:00
|
|
|
if (g_request->core_client_version>41900) {
|
2005-03-19 18:24:24 +00:00
|
|
|
sprintf(helpful,
|
|
|
|
"(won't finish in time) "
|
2010-05-03 17:20:44 +00:00
|
|
|
"BOINC runs %.1f%% of the time; computation is enabled %.1f%% of that",
|
|
|
|
100*g_reply->host.on_frac, 100*g_reply->host.active_frac
|
2005-03-19 18:24:24 +00:00
|
|
|
);
|
2007-08-16 17:33:41 +00:00
|
|
|
} else {
|
2005-03-19 18:24:24 +00:00
|
|
|
sprintf(helpful,
|
|
|
|
"(won't finish in time) "
|
2010-05-03 17:20:44 +00:00
|
|
|
"Computer available %.1f%% of the time",
|
|
|
|
100*g_reply->host.on_frac
|
2005-03-19 18:24:24 +00:00
|
|
|
);
|
|
|
|
}
|
2009-08-21 19:14:15 +00:00
|
|
|
g_reply->insert_message(helpful, "high");
|
2004-04-04 01:59:47 +00:00
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->hr_reject_temp) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2009-08-21 19:14:15 +00:00
|
|
|
"(there was work but it was committed to other platforms)",
|
|
|
|
"high"
|
2004-05-18 18:33:01 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->hr_reject_perm) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2009-08-21 19:14:15 +00:00
|
|
|
"(your platform is not supported by this project)",
|
|
|
|
"high"
|
2005-11-30 22:52:23 +00:00
|
|
|
);
|
|
|
|
}
|
2009-01-20 21:31:13 +00:00
|
|
|
if (g_wreq->outdated_client) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2009-08-21 19:14:15 +00:00
|
|
|
" (your BOINC client is old - please install current version)",
|
|
|
|
"high"
|
2004-04-04 01:59:47 +00:00
|
|
|
);
|
2008-12-16 16:29:54 +00:00
|
|
|
g_reply->set_delay(DELAY_NO_WORK_PERM);
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2007-09-21 18:10:54 +00:00
|
|
|
"Not sending work because client is outdated\n"
|
2004-04-04 01:59:47 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
if (g_wreq->excessive_work_buf) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2009-08-21 19:14:15 +00:00
|
|
|
"(Your network connection interval is longer than WU deadline)",
|
|
|
|
"high"
|
2005-04-18 18:42:29 +00:00
|
|
|
);
|
|
|
|
}
|
2009-09-28 04:24:18 +00:00
|
|
|
if (g_wreq->no_cuda_prefs) {
|
|
|
|
g_reply->insert_message(
|
|
|
|
"Jobs for NVIDIA GPU are available, but your preferences are set to not accept them",
|
|
|
|
"low"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
if (g_wreq->no_ati_prefs) {
|
2009-02-26 03:03:35 +00:00
|
|
|
g_reply->insert_message(
|
2009-09-28 04:24:18 +00:00
|
|
|
"Jobs for ATI GPU are available, but your preferences are set to not accept them",
|
2009-08-21 19:14:15 +00:00
|
|
|
"low"
|
2009-01-12 23:47:52 +00:00
|
|
|
);
|
|
|
|
}
|
2009-03-18 21:14:44 +00:00
|
|
|
if (g_wreq->no_cpu_prefs) {
|
|
|
|
g_reply->insert_message(
|
2009-09-28 04:24:18 +00:00
|
|
|
"Jobs for CPU are available, but your preferences are set to not accept them",
|
2009-08-21 19:14:15 +00:00
|
|
|
"low"
|
2009-03-18 21:14:44 +00:00
|
|
|
);
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = quota_exceeded_version();
|
|
|
|
if (havp) {
|
2005-05-12 21:04:39 +00:00
|
|
|
struct tm *rpc_time_tm;
|
|
|
|
int delay_time;
|
|
|
|
|
2009-06-01 22:15:14 +00:00
|
|
|
sprintf(helpful, "(reached daily quota of %d tasks)",
                havp->max_jobs_per_day
            );
            g_reply->insert_message(helpful, "high");
            log_messages.printf(MSG_NORMAL,
"Daily result quota %d exceeded for app version %d\n",
|
|
|
|
havp->max_jobs_per_day, havp->app_version_id
|
2004-05-27 18:13:00 +00:00
|
|
|
);
|
2005-05-12 21:04:39 +00:00
|
|
|
|
|
|
|
// set delay so host won't return until a random time in
|
2008-06-30 23:05:16 +00:00
|
|
|
// the first hour of the next day.
|
|
|
|
// This is to prevent a lot of hosts from flooding the scheduler
|
|
|
|
// with requests at the same time of day.
|
|
|
|
//
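            // For example, an RPC at 14:25:10 local time gets a base delay
            // of 9h 34m 50s (i.e. until local midnight),
            // plus a uniformly random offset of 0-1 hour.
            //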
            time_t t = g_reply->host.rpc_time;
            rpc_time_tm = localtime(&t);
            delay_time = (23 - rpc_time_tm->tm_hour) * 3600
                + (59 - rpc_time_tm->tm_min) * 60
                + (60 - rpc_time_tm->tm_sec)
                + (int)(3600*(double)rand()/(double)RAND_MAX);
            g_reply->set_delay(delay_time);
        }
        if (g_wreq->max_jobs_on_host_exceeded) {
            sprintf(helpful, "(reached limit of tasks in progress)");
            g_reply->insert_message(helpful, "high");
            g_reply->set_delay(DELAY_NO_WORK_CACHE);
        }
        if (g_wreq->max_jobs_on_host_cpu_exceeded) {
            sprintf(helpful, "(reached limit of CPU tasks in progress)");
            g_reply->insert_message(helpful, "high");
            g_reply->set_delay(DELAY_NO_WORK_CACHE);
        }
        if (g_wreq->max_jobs_on_host_gpu_exceeded) {
            sprintf(helpful, "(reached limit of GPU tasks in progress)");
            g_reply->insert_message(helpful, "high");
            g_reply->set_delay(DELAY_NO_WORK_CACHE);
        }
    }
}
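
// clamp a requested amount of work (in seconds)
// to [MIN_REQ_SECS, MAX_REQ_SECS]
//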
static double clamp_req_sec(double x) {
    if (x < MIN_REQ_SECS) return MIN_REQ_SECS;
    if (x > MAX_REQ_SECS) return MAX_REQ_SECS;
    return x;
}

// prepare to send jobs, both resent and new;
// decipher request type, fill in WORK_REQ
//
void send_work_setup() {
    unsigned int i;

    g_wreq->seconds_to_fill = clamp_req_sec(g_request->work_req_seconds);
    g_wreq->cpu_req_secs = clamp_req_sec(g_request->cpu_req_secs);
    g_wreq->cpu_req_instances = g_request->cpu_req_instances;
    g_wreq->anonymous_platform = anonymous(g_request->platforms.list[0]);

    // anonymous_platform must be set (above) before this check
    //
    if (g_wreq->anonymous_platform) {
        estimate_flops_anon_platform();
    }

    g_wreq->disk_available = max_allowable_disk();
    get_mem_sizes();
    get_running_frac();
    g_wreq->get_job_limits();

    if (g_request->coprocs.cuda.count) {
        g_wreq->cuda_req_secs = clamp_req_sec(g_request->coprocs.cuda.req_secs);
        g_wreq->cuda_req_instances = g_request->coprocs.cuda.req_instances;
        if (g_request->coprocs.cuda.estimated_delay < 0) {
            g_request->coprocs.cuda.estimated_delay = g_request->cpu_estimated_delay;
        }
    }
    if (g_request->coprocs.ati.count) {
        g_wreq->ati_req_secs = clamp_req_sec(g_request->coprocs.ati.req_secs);
        g_wreq->ati_req_instances = g_request->coprocs.ati.req_instances;
        if (g_request->coprocs.ati.estimated_delay < 0) {
            g_request->coprocs.ati.estimated_delay = g_request->cpu_estimated_delay;
        }
    }

    // a nonzero per-resource request means the client is new enough
    // to ask for CPU and GPU work separately;
    // otherwise we use the scalar work_req_seconds
    //
    if (g_wreq->cpu_req_secs || g_wreq->cuda_req_secs || g_wreq->ati_req_secs) {
        g_wreq->rsc_spec_request = true;
    } else {
        g_wreq->rsc_spec_request = false;
    }
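
    // count the jobs the client reports as already in progress,
    // so that limits on jobs in progress can be enforced below
    //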
    for (i=0; i<g_request->other_results.size(); i++) {
        OTHER_RESULT& r = g_request->other_results[i];
        APP* app = NULL;
        bool uses_gpu = false;
        if (r.app_version >= 0
            && r.app_version < (int)g_request->client_app_versions.size()
        ) {
            CLIENT_APP_VERSION& cav = g_request->client_app_versions[r.app_version];
            app = cav.app;
            uses_gpu = cav.host_usage.uses_gpu();
        } else {
            if (r.have_plan_class && app_plan_uses_gpu(r.plan_class)) {
                uses_gpu = true;
            }
        }
        config.max_jobs_in_progress.register_job(app, uses_gpu);
    }

    // print details of request to log
    //
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
            "[send] %s matchmaker scheduling; %s EDF sim\n",
            config.matchmaker?"Using":"Not using",
            config.workload_sim?"Using":"Not using"
        );
        log_messages.printf(MSG_NORMAL,
            "[send] CPU: req %.2f sec, %.2f instances; est delay %.2f\n",
            g_wreq->cpu_req_secs, g_wreq->cpu_req_instances,
            g_request->cpu_estimated_delay
        );
        if (g_request->coprocs.cuda.count) {
            log_messages.printf(MSG_NORMAL,
                "[send] CUDA: req %.2f sec, %.2f instances; est delay %.2f\n",
                g_wreq->cuda_req_secs, g_wreq->cuda_req_instances,
                g_request->coprocs.cuda.estimated_delay
            );
        }
        if (g_request->coprocs.ati.count) {
            log_messages.printf(MSG_NORMAL,
                "[send] ATI: req %.2f sec, %.2f instances; est delay %.2f\n",
                g_wreq->ati_req_secs, g_wreq->ati_req_instances,
                g_request->coprocs.ati.estimated_delay
            );
        }
        log_messages.printf(MSG_NORMAL,
            "[send] work_req_seconds: %.2f secs\n",
            g_wreq->seconds_to_fill
        );
        log_messages.printf(MSG_NORMAL,
            "[send] available disk %.2f GB, work_buf_min %d\n",
            g_wreq->disk_available/GIGA,
            (int)g_request->global_prefs.work_buf_min()
        );
        log_messages.printf(MSG_NORMAL,
            "[send] active_frac %f on_frac %f\n",
            g_reply->host.active_frac,
            g_reply->host.on_frac
        );
        if (g_wreq->anonymous_platform) {
            log_messages.printf(MSG_NORMAL,
                "Anonymous platform app versions:\n"
            );
            for (i=0; i<g_request->client_app_versions.size(); i++) {
                CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
                log_messages.printf(MSG_NORMAL,
                    " app: %s version %d cpus %.2f cudas %.2f atis %.2f flops %fG\n",
                    cav.app_name,
                    cav.version_num,
                    cav.host_usage.avg_ncpus,
                    cav.host_usage.ncudas,
                    cav.host_usage.natis,
                    cav.host_usage.projected_flops/1e9
                );
            }
        }
    }
}
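
// With <debug_send> enabled, the request summary above produces
// log lines like the following (values are illustrative only):
//   [send] CPU: req 8640.00 sec, 1.00 instances; est delay 0.00
//   [send] available disk 10.00 GB, work_buf_min 8640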

// If a (host, app version) record is not in the DB, create it.
//
int update_host_app_versions(vector<RESULT>& results, int hostid) {
    vector<DB_HOST_APP_VERSION> new_havs;
    unsigned int i, j;
    int retval;

    for (i=0; i<results.size(); i++) {
        RESULT& r = results[i];
        int gavid = generalized_app_version_id(r.app_version_id, r.appid);
        DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
        if (!havp) {
            // already queued for creation?
            //
            bool found = false;
            for (j=0; j<new_havs.size(); j++) {
                DB_HOST_APP_VERSION& hav = new_havs[j];
                if (hav.app_version_id == gavid) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                DB_HOST_APP_VERSION hav;
                hav.clear();
                hav.host_id = hostid;
                hav.app_version_id = gavid;
                new_havs.push_back(hav);
            }
        }
    }

    char query[256], clause[512];

    // create new records
    //
    for (i=0; i<new_havs.size(); i++) {
        DB_HOST_APP_VERSION& hav = new_havs[i];

        retval = hav.insert();
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "hav.insert(): %d\n", retval
            );
        } else {
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] created host_app_version record (%d, %d)\n",
                    hav.host_id, hav.app_version_id
                );
            }
        }
    }
    return 0;
}
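
// update_host_app_versions() is called at the end of send_work() below,
// so a host_app_version row exists for each app version we just used.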

void send_work() {
    int retval;

    if (!g_wreq->rsc_spec_request && g_wreq->seconds_to_fill == 0) {
        return;
    }

    if (all_apps_use_hr && hr_unknown_platform(g_request->host)) {
        log_messages.printf(MSG_NORMAL,
            "Not sending work because unknown HR class\n"
        );
        g_wreq->hr_reject_perm = true;
        return;
    }

    // decide on attributes of HOST_APP_VERSIONS
    //
    get_reliability_and_trust();
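    // ("reliable" host_app_versions get accelerated retries;
    // "trusted" ones are candidates for adaptive replication)
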
    get_prefs_info();

    if (config.enable_assignment) {
        if (send_assigned_jobs()) {
            if (config.debug_assignment) {
                log_messages.printf(MSG_NORMAL,
                    "[assign] [HOST#%d] sent assigned jobs\n", g_reply->host.id
                );
            }
            goto done;
        }
    }

    // if we're doing workload (EDF) simulation,
    // seed it with the client's list of in-progress jobs
    //
    if (config.workload_sim && g_request->have_other_results_list) {
        init_ip_results(
            g_request->global_prefs.work_buf_min(),
            g_wreq->effective_ncpus, g_request->ip_results
        );
    }

    // assume no jobs are available to send;
    // if this turns out not to be the case, clear this flag
    //
    g_wreq->no_jobs_available = true;

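    // a project can mix locality and non-locality scheduling:
    // locality_scheduler_fraction is the fraction of requests for which
    // the locality pass runs first; the other pass runs second either way
    //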
    if (config.locality_scheduler_fraction > 0) {
        if (drand() < config.locality_scheduler_fraction) {
            if (config.debug_locality) {
                log_messages.printf(MSG_NORMAL,
                    "[mixed] sending locality work first\n"
                );
            }
            send_work_locality();
            if (config.debug_locality) {
                log_messages.printf(MSG_NORMAL,
                    "[mixed] sending non-locality work second\n"
                );
            }
            send_work_old();
        } else {
            if (config.debug_locality) {
                log_messages.printf(MSG_NORMAL,
                    "[mixed] sending non-locality work first\n"
                );
            }
            send_work_old();
            if (config.debug_locality) {
                log_messages.printf(MSG_NORMAL,
                    "[mixed] sending locality work second\n"
                );
            }
            send_work_locality();
        }
    } else if (config.locality_scheduling) {
        send_work_locality();
    } else if (config.matchmaker) {
        send_work_matchmaker();
    } else {
        send_work_old();
    }

done:
    retval = update_host_app_versions(g_reply->results, g_reply->host.id);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
"update_host_app_versions() failed: %d\n", retval
|
2010-03-29 22:28:20 +00:00
|
|
|
);
|
|
|
|
}
|
2008-12-16 16:29:54 +00:00
|
|
|
explain_to_user();
|
2008-04-26 23:34:38 +00:00
|
|
|
}
|
|
|
|
|
2005-01-02 18:29:53 +00:00
|
|
|
const char *BOINC_RCSID_32dcd335e7 = "$Id$";
|