2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2005-01-20 23:22:22 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2008-08-06 18:36:30 +00:00
|
|
|
// Copyright (C) 2008 University of California
|
2003-08-15 22:39:56 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2003-08-15 22:39:56 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2005-01-20 23:22:22 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
2002-04-30 22:22:54 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2002-04-30 22:22:54 +00:00
|
|
|
|
|
|
|
#ifndef _SERVER_TYPES_
|
|
|
|
#define _SERVER_TYPES_
|
|
|
|
|
2004-07-13 13:54:09 +00:00
|
|
|
#include <cstdio>
|
2002-04-30 22:22:54 +00:00
|
|
|
#include <vector>
|
|
|
|
|
2003-04-07 19:06:00 +00:00
|
|
|
#include "boinc_db.h"
|
2006-09-01 21:06:30 +00:00
|
|
|
#include "common_defs.h"
|
2004-05-03 02:18:35 +00:00
|
|
|
#include "md5_file.h"
|
2008-03-10 21:59:27 +00:00
|
|
|
#include "coproc.h"
|
2002-12-09 23:06:17 +00:00
|
|
|
|
2007-05-10 16:23:10 +00:00
|
|
|
#include "edf_sim.h"
|
|
|
|
|
2007-05-09 17:45:18 +00:00
|
|
|
// For projects that support work filtering by app:
// each APP_INFO records one app for which the user will accept work.
//
struct APP_INFO {
    int appid;
        // ID of the app the user accepts work for
    int work_available;
        // NOTE(review): presumably nonzero when work exists for this app;
        // confirm against the code that sets it
};
|
|
|
|
|
2007-08-16 17:33:41 +00:00
|
|
|
// Represents a resource (disk etc.) that the client may not have enough of.
//
struct RESOURCE {
    bool insufficient;
        // true once set_insufficient() has been called
    double needed;
        // the min extra amount needed; zero means nothing recorded yet

    // Start out "sufficient" with no amount recorded, so that
    // set_insufficient() never reads an indeterminate value
    // when the object has not been zeroed by its owner.
    //
    RESOURCE() : insufficient(false), needed(0) {}

    // Flag the resource as insufficient and record the shortfall x.
    // If a nonzero amount is already recorded, the smaller one is kept;
    // otherwise x is stored as is.
    //
    inline void set_insufficient(double x) {
        insufficient = true;
        if (needed == 0 || x < needed) {
            needed = x;
        }
    }
};
|
|
|
|
|
2008-03-07 21:13:01 +00:00
|
|
|
// A message intended for human eyes.
//
struct USER_MESSAGE {
    std::string message;
    std::string priority;

    // Defined outside this header.
    // NOTE(review): presumably m and p are copied into message and priority;
    // confirm against the definition.
    //
    USER_MESSAGE(const char* m, const char* p);
};
|
|
|
|
|
2008-03-18 21:22:44 +00:00
|
|
|
struct HOST_USAGE {
|
2009-08-19 18:41:47 +00:00
|
|
|
double ncudas;
|
|
|
|
double natis;
|
2009-12-11 22:45:59 +00:00
|
|
|
double gpu_ram;
|
2008-03-27 18:25:29 +00:00
|
|
|
double avg_ncpus;
|
|
|
|
double max_ncpus;
|
2010-04-10 05:49:51 +00:00
|
|
|
double projected_flops;
|
|
|
|
// the scheduler's best estimate of wu.rsc_fpops_est/elapsed_time.
|
|
|
|
// Taken from host_app_version elapsed time statistics if available,
|
|
|
|
// else on estimate provided by app_plan()
|
|
|
|
double peak_flops;
|
|
|
|
// stored in result.estimated_flops, and used for credit calculations
|
2008-03-27 18:25:29 +00:00
|
|
|
char cmdline[256];
|
|
|
|
|
2008-07-14 22:32:20 +00:00
|
|
|
HOST_USAGE() {
|
2009-03-05 00:10:16 +00:00
|
|
|
ncudas = 0;
|
2009-08-17 17:07:38 +00:00
|
|
|
natis = 0;
|
2009-12-11 22:45:59 +00:00
|
|
|
gpu_ram = 0;
|
2008-07-14 22:32:20 +00:00
|
|
|
avg_ncpus = 1;
|
|
|
|
max_ncpus = 1;
|
2010-04-10 05:49:51 +00:00
|
|
|
projected_flops = 0;
|
|
|
|
peak_flops = 0;
|
2008-07-14 22:32:20 +00:00
|
|
|
strcpy(cmdline, "");
|
|
|
|
}
|
|
|
|
void sequential_app(double x) {
|
2009-03-05 00:10:16 +00:00
|
|
|
ncudas = 0;
|
2009-08-17 17:07:38 +00:00
|
|
|
natis = 0;
|
2009-12-11 22:45:59 +00:00
|
|
|
gpu_ram = 0;
|
2008-03-27 18:25:29 +00:00
|
|
|
avg_ncpus = 1;
|
|
|
|
max_ncpus = 1;
|
2010-04-10 05:49:51 +00:00
|
|
|
if (x <= 0) x = 1e9;
|
|
|
|
projected_flops = x;
|
|
|
|
peak_flops = x;
|
2008-03-27 18:25:29 +00:00
|
|
|
strcpy(cmdline, "");
|
|
|
|
}
|
2010-04-08 23:14:47 +00:00
|
|
|
inline int resource_type() {
|
|
|
|
if (ncudas) {
|
|
|
|
return ANON_PLATFORM_NVIDIA;
|
|
|
|
} else if (natis) {
|
|
|
|
return ANON_PLATFORM_ATI;
|
|
|
|
}
|
|
|
|
return ANON_PLATFORM_CPU;
|
|
|
|
}
|
2010-06-04 16:57:33 +00:00
|
|
|
inline const char* resource_name() {
|
|
|
|
if (ncudas) {
|
|
|
|
return "nvidia GPU";
|
|
|
|
} else if (natis) {
|
|
|
|
return "ATI GPU";
|
|
|
|
}
|
|
|
|
return "CPU";
|
|
|
|
}
|
2010-06-01 23:41:07 +00:00
|
|
|
inline bool uses_gpu() {
|
|
|
|
if (ncudas) return true;
|
|
|
|
if (natis) return true;
|
|
|
|
return false;
|
|
|
|
}
|
2008-03-18 21:22:44 +00:00
|
|
|
};
|
|
|
|
|
2004-09-10 00:41:48 +00:00
|
|
|
// Describes a sticky file on the host.
//
struct FILE_INFO {
    char name[256];

    // Defined outside this header.
    int parse(FILE*);
};
|
|
|
|
|
2004-06-24 21:00:13 +00:00
|
|
|
// Descriptor of a message sent from a host:
// a "variety" tag plus the message text.
//
struct MSG_FROM_HOST_DESC {
    char variety[256];
    std::string msg_text;

    // Defined outside this header.
    int parse(FILE*);
};
|
|
|
|
|
2004-01-26 19:29:39 +00:00
|
|
|
// an app version from an anonymous-platform client
|
2010-06-01 23:41:07 +00:00
|
|
|
// (starting with 6.11, ALL clients send these)
|
2004-01-26 19:29:39 +00:00
|
|
|
//
|
|
|
|
struct CLIENT_APP_VERSION {
|
|
|
|
char app_name[256];
|
2009-03-05 17:54:39 +00:00
|
|
|
char platform[256];
|
2004-01-26 19:29:39 +00:00
|
|
|
int version_num;
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
char plan_class[256];
|
|
|
|
HOST_USAGE host_usage;
|
2010-04-08 23:14:47 +00:00
|
|
|
double rsc_fpops_scale;
|
|
|
|
// multiply wu.rsc_fpops_est and rsc_fpops_limit
|
|
|
|
// by this amount when send to client,
|
|
|
|
// to reflect the discrepancy between how fast the client
|
|
|
|
// thinks the app is versus how fast we think it is
|
|
|
|
APP* app;
|
2004-01-26 19:29:39 +00:00
|
|
|
|
|
|
|
int parse(FILE*);
|
|
|
|
};
|
|
|
|
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
// keep track of the best app_version for each app for this host
|
|
|
|
//
|
|
|
|
struct BEST_APP_VERSION {
|
|
|
|
int appid;
|
|
|
|
|
|
|
|
bool present;
|
2010-04-08 18:27:27 +00:00
|
|
|
// false means there's no usable version for this app
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
|
|
|
|
CLIENT_APP_VERSION* cavp;
|
2010-04-08 18:27:27 +00:00
|
|
|
// populated if anonymous platform
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
|
|
|
|
APP_VERSION* avp;
|
2010-04-08 18:27:27 +00:00
|
|
|
// populated otherwise
|
2009-07-29 17:29:56 +00:00
|
|
|
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
HOST_USAGE host_usage;
|
2010-04-08 18:27:27 +00:00
|
|
|
// populated in either case
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
bool reliable;
|
|
|
|
bool trusted;
|
|
|
|
|
|
|
|
DB_HOST_APP_VERSION* host_app_version();
|
|
|
|
// get the HOST_APP_VERSION, if any
|
|
|
|
|
- scheduler: add support for anonymous-platform coproc apps.
Old: although the request message contained all info
about the app version (flops, coproc usage etc.)
the server ignored this info,
and assumed that all anonymous platform apps where CPU.
With 6.6 client, this could produce infinite work fetch:
- client uses anon platform, has coproc app
- client has idle CPU, requests CPU work
- scheduler sends it jobs, thinking they will be done by CPU app
- client asks for more work etc.
New: scheduler parses full info on anon platform app versions:
plan class, FLOPS, coprocs.
It uses this info to make scheduling decisions;
in particular, if the request is for CUDA work,
if will only send jobs that use a CUDA app version.
The <result> records it returns contain info
(plan_class) that tells the client which app_version to use.
This will work correctly even if the client has multiple app versions
for the same app (e.g., a CPU version and a GPU version)
svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00
|
|
|
BEST_APP_VERSION() {
|
|
|
|
present = false;
|
|
|
|
cavp = NULL;
|
|
|
|
avp = NULL;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2004-03-23 03:59:32 +00:00
|
|
|
// Subset of global prefs used by the scheduler.
//
struct GLOBAL_PREFS {
    double mod_time;
    double disk_max_used_gb;
    double disk_max_used_pct;
    double disk_min_free_gb;
    double work_buf_min_days;
    double ram_max_used_busy_frac;
    double ram_max_used_idle_frac;
    double max_ncpus_pct;

    // Both defined outside this header.
    void parse(const char* buf, const char* venue);
    void defaults();

    // work_buf_min_days converted to seconds (86400 seconds per day).
    // Const so it can be called through a const GLOBAL_PREFS.
    //
    inline double work_buf_min() const {return work_buf_min_days*86400;}
};
|
|
|
|
|
2004-09-13 18:05:54 +00:00
|
|
|
struct GUI_URLS {
|
|
|
|
char* text;
|
|
|
|
void init();
|
2004-09-27 19:44:40 +00:00
|
|
|
void get_gui_urls(USER& user, HOST& host, TEAM& team, char*);
|
2004-09-13 18:05:54 +00:00
|
|
|
};
|
|
|
|
|
2006-06-26 22:58:24 +00:00
|
|
|
// Holds project-files text.
// init() is defined outside this header.
//
struct PROJECT_FILES {
    char* text;

    void init();
};
|
|
|
|
|
2007-04-30 23:35:39 +00:00
|
|
|
// A result from this project that the client has.
// The request message carries a list of these.
// The reply message may list some of them as to be aborted,
// or aborted if not started.
//
struct OTHER_RESULT {
    char name[256];
    int app_version;
        // index into CLIENT_APP_VERSION array
    char plan_class[64];
    bool have_plan_class;
    bool abort;
    bool abort_if_not_started;
    int reason;
        // see codes below

    // Defined outside this header.
    int parse(FILE*);
};
|
|
|
|
|
2008-02-26 17:24:29 +00:00
|
|
|
// Codes stored in OTHER_RESULT::reason
//
#define ABORT_REASON_NOT_FOUND      1
#define ABORT_REASON_WU_CANCELLED   2
#define ABORT_REASON_ASSIMILATED    3
#define ABORT_REASON_TIMED_OUT      4
|
|
|
|
|
2007-04-19 22:45:57 +00:00
|
|
|
// A platform name as reported by a client.
//
struct CLIENT_PLATFORM {
    char name[256];

    // Defined outside this header.
    int parse(FILE*);
};
|
|
|
|
|
2007-04-30 21:19:24 +00:00
|
|
|
struct PLATFORM_LIST {
|
|
|
|
std::vector<PLATFORM*> list;
|
|
|
|
};
|
|
|
|
|
2002-04-30 22:22:54 +00:00
|
|
|
struct SCHEDULER_REQUEST {
|
|
|
|
char authenticator[256];
|
2007-04-19 22:45:57 +00:00
|
|
|
CLIENT_PLATFORM platform;
|
|
|
|
std::vector<CLIENT_PLATFORM> alt_platforms;
|
2008-02-21 00:47:50 +00:00
|
|
|
PLATFORM_LIST platforms;
|
2004-04-30 18:26:20 +00:00
|
|
|
char cross_project_id[256];
|
2002-04-30 22:22:54 +00:00
|
|
|
int hostid; // zero if first RPC
|
2002-12-02 04:29:40 +00:00
|
|
|
int core_client_major_version;
|
|
|
|
int core_client_minor_version;
|
2005-08-31 00:18:36 +00:00
|
|
|
int core_client_release;
|
2010-01-13 17:28:59 +00:00
|
|
|
int core_client_version; // 10000*major + 100*minor + release
|
2002-04-30 22:22:54 +00:00
|
|
|
int rpc_seqno;
|
2004-12-06 22:41:19 +00:00
|
|
|
double work_req_seconds;
|
2005-02-06 21:26:21 +00:00
|
|
|
// in "normalized CPU seconds" (see work_req.php)
|
2009-01-10 00:43:33 +00:00
|
|
|
double cpu_req_secs;
|
|
|
|
double cpu_req_instances;
|
2004-07-06 21:51:49 +00:00
|
|
|
double resource_share_fraction;
|
2005-10-04 21:44:58 +00:00
|
|
|
// this project's fraction of total resource share
|
|
|
|
double rrs_fraction;
|
|
|
|
// ... of runnable resource share
|
|
|
|
double prrs_fraction;
|
|
|
|
// ... of potentially runnable resource share
|
2009-01-30 21:25:24 +00:00
|
|
|
double cpu_estimated_delay;
|
|
|
|
// currently queued jobs saturate the CPU for this long;
|
|
|
|
// used for crude deadline check
|
2005-06-29 04:17:28 +00:00
|
|
|
double duration_correction_factor;
|
2008-03-31 16:19:45 +00:00
|
|
|
char global_prefs_xml[BLOB_SIZE];
|
|
|
|
char working_global_prefs_xml[BLOB_SIZE];
|
2004-05-13 18:18:22 +00:00
|
|
|
char code_sign_key[4096];
|
2004-08-14 00:37:38 +00:00
|
|
|
|
2004-09-10 00:41:48 +00:00
|
|
|
std::vector<CLIENT_APP_VERSION> client_app_versions;
|
2004-03-23 03:59:32 +00:00
|
|
|
GLOBAL_PREFS global_prefs;
|
2004-05-03 02:18:35 +00:00
|
|
|
char global_prefs_source_email_hash[MD5_LEN];
|
2002-04-30 22:22:54 +00:00
|
|
|
|
2004-05-13 04:48:19 +00:00
|
|
|
HOST host; // request message is parsed into here.
|
|
|
|
// does NOT contain the full host record.
|
2008-03-10 21:59:27 +00:00
|
|
|
COPROCS coprocs;
|
2004-09-10 00:41:48 +00:00
|
|
|
std::vector<RESULT> results;
|
2007-07-05 04:18:48 +00:00
|
|
|
// completed results being reported
|
2004-09-10 00:41:48 +00:00
|
|
|
std::vector<MSG_FROM_HOST_DESC> msgs_from_host;
|
2007-07-06 16:37:00 +00:00
|
|
|
std::vector<FILE_INFO> file_infos;
|
|
|
|
// sticky files reported by host for locality scheduling
|
|
|
|
std::vector<FILE_INFO> file_delete_candidates;
|
|
|
|
// sticky files reported by host, deletion candidates
|
|
|
|
std::vector<FILE_INFO> files_not_needed;
|
|
|
|
// sticky files reported by host, no longer needed
|
2005-07-28 09:00:19 +00:00
|
|
|
std::vector<OTHER_RESULT> other_results;
|
2007-07-05 04:18:48 +00:00
|
|
|
// in-progress results from this project
|
2005-07-28 09:00:19 +00:00
|
|
|
std::vector<IP_RESULT> ip_results;
|
2007-07-05 04:18:48 +00:00
|
|
|
// in-progress results from all projects
|
2005-07-28 10:13:30 +00:00
|
|
|
bool have_other_results_list;
|
|
|
|
bool have_ip_results_list;
|
2008-01-13 00:12:14 +00:00
|
|
|
bool have_time_stats_log;
|
2008-04-02 19:05:08 +00:00
|
|
|
bool client_cap_plan_class;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
int sandbox;
|
|
|
|
// whether client uses account-based sandbox. -1 = don't know
|
- scheduler and client: fix the "allow multiple clients" feature.
This feature lets you run the BOINC client as a job on grid systems
that handle only 1-CPU jobs;
it disables various mechanisms that prevent multiple clients per host
(which is normally a bad thing).
Old:
- Run the client with a --allow_multiple_clients flag.
This tells it not to use a mutex that prevents
multiple clients per host.
- Run the project with the <multiple_clients_per_host> config flag.
This suppresses two mechanisms:
- (avoid duplicate host records)
on a scheduler request with no host ID,
looks for a host with same domain name, OS type,
and mem size, and assumes the request is from that host
- (job retry)
If we get a request that doesn't have a host ID
but does have a host CPID,
mark its in-progress results as over
NOTE: I CAN'T REMEMBER WHY WE SUPPRESS THIS;
MARK S, DO YOU REMEMBER?
Problem:
if the grid clients attach to a project that
doesn't use <multiple_clients_per_host>, bad things happen.
E.g., if there are several requests at about the same time,
most of them will fail with
"another RPC already in progress" errors.
If a project does include this flag,
it loses protection from duplicate host records.
New:
- If the client is run with --allow_multiple_clients flag,
it passes a <allow_multiple_clients> element
in scheduler requests.
- The scheduler skips the duplicate-host check on
requests that include this flag.
- There is no more <multiple_clients_per_host> scheduler option.
Note: if a project using the old mechanism upgrades to this change,
it will need to use new clients for its grid deployment.
svn path=/trunk/boinc/; revision=21839
2010-06-29 16:37:28 +00:00
|
|
|
int allow_multiple_clients;
|
|
|
|
// whether client allows multiple clients per host, -1 don't know
|
2009-08-18 20:44:12 +00:00
|
|
|
bool using_weak_auth;
|
|
|
|
// Request uses weak authenticator.
|
|
|
|
// Don't modify user prefs or CPID
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
int last_rpc_dayofyear;
|
|
|
|
int current_rpc_dayofyear;
|
2009-08-18 20:44:12 +00:00
|
|
|
|
2002-04-30 22:22:54 +00:00
|
|
|
SCHEDULER_REQUEST();
|
|
|
|
~SCHEDULER_REQUEST();
|
2008-07-01 16:34:51 +00:00
|
|
|
const char* parse(FILE*);
|
2005-01-31 16:10:49 +00:00
|
|
|
int write(FILE*); // write request info to file: not complete
|
2002-04-30 22:22:54 +00:00
|
|
|
};
|
|
|
|
|
2005-04-18 17:54:03 +00:00
|
|
|
// Keeps track of the bottleneck disk preference.
//
struct DISK_LIMITS {
    double max_used;
    double max_frac;
    double min_free;
};
|
|
|
|
|
2010-05-21 21:49:54 +00:00
|
|
|
// summary of a client's request for work, and our response to it
|
|
|
|
// Note: this is zeroed out in SCHEDULER_REPLY constructor
|
|
|
|
//
|
|
|
|
struct WORK_REQ {
|
|
|
|
bool anonymous_platform;
|
|
|
|
|
|
|
|
// Flags used by old-style scheduling,
|
|
|
|
// while making multiple passes through the work array
|
|
|
|
bool infeasible_only;
|
|
|
|
bool reliable_only;
|
|
|
|
bool user_apps_only;
|
|
|
|
bool beta_only;
|
|
|
|
|
|
|
|
// user preferences
|
|
|
|
bool no_cuda;
|
|
|
|
bool no_ati;
|
|
|
|
bool no_cpu;
|
|
|
|
bool allow_non_preferred_apps;
|
|
|
|
bool allow_beta_work;
|
|
|
|
std::vector<APP_INFO> preferred_apps;
|
|
|
|
|
|
|
|
bool has_reliable_version;
|
|
|
|
// whether the host has a reliable app version
|
|
|
|
|
|
|
|
int effective_ncpus;
|
|
|
|
int effective_ngpus;
|
|
|
|
|
|
|
|
// 6.7+ clients send separate requests for different resource types:
|
|
|
|
//
|
|
|
|
double cpu_req_secs; // instance-seconds requested
|
|
|
|
double cpu_req_instances; // number of idle instances, use if possible
|
|
|
|
double cuda_req_secs;
|
|
|
|
double cuda_req_instances;
|
|
|
|
double ati_req_secs;
|
|
|
|
double ati_req_instances;
|
|
|
|
inline bool need_cpu() {
|
|
|
|
return (cpu_req_secs>0) || (cpu_req_instances>0);
|
|
|
|
}
|
|
|
|
inline bool need_cuda() {
|
|
|
|
return (cuda_req_secs>0) || (cuda_req_instances>0);
|
|
|
|
}
|
|
|
|
inline bool need_ati() {
|
|
|
|
return (ati_req_secs>0) || (ati_req_instances>0);
|
|
|
|
}
|
|
|
|
inline void clear_cpu_req() {
|
|
|
|
cpu_req_secs = 0;
|
|
|
|
cpu_req_instances = 0;
|
|
|
|
}
|
|
|
|
inline void clear_gpu_req() {
|
|
|
|
cuda_req_secs = 0;
|
|
|
|
cuda_req_instances = 0;
|
|
|
|
ati_req_secs = 0;
|
|
|
|
ati_req_instances = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// older clients send a single number, the requested duration of jobs
|
|
|
|
//
|
|
|
|
double seconds_to_fill;
|
|
|
|
|
|
|
|
// true if new-type request
|
|
|
|
//
|
|
|
|
bool rsc_spec_request;
|
|
|
|
|
|
|
|
double disk_available;
|
|
|
|
double ram, usable_ram;
|
|
|
|
double running_frac;
|
|
|
|
int njobs_sent;
|
|
|
|
|
|
|
|
// The following keep track of the "easiest" job that was rejected
|
|
|
|
// by EDF simulation.
|
|
|
|
// Any jobs harder than this can be rejected without doing the simulation.
|
|
|
|
//
|
|
|
|
double edf_reject_min_cpu;
|
|
|
|
int edf_reject_max_delay_bound;
|
|
|
|
bool have_edf_reject;
|
|
|
|
void edf_reject(double cpu, int delay_bound) {
|
|
|
|
if (have_edf_reject) {
|
|
|
|
if (cpu < edf_reject_min_cpu) edf_reject_min_cpu = cpu;
|
|
|
|
if (delay_bound> edf_reject_max_delay_bound) edf_reject_max_delay_bound = delay_bound;
|
|
|
|
} else {
|
|
|
|
edf_reject_min_cpu = cpu;
|
|
|
|
edf_reject_max_delay_bound = delay_bound;
|
|
|
|
have_edf_reject = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bool edf_reject_test(double cpu, int delay_bound) {
|
|
|
|
if (!have_edf_reject) return false;
|
|
|
|
if (cpu < edf_reject_min_cpu) return false;
|
|
|
|
if (delay_bound > edf_reject_max_delay_bound) return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
RESOURCE disk;
|
|
|
|
RESOURCE mem;
|
|
|
|
RESOURCE speed;
|
|
|
|
RESOURCE bandwidth;
|
|
|
|
|
|
|
|
std::vector<USER_MESSAGE> no_work_messages;
|
|
|
|
std::vector<BEST_APP_VERSION*> best_app_versions;
|
|
|
|
std::vector<BEST_APP_VERSION*> all_best_app_versions;
|
|
|
|
std::vector<DB_HOST_APP_VERSION> host_app_versions;
|
|
|
|
std::vector<DB_HOST_APP_VERSION> host_app_versions_orig;
|
|
|
|
|
|
|
|
// various reasons for not sending jobs (used to explain why)
|
|
|
|
//
|
|
|
|
bool no_allowed_apps_available;
|
|
|
|
bool excessive_work_buf;
|
|
|
|
bool hr_reject_temp;
|
|
|
|
bool hr_reject_perm;
|
|
|
|
bool outdated_client;
|
|
|
|
bool no_cuda_prefs;
|
|
|
|
bool no_ati_prefs;
|
|
|
|
bool no_cpu_prefs;
|
|
|
|
bool max_jobs_on_host_exceeded;
|
|
|
|
bool max_jobs_on_host_cpu_exceeded;
|
|
|
|
bool max_jobs_on_host_gpu_exceeded;
|
|
|
|
bool no_jobs_available; // project has no work right now
|
|
|
|
|
|
|
|
//int max_jobs_per_day;
|
|
|
|
// host.max_results_day * (NCPUS + NGPUS*gpu_multiplier)
|
|
|
|
int max_jobs_per_rpc;
|
2010-06-01 23:41:07 +00:00
|
|
|
#if 0
|
2010-05-21 21:49:54 +00:00
|
|
|
int njobs_on_host;
|
|
|
|
// How many jobs from this project are in progress on the host.
|
|
|
|
// Initially this is the number of "other_results"
|
|
|
|
// reported in the request message.
|
|
|
|
// If the resend_lost_results option is used,
|
|
|
|
// it's set to the number of outstanding results taken from the DB
|
|
|
|
// (those that were lost are resent).
|
|
|
|
// As new results are sent, it's incremented.
|
|
|
|
int njobs_on_host_cpu;
|
|
|
|
// same, but just CPU jobs.
|
|
|
|
int njobs_on_host_gpu;
|
|
|
|
// same, but just GPU jobs.
|
|
|
|
int max_jobs_on_host;
|
|
|
|
int max_jobs_on_host_cpu;
|
|
|
|
int max_jobs_on_host_gpu;
|
2010-06-01 03:45:49 +00:00
|
|
|
#endif
|
2010-05-21 21:49:54 +00:00
|
|
|
void update_for_result(double seconds_filled);
|
|
|
|
void add_no_work_message(const char*);
|
|
|
|
void get_job_limits();
|
|
|
|
|
|
|
|
~WORK_REQ() {
|
|
|
|
for (unsigned int i=0; i<all_best_app_versions.size(); i++) {
|
|
|
|
delete all_best_app_versions[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2002-09-22 23:27:14 +00:00
|
|
|
// NOTE: if any field requires initialization,
|
|
|
|
// you must do it in the constructor. Nothing is zeroed by default.
|
|
|
|
//
|
2002-04-30 22:22:54 +00:00
|
|
|
struct SCHEDULER_REPLY {
|
2005-01-31 23:20:49 +00:00
|
|
|
WORK_REQ wreq;
|
2005-04-18 17:54:03 +00:00
|
|
|
DISK_LIMITS disk_limits;
|
2005-02-16 23:17:43 +00:00
|
|
|
double request_delay; // don't request again until this time elapses
|
2005-01-31 22:19:03 +00:00
|
|
|
std::vector<USER_MESSAGE> messages;
|
2004-10-01 22:00:39 +00:00
|
|
|
int hostid;
|
|
|
|
// nonzero only if a new host record was created.
|
|
|
|
// this tells client to reset rpc_seqno
|
2005-05-17 05:55:24 +00:00
|
|
|
int lockfile_fd; // file descriptor of lockfile, or -1 if no lock.
|
2006-10-06 18:52:50 +00:00
|
|
|
bool send_global_prefs;
|
2002-12-02 04:29:40 +00:00
|
|
|
bool nucleus_only; // send only message
|
2002-05-29 23:25:21 +00:00
|
|
|
USER user;
|
2004-05-03 02:18:35 +00:00
|
|
|
char email_hash[MD5_LEN];
|
2004-05-13 04:48:19 +00:00
|
|
|
HOST host; // after validation, contains full host rec
|
2003-02-24 21:31:36 +00:00
|
|
|
TEAM team;
|
2004-09-10 00:41:48 +00:00
|
|
|
std::vector<APP> apps;
|
|
|
|
std::vector<APP_VERSION> app_versions;
|
|
|
|
std::vector<WORKUNIT>wus;
|
|
|
|
std::vector<RESULT>results;
|
2005-06-22 06:02:59 +00:00
|
|
|
std::vector<std::string>result_acks;
|
2007-04-05 17:02:01 +00:00
|
|
|
std::vector<std::string>result_aborts;
|
|
|
|
std::vector<std::string>result_abort_if_not_starteds;
|
2004-09-10 00:41:48 +00:00
|
|
|
std::vector<MSG_TO_HOST>msgs_to_host;
|
|
|
|
std::vector<FILE_INFO>file_deletes;
|
2004-05-13 18:18:22 +00:00
|
|
|
char code_sign_key[4096];
|
|
|
|
char code_sign_key_signature[4096];
|
2004-06-24 21:00:13 +00:00
|
|
|
bool send_msg_ack;
|
2008-03-13 23:35:13 +00:00
|
|
|
bool project_is_down;
|
2002-04-30 22:22:54 +00:00
|
|
|
|
|
|
|
SCHEDULER_REPLY();
|
|
|
|
~SCHEDULER_REPLY();
|
2008-10-01 22:07:35 +00:00
|
|
|
int write(FILE*, SCHEDULER_REQUEST&);
|
2002-04-30 22:22:54 +00:00
|
|
|
void insert_app_unique(APP&);
|
|
|
|
void insert_app_version_unique(APP_VERSION&);
|
|
|
|
void insert_workunit_unique(WORKUNIT&);
|
|
|
|
void insert_result(RESULT&);
|
2009-08-21 19:14:15 +00:00
|
|
|
void insert_message(const char* msg, const char* prio);
|
|
|
|
void insert_message(USER_MESSAGE&);
|
2005-02-16 23:17:43 +00:00
|
|
|
void set_delay(double);
|
2002-04-30 22:22:54 +00:00
|
|
|
};
|
|
|
|
|
2008-12-15 21:14:32 +00:00
|
|
|
extern SCHEDULER_REQUEST* g_request;
|
|
|
|
extern SCHEDULER_REPLY* g_reply;
|
2008-12-16 16:29:54 +00:00
|
|
|
extern WORK_REQ* g_wreq;
|
2008-12-15 21:14:32 +00:00
|
|
|
|
2010-04-01 22:51:19 +00:00
|
|
|
static inline void add_no_work_message(const char* m) {
|
2009-08-21 20:38:39 +00:00
|
|
|
g_wreq->add_no_work_message(m);
|
|
|
|
}
|
|
|
|
|
2010-01-12 21:53:40 +00:00
|
|
|
extern void get_weak_auth(USER&, char*);
|
|
|
|
extern void get_rss_auth(USER&, char*);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
extern void read_host_app_versions();
|
|
|
|
extern DB_HOST_APP_VERSION* get_host_app_version(int gavid);
|
|
|
|
extern void write_host_app_versions();
|
|
|
|
|
|
|
|
extern DB_HOST_APP_VERSION* gavid_to_havp(int gavid);
|
|
|
|
extern DB_HOST_APP_VERSION* quota_exceeded_version();
|
2010-01-12 21:53:40 +00:00
|
|
|
|
2002-04-30 22:22:54 +00:00
|
|
|
#endif
|