// boinc/sched/sample_work_generator.cpp

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

// sample_work_generator: example BOINC work generator.
//
// --app name                   app name (default example_app)
// --in_template_file name      input template file (default example_app_in)
// --out_template_file name     output template file (default example_app_out)
// -d N                         log verbosity level (0..4)
// --help                       show usage
// --version                    show version
//
// - Runs as a daemon, and creates an unbounded supply of work.
//   It attempts to maintain a "cushion" of 10 unsent job instances
//   for the given app (see CUSHION below).
//   (your app may not work this way; e.g. you might create work in batches)
// - Creates a new input file for each job;
//   the file (and the workunit names) contain a timestamp
//   and sequence number, so they're unique.
//
// This is an example - customize for your needs

#include <sys/param.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

#include "backend_lib.h"
#include "boinc_db.h"
#include "error_numbers.h"
#include "filesys.h"
#include "parse.h"
#include "str_replace.h"
#include "str_util.h"
#include "svn_version.h"
#include "util.h"

#include "sched_config.h"
#include "sched_util.h"
#include "sched_msgs.h"

#define CUSHION 10
2007-04-18 20:49:58 +00:00
// maintain at least this many unsent results
#define REPLICATION_FACTOR 1
// number of instances of each job
2007-04-18 20:49:58 +00:00
const char* app_name = "example_app";
const char* in_template_file = "example_app_in";
const char* out_template_file = "example_app_out";
char* in_template;
2007-04-18 20:49:58 +00:00
DB_APP app;
int start_time;
int seqno;
// create one new job
//
int make_job() {
DB_WORKUNIT wu;
char name[256], path[MAXPATHLEN];
2007-04-18 20:49:58 +00:00
const char* infiles[1];
int retval;
2007-04-18 20:49:58 +00:00
// make a unique name (for the job and its input file)
//
sprintf(name, "%s_%d_%d", app_name, start_time, seqno++);
2007-04-18 20:49:58 +00:00
// Create the input file.
// Put it at the right place in the download dir hierarchy
//
retval = config.download_path(name, path);
if (retval) return retval;
2007-04-18 20:49:58 +00:00
FILE* f = fopen(path, "w");
if (!f) return ERR_FOPEN;
fprintf(f, "This is the input file for job %s", name);
fclose(f);
// Fill in the job parameters
//
wu.clear();
wu.appid = app.id;
safe_strcpy(wu.name, name);
wu.rsc_fpops_est = 1e12;
wu.rsc_fpops_bound = 1e14;
wu.rsc_memory_bound = 1e8;
wu.rsc_disk_bound = 1e8;
wu.delay_bound = 86400;
wu.min_quorum = REPLICATION_FACTOR;
2007-04-18 20:49:58 +00:00
wu.target_nresults = REPLICATION_FACTOR;
wu.max_error_results = REPLICATION_FACTOR*4;
wu.max_total_results = REPLICATION_FACTOR*8;
wu.max_success_results = REPLICATION_FACTOR*4;
infiles[0] = name;
2007-04-18 20:49:58 +00:00
// Register the job with BOINC
//
sprintf(path, "templates/%s", out_template_file);
2007-04-18 20:49:58 +00:00
return create_work(
wu,
in_template,
path,
config.project_path(path),
2007-04-18 20:49:58 +00:00
infiles,
1,
config
);
}
void main_loop() {
int retval;
while (1) {
check_stop_daemons();
long n;
retval = count_unsent_results(n, app.id);
if (retval) {
log_messages.printf(MSG_CRITICAL,
"count_unsent_jobs() failed: %s\n", boincerror(retval)
);
exit(retval);
}
2007-04-18 20:49:58 +00:00
if (n > CUSHION) {
daemon_sleep(10);
2007-04-18 20:49:58 +00:00
} else {
int njobs = (CUSHION-n)/REPLICATION_FACTOR;
log_messages.printf(MSG_DEBUG,
"Making %d jobs\n", njobs
);
2007-04-18 20:49:58 +00:00
for (int i=0; i<njobs; i++) {
retval = make_job();
if (retval) {
log_messages.printf(MSG_CRITICAL,
"can't make job: %s\n", boincerror(retval)
);
exit(retval);
}
2007-04-18 20:49:58 +00:00
}
// Wait for the transitioner to create instances
// of the jobs we just created.
// Otherwise we'll create too many jobs.
//
double now = dtime();
while (1) {
daemon_sleep(5);
double x;
retval = min_transition_time(x);
if (retval) {
log_messages.printf(MSG_CRITICAL,
"min_transition_time failed: %s\n", boincerror(retval)
);
exit(retval);
}
if (x > now) break;
}
2007-04-18 20:49:58 +00:00
}
}
}
// Print a description of this program and its command-line options to stderr.
//
// name: program name shown in the "Usage:" line (main() passes argv[0]).
//
// The cushion size quoted below is written out literally;
// keep it in sync with CUSHION.
//
void usage(char *name) {
    fprintf(stderr, "This is an example BOINC work generator.\n"
        "This work generator has the following properties\n"
        "(you may need to change some or all of these):\n"
        "- Attempts to maintain a \"cushion\" of 10 unsent job instances.\n"
        "  (your app may not work this way; e.g. you might create work in batches)\n"
        "- Creates work for the application \"example_app\" (unless --app is given).\n"
        "- Creates a new input file for each job;\n"
        "  the file (and the workunit names) contain a timestamp\n"
        "  and sequence number, so that they're unique.\n\n"
        "Usage: %s [OPTION]...\n\n"
        "Options:\n"
        "  [ --app X ]                 Application name (default: example_app)\n"
        "  [ --in_template_file X ]    Input template (default: example_app_in)\n"
        "  [ --out_template_file X ]   Output template (default: example_app_out)\n"
        "  [ -d X ]                    Sets debug level to X.\n"
        "  [ -h | --help ]             Shows this help text.\n"
        "  [ -v | --version ]          Shows version information.\n",
        name
    );
}

int main(int argc, char** argv) {
int i, retval;
char buf[256];
for (i=1; i<argc; i++) {
- server: change the following from per-host to per-(host, app version): - daily quota mechanism - reliable mechanism (accelerated retries) - "trusted" mechanism (adaptive replication) - scheduler: enforce host scale probation only for apps with host_scale_check set. - validator: do scale probation on invalid results (need this in addition to error and timeout cases) - feeder: update app version scales every 10 min, not 10 sec - back-end apps: support --foo as well as -foo for options Notes: - If you have, say, cuda, cuda23 and cuda_fermi plan classes, a host will have separate quotas for each one. That means it could error out on 100 jobs for cuda_fermi, and when its quota goes to zero, error out on 100 jobs for cuda23, etc. This is intentional; there may be cases where one version works but not the others. - host.error_rate and host.max_results_day are deprecated TODO: - the values in the app table for limits on jobs in progress etc. should override rather than config.xml. Implementation notes: scheduler: process_request(): read all host_app_versions for host at start; Compute "reliable" and "trusted" for each one. write modified records at end get_app_version(): add "reliable_only" arg; if set, use only reliable versions skip over-quota versions Multi-pass scheduling: if have at least one reliable version, do a pass for jobs that need reliable, and use only reliable versions. Then clear best_app_versions cache. Score-based scheduling: for need-reliable jobs, it will pick the fastest version, then give a score bonus if that version happens to be reliable. When get back a successful result from client: increase daily quota When get back an error result from client: impose scale probation decrease daily quota if not aborted Validator: when handling a WU, create a vector of HOST_APP_VERSION parallel to vector of RESULT. Pass it to assign_credit_set(). 
Make copies of originals so we can update only modified ones update HOST_APP_VERSION error rates Transitioner: decrease quota on timeout svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
if (is_arg(argv[i], "d")) {
if (!argv[++i]) {
log_messages.printf(MSG_CRITICAL, "%s requires an argument\n\n", argv[--i]);
usage(argv[0]);
exit(1);
}
int dl = atoi(argv[i]);
log_messages.set_debug_level(dl);
if (dl == 4) g_print_queries = true;
} else if (!strcmp(argv[i], "--app")) {
app_name = argv[++i];
} else if (!strcmp(argv[i], "--in_template_file")) {
in_template_file = argv[++i];
} else if (!strcmp(argv[i], "--out_template_file")) {
out_template_file = argv[++i];
- server: change the following from per-host to per-(host, app version): - daily quota mechanism - reliable mechanism (accelerated retries) - "trusted" mechanism (adaptive replication) - scheduler: enforce host scale probation only for apps with host_scale_check set. - validator: do scale probation on invalid results (need this in addition to error and timeout cases) - feeder: update app version scales every 10 min, not 10 sec - back-end apps: support --foo as well as -foo for options Notes: - If you have, say, cuda, cuda23 and cuda_fermi plan classes, a host will have separate quotas for each one. That means it could error out on 100 jobs for cuda_fermi, and when its quota goes to zero, error out on 100 jobs for cuda23, etc. This is intentional; there may be cases where one version works but not the others. - host.error_rate and host.max_results_day are deprecated TODO: - the values in the app table for limits on jobs in progress etc. should override rather than config.xml. Implementation notes: scheduler: process_request(): read all host_app_versions for host at start; Compute "reliable" and "trusted" for each one. write modified records at end get_app_version(): add "reliable_only" arg; if set, use only reliable versions skip over-quota versions Multi-pass scheduling: if have at least one reliable version, do a pass for jobs that need reliable, and use only reliable versions. Then clear best_app_versions cache. Score-based scheduling: for need-reliable jobs, it will pick the fastest version, then give a score bonus if that version happens to be reliable. When get back a successful result from client: increase daily quota When get back an error result from client: impose scale probation decrease daily quota if not aborted Validator: when handling a WU, create a vector of HOST_APP_VERSION parallel to vector of RESULT. Pass it to assign_credit_set(). 
Make copies of originals so we can update only modified ones update HOST_APP_VERSION error rates Transitioner: decrease quota on timeout svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
} else if (is_arg(argv[i], "h") || is_arg(argv[i], "help")) {
usage(argv[0]);
exit(0);
- server: change the following from per-host to per-(host, app version): - daily quota mechanism - reliable mechanism (accelerated retries) - "trusted" mechanism (adaptive replication) - scheduler: enforce host scale probation only for apps with host_scale_check set. - validator: do scale probation on invalid results (need this in addition to error and timeout cases) - feeder: update app version scales every 10 min, not 10 sec - back-end apps: support --foo as well as -foo for options Notes: - If you have, say, cuda, cuda23 and cuda_fermi plan classes, a host will have separate quotas for each one. That means it could error out on 100 jobs for cuda_fermi, and when its quota goes to zero, error out on 100 jobs for cuda23, etc. This is intentional; there may be cases where one version works but not the others. - host.error_rate and host.max_results_day are deprecated TODO: - the values in the app table for limits on jobs in progress etc. should override rather than config.xml. Implementation notes: scheduler: process_request(): read all host_app_versions for host at start; Compute "reliable" and "trusted" for each one. write modified records at end get_app_version(): add "reliable_only" arg; if set, use only reliable versions skip over-quota versions Multi-pass scheduling: if have at least one reliable version, do a pass for jobs that need reliable, and use only reliable versions. Then clear best_app_versions cache. Score-based scheduling: for need-reliable jobs, it will pick the fastest version, then give a score bonus if that version happens to be reliable. When get back a successful result from client: increase daily quota When get back an error result from client: impose scale probation decrease daily quota if not aborted Validator: when handling a WU, create a vector of HOST_APP_VERSION parallel to vector of RESULT. Pass it to assign_credit_set(). 
Make copies of originals so we can update only modified ones update HOST_APP_VERSION error rates Transitioner: decrease quota on timeout svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
} else if (is_arg(argv[i], "v") || is_arg(argv[i], "version")) {
printf("%s\n", SVN_VERSION);
exit(0);
} else {
log_messages.printf(MSG_CRITICAL, "unknown command line argument: %s\n\n", argv[i]);
usage(argv[0]);
exit(1);
}
}
2007-04-18 20:49:58 +00:00
retval = config.parse_file();
if (retval) {
log_messages.printf(MSG_CRITICAL,
"Can't parse config.xml: %s\n", boincerror(retval)
2007-04-18 20:49:58 +00:00
);
exit(1);
}
retval = boinc_db.open(
config.db_name, config.db_host, config.db_user, config.db_passwd
);
if (retval) {
log_messages.printf(MSG_CRITICAL, "can't open db\n");
2007-04-18 20:49:58 +00:00
exit(1);
}
snprintf(buf, sizeof(buf), "where name='%s'", app_name);
if (app.lookup(buf)) {
log_messages.printf(MSG_CRITICAL, "can't find app %s\n", app_name);
2007-04-18 20:49:58 +00:00
exit(1);
}
snprintf(buf, sizeof(buf), "templates/%s", in_template_file);
if (read_file_malloc(config.project_path(buf), in_template)) {
log_messages.printf(MSG_CRITICAL, "can't read input template %s\n", buf);
2007-04-18 20:49:58 +00:00
exit(1);
}
start_time = time(0);
seqno = 0;
log_messages.printf(MSG_NORMAL, "Starting\n");
2007-04-18 20:49:58 +00:00
main_loop();
}