2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2005-01-20 23:22:22 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2008-08-06 18:36:30 +00:00
|
|
|
// Copyright (C) 2008 University of California
|
2004-07-13 13:54:09 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2004-07-13 13:54:09 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2005-01-20 23:22:22 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
2003-09-27 23:20:40 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2003-09-27 23:20:40 +00:00
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// wu_check
|
2003-10-07 04:47:00 +00:00
|
|
|
// look for results with missing input files
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// --repair change them to server_state OVER, outcome COULDNT_SEND
|
2009-08-31 19:35:46 +00:00
|
|
|
//
|
|
|
|
// NOTE 1: this assumes that jobs have a single input file.
|
|
|
|
// NOTE 2: should rewrite to enumerate WUs, not results
|
2003-10-07 04:47:00 +00:00
|
|
|
|
2005-11-21 18:34:44 +00:00
|
|
|
#include "config.h"
|
2012-05-09 16:11:50 +00:00
|
|
|
#include <sys/param.h>
|
2004-07-13 13:54:09 +00:00
|
|
|
#include <cstdio>
|
2008-02-27 23:26:38 +00:00
|
|
|
#include <cstdlib>
|
|
|
|
#include <cstring>
|
|
|
|
#include <string>
|
2003-09-27 23:20:40 +00:00
|
|
|
|
2005-01-08 07:44:22 +00:00
|
|
|
#include "boinc_db.h"
|
2009-09-17 17:56:59 +00:00
|
|
|
#include "svn_version.h"
|
2005-01-08 07:44:22 +00:00
|
|
|
|
2010-11-08 17:51:57 +00:00
|
|
|
#include "error_numbers.h"
|
2013-08-23 00:01:45 +00:00
|
|
|
#include "filesys.h"
|
2003-09-27 23:20:40 +00:00
|
|
|
#include "parse.h"
|
2004-08-06 11:42:41 +00:00
|
|
|
#include "util.h"
|
2005-01-08 07:44:22 +00:00
|
|
|
|
2003-09-27 23:20:40 +00:00
|
|
|
#include "sched_config.h"
|
2005-01-08 07:44:22 +00:00
|
|
|
#include "sched_util.h"
|
2003-09-27 23:20:40 +00:00
|
|
|
|
2003-10-07 04:47:00 +00:00
|
|
|
// Set to true by the --repair command-line option.
// When true, handle_result() marks unsent results whose input file
// is missing as server_state OVER, outcome COULDNT_SEND.
//
bool repair = false;
|
|
|
|
|
2009-08-31 19:35:46 +00:00
|
|
|
// wu_check
|
2003-09-27 23:20:40 +00:00
|
|
|
// See whether input files that should be present, are
|
|
|
|
|
|
|
|
// get the path a WU's input file
|
|
|
|
//
|
|
|
|
int get_file_path(WORKUNIT& wu, char* path) {
|
|
|
|
char buf[256];
|
|
|
|
bool flag;
|
|
|
|
flag = parse_str(wu.xml_doc, "<name>", buf, sizeof(buf));
|
2003-10-21 04:06:55 +00:00
|
|
|
if (!flag) return ERR_XML_PARSE;
|
2005-09-23 21:09:00 +00:00
|
|
|
dir_hier_path(buf, config.download_dir, config.uldl_dir_fanout, path);
|
2003-09-27 23:20:40 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2003-10-10 22:55:29 +00:00
|
|
|
// Check that the input file of a result's workunit exists on disk.
//
// A message is printed to stdout for every problem found.
// If the global "repair" flag is set and the result is still unsent,
// the result is changed to server_state OVER, outcome COULDNT_SEND
// and those two fields are written back to the DB.
//
// Returns 0 if the input file could be opened, 1 otherwise
// (WU lookup failed, WU XML had no file name, file missing,
// or the repair update failed).
// Note that a successfully repaired result still returns 1,
// so it is counted as an error by the caller.
//
int handle_result(DB_RESULT& result) {
    DB_WORKUNIT wu;
    int retval;
    char path[MAXPATHLEN];
    char buf[256];
    FILE* f;

    retval = wu.lookup_id(result.workunitid);
    if (retval) {
        printf(
            "ERROR: can't find WU %d for result %d\n",
            result.workunitid, result.id
        );
        return 1;
    }

    // get_file_path() doesn't write "path" when it fails,
    // so bail out rather than hand an uninitialized buffer to fopen()
    //
    retval = get_file_path(wu, path);
    if (retval) {
        printf(
            "ERROR: can't get input file path for WU %d (result %d)\n",
            result.workunitid, result.id
        );
        return 1;
    }

    // we only care whether the file can be opened, not about its contents
    //
    f = fopen(path, "r");
    if (f) {
        fclose(f);
        return 0;
    }

    printf("no file %s for result %d\n",
        path, result.id
    );

    // only results that haven't been sent yet are repaired
    //
    if (repair && result.server_state == RESULT_SERVER_STATE_UNSENT) {
        result.server_state = RESULT_SERVER_STATE_OVER;
        result.outcome = RESULT_OUTCOME_COULDNT_SEND;
        snprintf(
            buf, sizeof(buf), "server_state=%d, outcome=%d",
            result.server_state, result.outcome
        );
        retval = result.update_field(buf);
        if (retval) {
            printf(
                "ERROR: can't update result %d\n",
                result.id
            );
            return 1;
        }
    }
    return 1;
}
|
|
|
|
|
2009-09-17 17:56:59 +00:00
|
|
|
// Write the help text (purpose, usage line, and option list) to stderr.
// "name" is the program name shown in the usage line;
// main() passes argv[0].
//
void usage(char *name) {
    fputs("Looks for results with missing input files\n\n", stderr);
    fprintf(stderr, "Usage: %s [OPTION]\n\n", name);
    fputs(
        "Options:\n"
        " [ --repair ] change them to server_state OVER,\n"
        " outcome COULDNT_SEND\n"
        " [ -h | --help ] Shows this help text\n"
        " [ -v | --version ] Shows version information\n",
        stderr
    );
}
|
|
|
|
|
2003-10-07 04:47:00 +00:00
|
|
|
int main(int argc, char** argv) {
|
2003-09-27 23:20:40 +00:00
|
|
|
DB_RESULT result;
|
|
|
|
char clause[256];
|
2003-10-10 22:55:29 +00:00
|
|
|
int retval, n, nerr;
|
2003-10-07 04:47:00 +00:00
|
|
|
|
2009-09-17 17:56:59 +00:00
|
|
|
for(int c = 1; c < argc; c++) {
|
|
|
|
std::string option(argv[c]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
if(option == "-h" || option == "--help") {
|
2009-09-17 17:56:59 +00:00
|
|
|
usage(argv[0]);
|
|
|
|
exit(0);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if(option == "-v" || option == "--version") {
|
2009-09-17 17:56:59 +00:00
|
|
|
printf("%s\n", SVN_VERSION);
|
|
|
|
exit(0);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (option == "--repair") {
|
2009-09-17 17:56:59 +00:00
|
|
|
repair = true;
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "unknown command line argument: %s\n\n", argv[c]);
|
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-10-07 04:47:00 +00:00
|
|
|
retval = config.parse_file();
|
|
|
|
if (retval) exit(1);
|
2003-09-27 23:20:40 +00:00
|
|
|
|
2010-11-08 17:51:57 +00:00
|
|
|
retval = boinc_db.open(
|
|
|
|
config.db_name, config.db_host, config.db_user, config.db_passwd
|
|
|
|
);
|
2003-10-10 22:55:29 +00:00
|
|
|
if (retval) {
|
2010-11-08 17:51:57 +00:00
|
|
|
printf("boinc_db.open: %s\n", boincerror(retval));
|
2003-10-10 22:55:29 +00:00
|
|
|
exit(1);
|
|
|
|
}
|
2003-09-27 23:20:40 +00:00
|
|
|
|
2003-10-10 22:55:29 +00:00
|
|
|
n = nerr = 0;
|
|
|
|
printf("Unsent results:\n");
|
2003-09-27 23:20:40 +00:00
|
|
|
sprintf(clause, "where server_state=%d", RESULT_SERVER_STATE_UNSENT);
|
|
|
|
while (!result.enumerate(clause)) {
|
2003-10-10 22:55:29 +00:00
|
|
|
retval = handle_result(result);
|
|
|
|
n++;
|
|
|
|
if (retval) nerr++;
|
2003-09-27 23:20:40 +00:00
|
|
|
}
|
2003-10-10 22:55:29 +00:00
|
|
|
printf("%d out of %d errors\n", nerr, n);
|
|
|
|
n = nerr = 0;
|
|
|
|
printf("In progress results:\n");
|
2003-09-27 23:20:40 +00:00
|
|
|
sprintf(clause, "where server_state=%d", RESULT_SERVER_STATE_IN_PROGRESS);
|
|
|
|
while (!result.enumerate(clause)) {
|
2003-10-10 22:55:29 +00:00
|
|
|
retval = handle_result(result);
|
|
|
|
n++;
|
|
|
|
if (retval) nerr++;
|
2003-09-27 23:20:40 +00:00
|
|
|
}
|
2003-10-10 22:55:29 +00:00
|
|
|
printf("%d out of %d errors\n", nerr, n);
|
2003-09-27 23:20:40 +00:00
|
|
|
}
|
2004-12-08 00:40:19 +00:00
|
|
|
|
2005-01-02 18:29:53 +00:00
|
|
|
// Version-control "$Id$" keyword string; not referenced elsewhere in this file.
// NOTE(review): presumably kept so the ID is embedded in the binary - confirm.
const char *BOINC_RCSID_8f4e399992 = "$Id$";
|