2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2005-01-20 23:22:22 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2008-08-06 18:36:30 +00:00
|
|
|
// Copyright (C) 2008 University of California
|
2003-07-02 20:57:59 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2003-07-02 20:57:59 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2005-01-20 23:22:22 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
2003-01-07 08:11:16 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2003-01-07 08:11:16 +00:00
|
|
|
|
|
|
|
|
2008-11-03 22:56:16 +00:00
|
|
|
// file deleter. See usage() below for usage.
|
2007-04-17 17:12:44 +00:00
|
|
|
|
|
|
|
// enum sizes. RESULTS_PER_ENUM is three times larger on the
|
|
|
|
// assumption of 3-fold average redundancy.
|
|
|
|
// This balances the rate at which input and output files are deleted
|
|
|
|
//
|
|
|
|
// Maximum rows fetched per enumeration in one pass:
// workunits (input files) and results (output files).
#define WUS_PER_ENUM        500
#define RESULTS_PER_ENUM    1500

// Antique-file cleanup, in seconds: how long to wait before the
// first pass, and how long to wait between later passes.
// ANTIQUE_LIMIT caps the number of files handled in a single pass.
//
#define ANTIQUE_DELAY       3600
#define ANTIQUE_INTERVAL    86400
#define ANTIQUE_LIMIT       50000

// Seconds between attempts to retry deletions that previously failed.
//
#define ERROR_INTERVAL      3600
|
2003-01-07 08:11:16 +00:00
|
|
|
|
2005-11-21 18:34:44 +00:00
|
|
|
#include "config.h"
|
2005-08-20 02:45:49 +00:00
|
|
|
#include <list>
|
2004-07-13 13:54:09 +00:00
|
|
|
#include <cstring>
|
2008-02-27 23:26:38 +00:00
|
|
|
#include <string>
|
2004-07-13 13:54:09 +00:00
|
|
|
#include <cstdlib>
|
|
|
|
#include <ctime>
|
2002-12-17 19:00:43 +00:00
|
|
|
#include <unistd.h>
|
2004-11-29 09:47:48 +00:00
|
|
|
#include <errno.h>
|
2005-08-20 02:45:49 +00:00
|
|
|
#include <pwd.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
2005-08-22 19:29:27 +00:00
|
|
|
#ifdef HAVE_STRINGS_H
|
|
|
|
#include <strings.h>
|
|
|
|
#endif
|
2002-12-17 19:00:43 +00:00
|
|
|
|
2003-04-07 19:06:00 +00:00
|
|
|
#include "boinc_db.h"
|
2002-12-17 19:00:43 +00:00
|
|
|
#include "parse.h"
|
2005-05-26 20:42:35 +00:00
|
|
|
#include "error_numbers.h"
|
2007-04-17 17:12:44 +00:00
|
|
|
#include "util.h"
|
2007-02-21 16:26:51 +00:00
|
|
|
#include "str_util.h"
|
2009-06-16 20:54:44 +00:00
|
|
|
#include "str_replace.h"
|
2005-01-08 06:54:03 +00:00
|
|
|
#include "filesys.h"
|
2005-08-24 21:55:37 +00:00
|
|
|
#include "strings.h"
|
2009-09-17 17:56:59 +00:00
|
|
|
#include "svn_version.h"
|
2005-01-08 06:54:03 +00:00
|
|
|
|
2003-08-15 00:45:25 +00:00
|
|
|
#include "sched_config.h"
|
2003-03-08 00:09:40 +00:00
|
|
|
#include "sched_util.h"
|
2004-04-08 08:15:23 +00:00
|
|
|
#include "sched_msgs.h"
|
2002-12-17 19:00:43 +00:00
|
|
|
|
2003-02-10 19:51:32 +00:00
|
|
|
// File names used by this daemon.
// NOTE(review): how these are opened is not visible in this part of
// the file - confirm against main().
#define LOCKFILE                "file_deleter.out"
#define PIDFILE                 "file_deleter.pid"

// Default value of sleep_interval (seconds, per the usage text).
#define DEFAULT_SLEEP_INTERVAL  5

// An estimate of redundancy (results per workunit).
#define RESULTS_PER_WU          4
|
2004-12-06 22:41:19 +00:00
|
|
|
|
2009-03-03 00:12:55 +00:00
|
|
|
int id_modulus=0, id_remainder=0, appid=0;
|
2007-04-17 17:12:44 +00:00
|
|
|
bool dont_retry_errors = false;
|
|
|
|
bool dont_delete_antiques = false;
|
|
|
|
bool dont_delete_batches = false;
|
2008-08-06 20:31:48 +00:00
|
|
|
int antique_delay = ANTIQUE_DELAY;
|
2010-01-12 12:31:23 +00:00
|
|
|
int antique_interval = ANTIQUE_INTERVAL;
|
|
|
|
int antique_limit = ANTIQUE_LIMIT;
|
2008-11-03 22:56:16 +00:00
|
|
|
bool do_input_files = true;
|
|
|
|
bool do_output_files = true;
|
2008-11-26 20:37:11 +00:00
|
|
|
int sleep_interval = DEFAULT_SLEEP_INTERVAL;
|
2008-11-03 22:56:16 +00:00
|
|
|
|
2009-09-17 17:56:59 +00:00
|
|
|
// Print the help text to stderr.
// name is the program name shown in the "Usage:" line.
//
void usage(char *name) {
    static const char* const description =
        "Deletes files that are no longer needed.\n\n"
        "Default operation:\n"
        "1) enumerate N WUs and M results (N,M compile params)\n"
        " that are ready to file-delete, and try to delete their files\n"
        "2) if the enums didn't yield anything, sleep for K seconds\n"
        "3) repeat from 1)\n"
        "4) every 1 hour, enumerate everything in state FILE_DELETE_ERROR\n"
        " and try to delete it.\n"
        "5) after 1 hour, and every 24 hours thereafter,\n"
        " scan for and delete all files in the upload directories\n"
        " that are older than any WU in the database,\n"
        " and were created at least one month ago.\n"
        " This deletes files uploaded by hosts after the WU was deleted.\n\n";

    static const char* const options =
        "Options:\n"
        " -d N | --debug_level N set debug output level (1 to 4)\n"
        " --mod M R handle only WUs with ID mod M == R\n"
        " --appid ID handle only WUs of a particular app\n"
        " --one_pass instead of sleeping in 2), exit\n"
        " --delete_antiques_now do 5) immediately\n"
        " --dont_retry_error don't do 4)\n"
        " --dont_delete_antiques don't do 5)\n"
        " --delete_antiques_interval change the interval between delete antique passes (in seconds, defaults to 24h)\n"
        " --delete_antiques_limit change the maximum number of files deleted in one delete antique pass (defaults to 50000)\n"
        " --preserve_result_files update the DB, but don't delete output files.\n"
        " For debugging.\n"
        " --preserve_wu_files update the DB, but don't delete input files.\n"
        " For debugging.\n"
        " --dont_delete_batches don't delete anything with positive batch number\n"
        " --input_files_only delete only input (download) files\n"
        " --output_files_only delete only output (upload) files\n"
        " [ -h | --help ] shows this help text\n"
        " [ -v | --version ] shows version information\n";

    fputs(description, stderr);
    fprintf(stderr, "Usage: %s [OPTION]...\n\n", name);
    fputs(options, stderr);
}
|
2005-07-26 22:14:55 +00:00
|
|
|
|
2009-06-10 17:42:18 +00:00
|
|
|
// Given a filename, find its full path in the upload directory hierarchy
|
|
|
|
// Return ERR_OPENDIR if dir isn't there (possibly recoverable error),
|
|
|
|
// ERR_NOT_FOUND if dir is there but not file
|
2005-01-03 18:00:53 +00:00
|
|
|
//
|
2005-10-23 07:19:03 +00:00
|
|
|
int get_file_path(
|
|
|
|
const char *filename, char* upload_dir, int fanout, char* path
|
|
|
|
) {
|
2009-09-10 21:24:39 +00:00
|
|
|
dir_hier_path(filename, upload_dir, fanout, path, true);
|
2005-05-26 20:23:24 +00:00
|
|
|
if (boinc_file_exists(path)) {
|
2005-01-03 18:00:53 +00:00
|
|
|
return 0;
|
2005-05-26 20:23:24 +00:00
|
|
|
}
|
2009-06-10 17:42:18 +00:00
|
|
|
char* p = strrchr(path, '/');
|
|
|
|
*p = 0;
|
|
|
|
if (boinc_file_exists(path)) {
|
|
|
|
return ERR_NOT_FOUND;
|
|
|
|
}
|
|
|
|
return ERR_OPENDIR;
|
2005-01-03 18:00:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2002-12-17 19:00:43 +00:00
|
|
|
// Delete the files listed in a workunit's XML document from the
// download directory hierarchy.
//
// The XML is scanned line by line. For each <file_info> element that
// has no <no_delete/> tag, the file named by <name> is unlinked and,
// if config.cache_md5_info is set, so is its ".md5" companion.
// Workunits whose name contains "nodelete" are skipped entirely.
//
// Returns 0 on success, or ERR_UNLINK if a directory was missing or
// an unlink of a data file failed. A file that is simply not found,
// or a failure to remove a ".md5" file, is logged but does not
// affect the return value.
//
int wu_delete_files(WORKUNIT& wu) {
    char* p;
    char filename[256], pathname[256], buf[BLOB_SIZE];
    // separate buffer so that appending ".md5" can't overflow pathname
    char md5_path[sizeof(pathname) + 8];
    bool no_delete=false;
    int count_deleted = 0, retval, mthd_retval = 0;

    if (strstr(wu.name, "nodelete")) return 0;

    safe_strcpy(buf, wu.xml_doc);

    p = strtok(buf, "\n");
    strcpy(filename, "");
    while (p) {
        if (parse_str(p, "<name>", filename, sizeof(filename))) {
            // filename now holds the current file's name
        } else if (match_tag(p, "<file_info>")) {
            no_delete = false;
            strcpy(filename, "");
        } else if (match_tag(p, "<no_delete/>")) {
            no_delete = true;
        } else if (match_tag(p, "</file_info>")) {
            if (!no_delete) {
                retval = get_file_path(
                    filename, config.download_dir, config.uldl_dir_fanout,
                    pathname
                );
                if (retval == ERR_OPENDIR) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d] missing dir for %s\n",
                        wu.id, filename
                    );
                    mthd_retval = ERR_UNLINK;
                } else if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%d] get_file_path: %s: %s\n",
                        wu.id, filename, boincerror(retval)
                    );
                } else {
                    log_messages.printf(MSG_NORMAL,
                        "[WU#%d] deleting %s\n", wu.id, filename
                    );
                    if (unlink(pathname)) {
                        // unlink() reports the cause in errno;
                        // its return value is not a BOINC error code
                        int saved_errno = errno;
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%d] unlink %s failed: %s\n",
                            wu.id, filename, strerror(saved_errno)
                        );
                        mthd_retval = ERR_UNLINK;
                    } else {
                        count_deleted++;
                    }

                    // delete the cached MD5 file if needed
                    //
                    if (config.cache_md5_info) {
                        snprintf(md5_path, sizeof(md5_path),
                            "%s.md5", pathname
                        );
                        log_messages.printf(MSG_NORMAL,
                            "[WU#%d] deleting %s.md5\n", wu.id, filename
                        );
                        if (unlink(md5_path)) {
                            int saved_errno = errno;
                            log_messages.printf(MSG_CRITICAL,
                                "[WU#%d] unlink %s.md5 failed: %s\n",
                                wu.id, filename, strerror(saved_errno)
                            );
                        }
                    }
                }
            }
        }
        p = strtok(0, "\n");
    }
    log_messages.printf(MSG_DEBUG,
        "[WU#%d] deleted %d file(s)\n", wu.id, count_deleted
    );
    return mthd_retval;
}
|
|
|
|
|
|
|
|
// Delete the files listed in a result's input XML document
// (xml_doc_in) from the upload directory hierarchy.
//
// The XML is scanned line by line. For each <file_info> element that
// has no <no_delete/> tag, the file named by <name> is unlinked.
//
// Returns 0 on success, ERR_OPENDIR if a containing directory was
// missing, or ERR_UNLINK if an unlink failed (the last such error
// wins). A file that is simply not found is logged but is not
// treated as an error.
//
int result_delete_files(RESULT& result) {
    char* p;
    char filename[256], pathname[256], buf[BLOB_SIZE];
    bool no_delete=false;
    int count_deleted = 0, retval, mthd_retval = 0;

    safe_strcpy(buf, result.xml_doc_in);
    p = strtok(buf,"\n");
    // start from a defined (empty) name, as wu_delete_files() does
    strcpy(filename, "");
    while (p) {
        if (parse_str(p, "<name>", filename, sizeof(filename))) {
            // filename now holds the current file's name
        } else if (match_tag(p, "<file_info>")) {
            no_delete = false;
            strcpy(filename, "");
        } else if (match_tag(p, "<no_delete/>")) {
            no_delete = true;
        } else if (match_tag(p, "</file_info>")) {
            if (!no_delete) {
                retval = get_file_path(
                    filename, config.upload_dir, config.uldl_dir_fanout,
                    pathname
                );
                if (retval == ERR_OPENDIR) {
                    mthd_retval = ERR_OPENDIR;
                    log_messages.printf(MSG_CRITICAL,
                        "[RESULT#%d] missing dir for %s\n",
                        result.id, pathname
                    );
                } else if (retval) {
                    // the fact that no result files were found is a
                    // critical error if this was a successful result,
                    // but is to be expected if the result outcome was
                    // failure, since in that case there may well be
                    // no output file produced.
                    //
                    int debug_or_crit;
                    if (RESULT_OUTCOME_SUCCESS == result.outcome) {
                        debug_or_crit=MSG_CRITICAL;
                    } else {
                        debug_or_crit=MSG_DEBUG;
                    }
                    log_messages.printf(debug_or_crit,
                        "[RESULT#%d] outcome=%d client_state=%d No file %s to delete\n",
                        result.id, result.outcome, result.client_state, filename
                    );
                } else {
                    if (unlink(pathname)) {
                        // unlink() reports the cause in errno;
                        // its return value is not a BOINC error code
                        int saved_errno = errno;
                        mthd_retval = ERR_UNLINK;
                        log_messages.printf(MSG_CRITICAL,
                            "[RESULT#%d] unlink %s error: %s\n",
                            result.id, pathname, strerror(saved_errno)
                        );
                    } else {
                        count_deleted++;
                        log_messages.printf(MSG_NORMAL,
                            "[RESULT#%d] unlinked %s\n", result.id, pathname
                        );
                    }
                }
            }
        }
        p = strtok(0, "\n");
    }

    log_messages.printf(MSG_DEBUG,
        "[RESULT#%d] deleted %d file(s)\n", result.id, count_deleted
    );
    return mthd_retval;
}
|
|
|
|
|
2004-11-29 09:23:51 +00:00
|
|
|
// Debugging switches, set by the corresponding command line arguments.
// When one is set, do_pass() skips the actual file deletion for that
// kind of record but still updates file_delete_state in the DB.
//
static bool preserve_wu_files = false;
static bool preserve_result_files = false;
|
|
|
|
|
2009-09-10 21:24:39 +00:00
|
|
|
// return true if we changed the file_delete_state of a WU or a result
|
2002-12-17 19:00:43 +00:00
|
|
|
//
|
2005-06-02 07:04:29 +00:00
|
|
|
bool do_pass(bool retry_error) {
|
2003-06-04 17:21:26 +00:00
|
|
|
DB_WORKUNIT wu;
|
|
|
|
DB_RESULT result;
|
2005-06-02 07:04:29 +00:00
|
|
|
bool did_something = false;
|
2003-06-04 17:21:26 +00:00
|
|
|
char buf[256];
|
2006-06-07 00:09:05 +00:00
|
|
|
char clause[256];
|
2009-09-10 21:24:39 +00:00
|
|
|
int retval, new_state;
|
2002-12-17 19:00:43 +00:00
|
|
|
|
2004-05-03 19:30:01 +00:00
|
|
|
check_stop_daemons();
|
2003-03-08 00:09:40 +00:00
|
|
|
|
2006-06-07 00:09:05 +00:00
|
|
|
strcpy(clause, "");
|
2005-07-26 22:14:55 +00:00
|
|
|
if (id_modulus) {
|
2006-06-07 00:09:05 +00:00
|
|
|
sprintf(clause, " and id %% %d = %d ", id_modulus, id_remainder);
|
|
|
|
}
|
2007-04-17 17:12:44 +00:00
|
|
|
if (dont_delete_batches) {
|
2006-06-07 00:09:05 +00:00
|
|
|
strcat(clause, " and batch <= 0 ");
|
2005-07-26 22:14:55 +00:00
|
|
|
}
|
2009-03-03 00:12:55 +00:00
|
|
|
if (appid) {
|
|
|
|
sprintf(buf, " and appid = %d ", appid);
|
|
|
|
strcat(clause, buf);
|
|
|
|
}
|
2007-04-17 17:12:44 +00:00
|
|
|
sprintf(buf,
|
|
|
|
"where file_delete_state=%d %s limit %d",
|
|
|
|
retry_error?FILE_DELETE_ERROR:FILE_DELETE_READY,
|
|
|
|
clause, WUS_PER_ENUM
|
|
|
|
);
|
|
|
|
|
2008-11-03 22:56:16 +00:00
|
|
|
while (do_input_files) {
|
2008-03-27 18:25:29 +00:00
|
|
|
retval = wu.enumerate(buf);
|
|
|
|
if (retval) {
|
|
|
|
if (retval != ERR_DB_NOT_FOUND) {
|
2008-10-27 21:23:07 +00:00
|
|
|
log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
|
2008-03-27 18:25:29 +00:00
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2005-08-20 02:45:49 +00:00
|
|
|
|
2009-09-10 21:24:39 +00:00
|
|
|
if (preserve_wu_files) {
|
|
|
|
retval = 0;
|
|
|
|
} else {
|
2005-05-26 20:23:24 +00:00
|
|
|
retval = wu_delete_files(wu);
|
2004-12-20 20:47:25 +00:00
|
|
|
}
|
2005-05-26 20:42:35 +00:00
|
|
|
if (retval) {
|
2009-09-10 21:24:39 +00:00
|
|
|
new_state = FILE_DELETE_ERROR;
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"[WU#%d] file deletion failed: %s\n", wu.id, boincerror(retval)
|
2005-09-26 23:28:48 +00:00
|
|
|
);
|
2005-05-26 20:42:35 +00:00
|
|
|
} else {
|
2009-09-10 21:24:39 +00:00
|
|
|
new_state = FILE_DELETE_DONE;
|
2004-10-04 23:59:51 +00:00
|
|
|
}
|
2009-09-10 21:24:39 +00:00
|
|
|
if (new_state != wu.file_delete_state) {
|
|
|
|
sprintf(buf, "file_delete_state=%d", new_state);
|
|
|
|
retval = wu.update_field(buf);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"[WU#%d] update failed: %s\n", wu.id, boincerror(retval)
|
2009-09-10 21:24:39 +00:00
|
|
|
);
|
|
|
|
} else {
|
|
|
|
log_messages.printf(MSG_DEBUG,
|
|
|
|
"[WU#%d] file_delete_state updated\n", wu.id
|
|
|
|
);
|
|
|
|
did_something = true;
|
|
|
|
}
|
2009-05-12 22:44:04 +00:00
|
|
|
}
|
2002-12-17 19:00:43 +00:00
|
|
|
}
|
|
|
|
|
2007-04-17 17:12:44 +00:00
|
|
|
sprintf(buf,
|
|
|
|
"where file_delete_state=%d %s limit %d",
|
|
|
|
retry_error?FILE_DELETE_ERROR:FILE_DELETE_READY,
|
|
|
|
clause, RESULTS_PER_ENUM
|
|
|
|
);
|
2005-08-11 23:53:24 +00:00
|
|
|
|
2008-11-03 22:56:16 +00:00
|
|
|
while (do_output_files) {
|
2008-10-27 21:23:07 +00:00
|
|
|
retval = result.enumerate(buf);
|
|
|
|
if (retval) {
|
|
|
|
if (retval != ERR_DB_NOT_FOUND) {
|
|
|
|
log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2009-09-10 21:24:39 +00:00
|
|
|
if (preserve_result_files) {
|
|
|
|
retval = 0;
|
|
|
|
} else {
|
2007-04-17 17:12:44 +00:00
|
|
|
retval = result_delete_files(result);
|
2005-05-26 20:23:24 +00:00
|
|
|
}
|
2007-04-17 17:12:44 +00:00
|
|
|
if (retval) {
|
2009-09-10 21:24:39 +00:00
|
|
|
new_state = FILE_DELETE_ERROR;
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"[RESULT#%d] file deletion failed: %s\n", result.id, boincerror(retval)
|
2007-04-17 17:12:44 +00:00
|
|
|
);
|
|
|
|
} else {
|
2009-09-10 21:24:39 +00:00
|
|
|
new_state = FILE_DELETE_DONE;
|
2005-08-08 09:11:37 +00:00
|
|
|
}
|
2009-09-10 21:24:39 +00:00
|
|
|
if (new_state != result.file_delete_state) {
|
|
|
|
sprintf(buf, "file_delete_state=%d", new_state);
|
|
|
|
retval = result.update_field(buf);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"[RESULT#%d] update failed: %s\n", result.id, boincerror(retval)
|
2009-09-10 21:24:39 +00:00
|
|
|
);
|
|
|
|
} else {
|
|
|
|
log_messages.printf(MSG_DEBUG,
|
|
|
|
"[RESULT#%d] file_delete_state updated\n", result.id
|
|
|
|
);
|
|
|
|
did_something = true;
|
|
|
|
}
|
2009-05-12 22:44:04 +00:00
|
|
|
}
|
2005-08-11 23:53:24 +00:00
|
|
|
}
|
2005-08-08 09:11:37 +00:00
|
|
|
|
2005-08-11 23:53:24 +00:00
|
|
|
return did_something;
|
2005-08-08 09:11:37 +00:00
|
|
|
}
|
|
|
|
|
2005-08-20 02:45:49 +00:00
|
|
|
// One antique file: its name and modification time.
//
struct FILE_RECORD {
    std::string name;
    int date_modified;
};

// Two records are equal when both the name and the mod time match.
//
bool operator == (const FILE_RECORD& lhs, const FILE_RECORD& rhs) {
    return lhs.name == rhs.name && lhs.date_modified == rhs.date_modified;
}

// Ordering: by mod time first; ties are broken by name.
//
bool operator < (const FILE_RECORD& lhs, const FILE_RECORD& rhs) {
    if (lhs.date_modified != rhs.date_modified) {
        return lhs.date_modified < rhs.date_modified;
    }
    return lhs.name < rhs.name;
}

// list of antique files to delete,
// sorted by mod time (primary key) and name (secondary key)
//
std::list<FILE_RECORD> files_to_delete;
|
|
|
|
|
2007-04-17 17:12:44 +00:00
|
|
|
// delete files in antique files list, and empty the list.
|
|
|
|
// Returns number of files deleted, or negative for error.
|
|
|
|
//
|
2009-06-10 17:42:18 +00:00
|
|
|
// TODO: the list contains filenames, and we convert these to paths.
|
|
|
|
// This is wacked. The list should contain paths.
|
|
|
|
//
|
2007-04-17 17:12:44 +00:00
|
|
|
int delete_antique_files() {
|
2005-08-20 02:45:49 +00:00
|
|
|
int nfiles=0;
|
|
|
|
|
2008-10-28 21:59:25 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2010-07-08 18:02:07 +00:00
|
|
|
"delete_antique_files(): start (%d files)\n",
|
|
|
|
(int)files_to_delete.size()
|
2008-10-28 21:59:25 +00:00
|
|
|
);
|
2007-04-17 17:12:44 +00:00
|
|
|
while (!files_to_delete.empty()) {
|
2005-08-20 02:45:49 +00:00
|
|
|
char timestamp[128];
|
|
|
|
char pathname[1024];
|
|
|
|
int retval;
|
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
FILE_RECORD fr = files_to_delete.front();
|
|
|
|
check_stop_daemons();
|
2005-08-20 02:45:49 +00:00
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
retval = get_file_path(
|
2005-10-23 07:19:03 +00:00
|
|
|
fr.name.c_str(), config.upload_dir,
|
2005-08-25 23:07:58 +00:00
|
|
|
config.uldl_dir_fanout, pathname
|
|
|
|
);
|
2005-08-20 02:45:49 +00:00
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"get_file_path(%s) failed: %s\n", fr.name.c_str(), boincerror(retval)
|
2005-08-20 02:45:49 +00:00
|
|
|
);
|
2008-10-28 21:59:25 +00:00
|
|
|
return retval;
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
strcpy(timestamp, time_to_string(fr.date_modified));
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2005-08-20 02:45:49 +00:00
|
|
|
"deleting [antique %s] %s\n",
|
2007-04-17 17:12:44 +00:00
|
|
|
timestamp, pathname
|
2005-08-20 02:45:49 +00:00
|
|
|
);
|
|
|
|
if (unlink(pathname)) {
|
2005-08-25 23:07:58 +00:00
|
|
|
int save_error=errno;
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2005-08-25 23:07:58 +00:00
|
|
|
"unlink(%s) failed: %s\n",
|
|
|
|
pathname, strerror(save_error)
|
2005-08-20 02:45:49 +00:00
|
|
|
);
|
2008-10-28 21:59:25 +00:00
|
|
|
return retval;
|
2005-08-25 23:07:58 +00:00
|
|
|
} else {
|
2005-08-20 02:45:49 +00:00
|
|
|
nfiles++;
|
2005-08-25 23:07:58 +00:00
|
|
|
files_to_delete.pop_front();
|
|
|
|
}
|
2007-04-17 17:12:44 +00:00
|
|
|
}
|
2008-10-28 21:59:25 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
|
|
|
"delete_antique_files(): done, deleted %d files\n", nfiles
|
|
|
|
);
|
|
|
|
return 0;
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-04-17 17:12:44 +00:00
|
|
|
// construct a list "file_to_delete" of old files.
|
|
|
|
// Return number of files added to list, or negative for error.
|
2005-08-20 02:45:49 +00:00
|
|
|
//
|
|
|
|
int add_antiques_to_list(int days) {
|
|
|
|
char command[256];
|
|
|
|
char single_line[1024];
|
|
|
|
FILE *fp;
|
|
|
|
int dirlen=strlen(config.upload_dir);
|
2007-06-27 20:09:18 +00:00
|
|
|
struct passwd *apache_info=getpwnam(config.httpd_user);
|
2005-08-20 02:45:49 +00:00
|
|
|
int del_time=time(0)-86400*days;
|
|
|
|
int nfiles=0;
|
|
|
|
|
|
|
|
if (!apache_info) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-01-12 12:31:23 +00:00
|
|
|
"default httpd_user '%s' found - add <httpd_user> entry in config.xml\n",
|
2008-03-03 19:33:32 +00:00
|
|
|
config.httpd_user
|
2005-09-26 23:28:48 +00:00
|
|
|
);
|
2005-08-25 23:07:58 +00:00
|
|
|
return -1;
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
2008-08-06 20:31:48 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
|
|
|
"Searching for antique files older than %d days\n", days
|
|
|
|
);
|
2005-08-20 02:45:49 +00:00
|
|
|
|
2010-01-12 12:31:23 +00:00
|
|
|
sprintf(command, "find %s -type f -mtime +%d -follow | head -%d", config.upload_dir, days, antique_limit);
|
2005-08-20 02:45:49 +00:00
|
|
|
|
|
|
|
// Now execute the command, read output on a stream. We could use
|
|
|
|
// find to also exec a 'delete' command. But we want to log all
|
|
|
|
// file names into the log, and do lots of sanity checking, so
|
|
|
|
// this way is better.
|
|
|
|
//
|
|
|
|
if (!(fp=popen(command, "r"))) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2005-09-26 23:28:48 +00:00
|
|
|
"command %s failed\n", command
|
|
|
|
);
|
2005-08-20 02:45:49 +00:00
|
|
|
return -2;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (fgets(single_line, 1024, fp)) {
|
|
|
|
char pathname[1024];
|
2005-08-25 23:07:58 +00:00
|
|
|
char *fname_at_end=NULL;
|
|
|
|
int nchars=strlen(single_line);
|
|
|
|
struct stat statbuf;
|
|
|
|
const char *err=NULL;
|
2006-11-19 11:39:01 +00:00
|
|
|
FILE_RECORD fr;
|
2005-08-20 02:45:49 +00:00
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
// We can interrupt this at any point.
|
|
|
|
// pclose() is called when process exits.
|
2005-08-20 02:45:49 +00:00
|
|
|
check_stop_daemons();
|
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
// Do serious sanity checking on the path before
|
|
|
|
// adding the file!!
|
|
|
|
//
|
|
|
|
if (!err && nchars > 1022) err="line too long";
|
|
|
|
if (!err && nchars < dirlen + 1) err="path shorter than upload directory name";
|
|
|
|
if (!err && single_line[nchars-1] != '\n') err="no newline terminator in line";
|
|
|
|
if (!err && strncmp(config.upload_dir, single_line, dirlen)) err="upload directory not in path";
|
|
|
|
if (!err && single_line[dirlen] != '/') err="no slash separator in path";
|
|
|
|
if (!err) single_line[nchars-1]='\0';
|
|
|
|
if (!err && stat(single_line, &statbuf)) err="stat failed";
|
|
|
|
if (!err && statbuf.st_mtime > del_time) err="file too recent";
|
|
|
|
if (!err && apache_info->pw_uid != statbuf.st_uid) err="file not owned by httpd user";
|
|
|
|
if (!err && !(fname_at_end=rindex(single_line+dirlen, '/'))) err="no trailing filename";
|
|
|
|
if (!err) fname_at_end++;
|
|
|
|
if (!err && !strlen(fname_at_end)) err="trailing filename too short";
|
2006-11-19 11:39:01 +00:00
|
|
|
|
|
|
|
// skip NFS file system markers of form .nfs*
|
|
|
|
//
|
|
|
|
if (!err && !strncmp(fname_at_end, ".nfs", 4)) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2006-11-19 11:39:01 +00:00
|
|
|
"Ignoring antique (stale) NFS lockfile %s\n", single_line
|
|
|
|
);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
if (!err && get_file_path(fname_at_end, config.upload_dir, config.uldl_dir_fanout, pathname)) err="get_file_path() failed";
|
|
|
|
if (!err && strcmp(pathname, single_line)) err="file in wrong hierarchical upload subdirectory";
|
|
|
|
|
|
|
|
if (err) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2005-08-25 23:07:58 +00:00
|
|
|
"Can't list %s for deletion: %s\n",
|
|
|
|
single_line, err
|
2005-08-20 02:45:49 +00:00
|
|
|
);
|
2005-08-25 23:07:58 +00:00
|
|
|
// This file deleting business is SERIOUS. Give up at the
|
|
|
|
// first sign of ANYTHING amiss.
|
|
|
|
//
|
|
|
|
pclose(fp);
|
|
|
|
return -3;
|
|
|
|
}
|
2005-08-20 02:45:49 +00:00
|
|
|
|
|
|
|
// insert this file onto the list
|
2005-08-25 23:07:58 +00:00
|
|
|
fr.date_modified = statbuf.st_mtime;
|
|
|
|
fr.name = fname_at_end;
|
|
|
|
files_to_delete.push_back(fr);
|
2005-08-20 02:45:49 +00:00
|
|
|
nfiles++;
|
|
|
|
|
|
|
|
} // while (fgets(single_line, 1024, fp)) {
|
|
|
|
pclose(fp);
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2005-08-20 02:45:49 +00:00
|
|
|
"Found %d antique files to delete\n",
|
|
|
|
nfiles
|
|
|
|
);
|
2007-04-17 17:12:44 +00:00
|
|
|
files_to_delete.sort();
|
|
|
|
files_to_delete.unique();
|
2005-08-20 02:45:49 +00:00
|
|
|
return nfiles;
|
|
|
|
}
|
|
|
|
|
|
|
|
// returns number of files found & added, or negative for error.
|
|
|
|
//
|
|
|
|
int find_antique_files() {
|
|
|
|
char buf[256];
|
|
|
|
DB_WORKUNIT wu;
|
|
|
|
|
|
|
|
check_stop_daemons();
|
|
|
|
|
2007-04-17 17:12:44 +00:00
|
|
|
// Find the oldest workunit. We could add
|
|
|
|
// "where file_delete_state!=FILE_DELETE_DONE" to the query,
|
|
|
|
// but this might create some race condition
|
|
|
|
// with the 'regular' file delete mechanism,
|
|
|
|
// so better to do it like this.
|
2005-08-20 02:45:49 +00:00
|
|
|
//
|
2007-06-20 16:27:27 +00:00
|
|
|
sprintf(buf, "order by id limit 1");
|
2005-08-20 02:45:49 +00:00
|
|
|
if (!wu.enumerate(buf)) {
|
|
|
|
// Don't ever delete files younger than a month.
|
|
|
|
//
|
|
|
|
int days = 1 + (time(0) - wu.create_time)/86400;
|
|
|
|
if (days<31) days=31;
|
|
|
|
|
2005-08-25 23:07:58 +00:00
|
|
|
return add_antiques_to_list(days);
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-04-17 17:12:44 +00:00
|
|
|
void do_antique_pass() {
|
|
|
|
int retval;
|
2005-08-20 02:45:49 +00:00
|
|
|
|
|
|
|
// If any problems appear in deleting antique files
|
|
|
|
// immediately DISABLE this feature.
|
|
|
|
//
|
2007-04-17 17:12:44 +00:00
|
|
|
retval = find_antique_files();
|
|
|
|
if (retval < 0) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"Problem 1 [%s] in antique file deletion: turning OFF --delete_antiques switch\n",
|
|
|
|
boincerror(retval)
|
2006-11-19 11:39:01 +00:00
|
|
|
);
|
2007-04-17 17:12:44 +00:00
|
|
|
dont_delete_antiques = true;
|
|
|
|
return;
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
|
|
|
|
2007-04-17 17:12:44 +00:00
|
|
|
retval = delete_antique_files();
|
2008-10-28 21:59:25 +00:00
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"Problem 2 [%s] in antique file deletion: turning OFF --delete_antiques switch\n",
|
|
|
|
boincerror(retval)
|
2006-11-19 11:39:01 +00:00
|
|
|
);
|
2007-04-17 17:12:44 +00:00
|
|
|
dont_delete_antiques = true;
|
2006-11-19 11:39:01 +00:00
|
|
|
}
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
|
|
|
|
2002-12-17 19:00:43 +00:00
|
|
|
int main(int argc, char** argv) {
|
|
|
|
int retval;
|
2007-06-20 22:34:06 +00:00
|
|
|
bool one_pass = false;
|
2002-12-17 19:00:43 +00:00
|
|
|
int i;
|
2010-01-12 12:31:23 +00:00
|
|
|
DB_APP app;
|
|
|
|
|
2004-05-03 19:30:01 +00:00
|
|
|
check_stop_daemons();
|
2010-01-12 12:31:23 +00:00
|
|
|
|
|
|
|
*app.name='\0';
|
2002-12-17 19:00:43 +00:00
|
|
|
for (i=1; i<argc; i++) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
if (is_arg(argv[i], "one_pass")) {
|
2002-12-17 19:00:43 +00:00
|
|
|
one_pass = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "dont_retry_errors")) {
|
2007-04-17 17:12:44 +00:00
|
|
|
dont_retry_errors = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "preserve_wu_files")) {
|
2004-11-29 09:23:51 +00:00
|
|
|
preserve_wu_files = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "preserve_result_files")) {
|
2004-11-29 09:23:51 +00:00
|
|
|
preserve_result_files = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "app")) {
|
2010-01-12 12:31:23 +00:00
|
|
|
strcpy(app.name, argv[++i]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "appid")) {
|
|
|
|
if (!argv[++i]) {
|
2009-09-17 17:56:59 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "%s requires an argument\n\n", argv[--i]);
|
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
appid = atoi(argv[i]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "d") || is_arg(argv[i], "debug_level")) {
|
|
|
|
if (!argv[++i]) {
|
2009-09-17 17:56:59 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "%s requires an argument\n\n", argv[--i]);
|
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
2010-04-05 21:59:33 +00:00
|
|
|
int dl = atoi(argv[i]);
|
|
|
|
log_messages.set_debug_level(dl);
|
|
|
|
if (dl == 4) g_print_queries = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "mod")) {
|
|
|
|
if (!argv[i+1] || !argv[i+2]) {
|
2009-09-17 17:56:59 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "%s requires two arguments\n\n", argv[i]);
|
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
2005-07-26 22:14:55 +00:00
|
|
|
id_modulus = atoi(argv[++i]);
|
|
|
|
id_remainder = atoi(argv[++i]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "dont_delete_antiques")) {
|
2007-04-17 17:12:44 +00:00
|
|
|
dont_delete_antiques = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "delete_antiques_interval")) {
|
2010-01-12 12:31:23 +00:00
|
|
|
antique_interval = atoi(argv[++i]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "delete_antiques_limit")) {
|
2010-01-12 12:31:23 +00:00
|
|
|
antique_limit = atoi(argv[++i]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "dont_delete_batches")) {
|
2007-04-30 21:19:24 +00:00
|
|
|
dont_delete_batches = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "delete_antiques_now")) {
|
2008-08-06 20:31:48 +00:00
|
|
|
antique_delay = 0;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "input_files_only")) {
|
2008-11-03 22:56:16 +00:00
|
|
|
do_output_files = false;
|
|
|
|
dont_delete_antiques = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "output_files_only")) {
|
2008-11-03 22:56:16 +00:00
|
|
|
do_input_files = false;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "sleep_interval")) {
|
|
|
|
if (!argv[++i]) {
|
2009-09-17 17:56:59 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "%s requires an argument\n\n", argv[--i]);
|
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
sleep_interval = atoi(argv[i]);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "h") || is_arg(argv[i], "help")) {
|
2009-09-17 17:56:59 +00:00
|
|
|
usage(argv[0]);
|
|
|
|
exit(0);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "v") || is_arg(argv[i], "version")) {
|
2009-09-17 17:56:59 +00:00
|
|
|
printf("%s\n", SVN_VERSION);
|
|
|
|
exit(0);
|
2002-12-17 19:00:43 +00:00
|
|
|
} else {
|
2009-09-17 17:56:59 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "unknown command line argument: %s\n\n", argv[i]);
|
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
2002-12-17 19:00:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-07-26 22:14:55 +00:00
|
|
|
if (id_modulus) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2007-04-17 17:12:44 +00:00
|
|
|
"Using mod'ed WU/result enumeration. mod = %d rem = %d\n",
|
|
|
|
id_modulus, id_remainder
|
|
|
|
);
|
2005-07-26 22:14:55 +00:00
|
|
|
}
|
|
|
|
|
2009-05-07 13:54:51 +00:00
|
|
|
retval = config.parse_file();
|
2002-12-17 19:00:43 +00:00
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-05-07 13:54:51 +00:00
|
|
|
"Can't parse config.xml: %s\n", boincerror(retval)
|
2007-04-17 17:12:44 +00:00
|
|
|
);
|
2002-12-17 19:00:43 +00:00
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_NORMAL, "Starting\n");
|
2003-02-27 19:29:48 +00:00
|
|
|
|
2004-01-15 23:53:13 +00:00
|
|
|
retval = boinc_db.open(config.db_name, config.db_host, config.db_user, config.db_passwd);
|
2003-01-09 07:24:27 +00:00
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "can't open DB\n");
|
2003-01-09 07:24:27 +00:00
|
|
|
exit(1);
|
|
|
|
}
|
2007-02-13 21:41:07 +00:00
|
|
|
retval = boinc_db.set_isolation_level(READ_UNCOMMITTED);
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"boinc_db.set_isolation_level: %s; %s\n",
|
|
|
|
boincerror(retval), boinc_db.error_string()
|
2007-02-13 21:41:07 +00:00
|
|
|
);
|
|
|
|
}
|
2010-01-12 12:31:23 +00:00
|
|
|
|
|
|
|
if (*app.name && !appid) {
|
|
|
|
char buf[256];
|
|
|
|
sprintf(buf, "where name='%s'", app.name);
|
|
|
|
retval = app.lookup(buf);
|
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL, "Can't find app\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
appid=app.id;
|
|
|
|
log_messages.printf(MSG_DEBUG, "Deleting files of appid %d\n",appid);
|
|
|
|
}
|
|
|
|
|
2003-12-31 23:09:21 +00:00
|
|
|
install_stop_signal_handler();
|
2007-04-17 17:12:44 +00:00
|
|
|
|
|
|
|
bool retry_errors_now = !dont_retry_errors;
|
|
|
|
double next_error_time=0;
|
2008-08-06 20:31:48 +00:00
|
|
|
double next_antique_time = dtime() + antique_delay;
|
2005-07-17 19:52:44 +00:00
|
|
|
while (1) {
|
2007-04-17 17:12:44 +00:00
|
|
|
bool got_any = do_pass(false);
|
|
|
|
if (retry_errors_now) {
|
|
|
|
bool got_any_errors = do_pass(true);
|
|
|
|
if (got_any_errors) {
|
|
|
|
got_any = true;
|
|
|
|
} else {
|
|
|
|
retry_errors_now = false;
|
|
|
|
next_error_time = dtime() + ERROR_INTERVAL;
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2007-04-17 17:12:44 +00:00
|
|
|
"ending retry of previous errors\n"
|
|
|
|
);
|
2005-08-20 02:45:49 +00:00
|
|
|
}
|
2002-12-17 19:00:43 +00:00
|
|
|
}
|
2007-04-17 17:12:44 +00:00
|
|
|
if (!got_any) {
|
|
|
|
if (one_pass) break;
|
2008-11-26 19:09:27 +00:00
|
|
|
sleep(sleep_interval);
|
2007-04-17 17:12:44 +00:00
|
|
|
}
|
|
|
|
if (!dont_delete_antiques && (dtime() > next_antique_time)) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2007-04-17 17:12:44 +00:00
|
|
|
"Doing antique deletion pass\n"
|
|
|
|
);
|
|
|
|
do_antique_pass();
|
2010-01-12 12:31:23 +00:00
|
|
|
next_antique_time = dtime() + antique_interval;
|
2007-04-17 17:12:44 +00:00
|
|
|
}
|
|
|
|
if (!dont_retry_errors && !retry_errors_now && (dtime() > next_error_time)) {
|
|
|
|
retry_errors_now = true;
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_DEBUG,
|
2007-04-17 17:12:44 +00:00
|
|
|
"starting retry of previous errors\n"
|
|
|
|
);
|
|
|
|
}
|
2002-12-17 19:00:43 +00:00
|
|
|
}
|
|
|
|
}
|
2004-12-08 00:40:19 +00:00
|
|
|
|
2005-01-02 18:29:53 +00:00
|
|
|
// Version-control identification string for this file. "$Id$" is an
// RCS/SVN keyword placeholder (presumably expanded by the repository on
// checkout when keyword substitution is enabled -- confirm).
const char *BOINC_RCSID_bd0d4938a6 = "$Id$";
|