boinc/sched/db_purge.C

578 lines
17 KiB
C

/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
// The contents of this file are subject to the BOINC Public License
// Version 1.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://boinc.berkeley.edu/license_1.0.txt
//
// Software distributed under the License is distributed on an "AS IS"
// basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
// License for the specific language governing rights and limitations
// under the License.
//
// The Original Code is the Berkeley Open Infrastructure for Network Computing.
//
// The Initial Developer of the Original Code is the SETI@home project.
// Portions created by the SETI@home project are Copyright (C) 2002
// University of California at Berkeley. All Rights Reserved.
//
// Contributor(s):
//
// db_purge:
// purge workunit and result records that are no longer needed from
// the database
#include <cstdio>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <string>
#include <time.h>
#include <errno.h>
using namespace std;
#include "boinc_db.h"
#include "util.h"
#include "parse.h"
#include "sched_config.h"
#include "sched_util.h"
#include "sched_msgs.h"
#include "error_numbers.h"
#define LOCKFILE "db_purge.out"
#define PIDFILE "db_purge.pid"
#define WU_FILENAME_PREFIX "wu_archive"
#define RESULT_FILENAME_PREFIX "result_archive"
#define WU_INDEX_FILENAME_PREFIX "wu_index"
#define RESULT_INDEX_FILENAME_PREFIX "result_index"
#define DB_QUERY_LIMIT 1000
SCHED_CONFIG config;
FILE *wu_stream=NULL;
FILE *re_stream=NULL;
FILE *wu_index_stream=NULL;
FILE *re_index_stream=NULL;
int time_int=0;
// These is used if limiting the total number of workunits to eliminate
int purged_workunits= 0;
// If non-negative, maximum number of workunits to purge. Since all
// results associated with a purged workunit are also purged, this
// also limits the number of purged results.
int max_number_workunits_to_purge=-1;
// set on command line if compression of archives is desired
#define COMPRESSION_NONE 0
#define COMPRESSION_GZIP 1
#define COMPRESSION_ZIP 2
// subscripts MUST be in agreement with defines above
char *suffix[3]={"", ".gz", ".zip"};
// default is no compression
int compress=COMPRESSION_NONE;
// set on command line if archive files should be closed and re-opened
// after getting some max no of WU in the file
int max_wu_per_file=0;
// keep track of how many WU archived in file so far
int wu_stored_in_file=0;
// this opens the archive. Only subtle thing is that if the user has
// asked for compression, then we popen(2) a pipe to gzip or zip.
// This does 'in place' compression.
//
void open_archive(char* filename_prefix, FILE*& f){
char path[256];
char command[512];
// append appropriate suffix for file type
sprintf(path, "../archives/%s_%d.xml", filename_prefix, time_int);
strcat(path, suffix[compress]);
// and construct appropriate command if needed
if (compress==COMPRESSION_GZIP)
sprintf(command, "gzip - > %s", path);
if (compress==COMPRESSION_ZIP)
sprintf(command, "zip %s -", path);
log_messages.printf(SCHED_MSG_LOG::NORMAL, "Opening archive %s\n", path);
// in the case with no compression, just open the file, else open
// a pipe to the compression executable.
if (compress==COMPRESSION_NONE) {
if (!(f = fopen( path,"w"))) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,"Can't open archive file %s %s\n",
path, errno?strerror(errno):"");
exit(3);
}
}
else if (!(f = popen(command,"w"))) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,"Can't open pipe %s %s\n",
command, errno?strerror(errno):"");
exit(4);
}
// set buffering to line buffered, since we are outputing XML on a
// line-by-line basis.
//
setlinebuf(f);
return;
}
void close_archive(char *filename, FILE*& fp){
char path[256];
// Set file pointer to NULL after closing file to indicate that
// it's closed.
if (!fp)
return;
// In case of errors, carry on anyway. This is deliberate, not lazy
if (compress==COMPRESSION_NONE)
fclose(fp);
else
pclose(fp);
fp=NULL;
// append appropriate file type
sprintf(path, "../archives/%s_%d.xml", filename, time_int);
strcat(path, suffix[compress]);
log_messages.printf(SCHED_MSG_LOG::NORMAL,
"Closed archive file %s containing records of %d workunits\n",
path, wu_stored_in_file);
return;
}
// opens the various archive files. Guarantees that the timestamp
// does not equal the previous timestamp
//
void open_all_archives() {
int old_time=time_int;
// make sure we get a NEW value of the file timestamp!
while (old_time == (time_int = (int)time(0)))
sleep(1);
// open all the archives.
open_archive(WU_FILENAME_PREFIX, wu_stream);
open_archive(RESULT_FILENAME_PREFIX, re_stream);
open_archive(RESULT_INDEX_FILENAME_PREFIX, re_index_stream);
open_archive(WU_INDEX_FILENAME_PREFIX, wu_index_stream);
return;
}
// closes (and optionally compresses) the archive files. Clears file
// pointers to indicate that files are not open.
//
void close_all_archives() {
close_archive(WU_FILENAME_PREFIX, wu_stream);
close_archive(RESULT_FILENAME_PREFIX, re_stream);
close_archive(RESULT_INDEX_FILENAME_PREFIX, re_index_stream);
close_archive(WU_INDEX_FILENAME_PREFIX, wu_index_stream);
log_messages.printf(SCHED_MSG_LOG::NORMAL,
"Closed archive files with %d workunits\n",
wu_stored_in_file
);
return;
}
// The exit handler always calls this at the end to be sure that the
// database is closed cleanly.
void close_db_exit_handler() {
boinc_db.close();
return;
}
int archive_result(DB_RESULT& result) {
fprintf(re_stream,
"<result_archive>\n"
" <id>%d</id>\n",
result.id
);
string r1, r2;
r1= result.stderr_out;
xml_escape(r1, r2);
fprintf(
re_stream,
" <create_time>%d</create_time>\n"
" <workunitid>%d</workunitid>\n"
" <server_state>%d</server_state>\n"
" <outcome>%d</outcome>\n"
" <client_state>%d</client_state>\n"
" <hostid>%d</hostid>\n"
" <userid>%d</userid>\n"
" <report_deadline>%d</report_deadline>\n"
" <sent_time>%d</sent_time>\n"
" <received_time>%d</received_time>\n"
" <name>%s</name>\n"
" <cpu_time>%.15e</cpu_time>\n"
" <xml_doc_in>%s</xml_doc_in>\n"
" <xml_doc_out>%s</xml_doc_out>\n"
" <stderr_out>%s</stderr_out>\n"
" <batch>%d</batch>\n"
" <file_delete_state>%d</file_delete_state>\n"
" <validate_state>%d</validate_state>\n"
" <claimed_credit>%.15e</claimed_credit>\n"
" <granted_credit>%.15e</granted_credit>\n"
" <opaque>%f</opaque>\n"
" <random>%d</random>\n"
" <app_version_num>%d</app_version_num>\n"
" <appid>%d</appid>\n"
" <exit_status>%d</exit_status>\n"
" <teamid>%d</teamid>\n"
" <priority>%d</priority>\n"
" <mod_time>%s</mod_time>\n",
result.create_time,
result.workunitid,
result.server_state,
result.outcome,
result.client_state,
result.hostid,
result.userid,
result.report_deadline,
result.sent_time,
result.received_time,
result.name,
result.cpu_time,
result.xml_doc_in,
result.xml_doc_out,
r2.c_str(),
result.batch,
result.file_delete_state,
result.validate_state,
result.claimed_credit,
result.granted_credit,
result.opaque,
result.random,
result.app_version_num,
result.appid,
result.exit_status,
result.teamid,
result.priority,
result.mod_time
);
fprintf(re_stream,
"</result_archive>\n"
);
fprintf(re_index_stream,
"%d %d\n",
result.id, time_int
);
return 0;
}
int archive_wu(DB_WORKUNIT& wu) {
fprintf(wu_stream,
"<workunit_archive>\n"
" <id>%d</id>\n",
wu.id
);
fprintf(wu_stream,
" <create_time>%d</create_time>\n"
" <appid>%d</appid>\n"
" <name>%s</name>\n"
" <xml_doc>%s</xml_doc>\n"
" <batch>%d</batch>\n"
" <rsc_fpops_est>%.15e</rsc_fpops_est>\n"
" <rsc_fpops_bound>%.15e</rsc_fpops_bound>\n"
" <rsc_memory_bound>%.15e</rsc_memory_bound>\n"
" <rsc_disk_bound>%.15e</rsc_disk_bound>\n"
" <need_validate>%d</need_validate>\n"
" <canonical_resultid>%d</canonical_resultid>\n"
" <canonical_credit>%.15e</canonical_credit>\n"
" <transition_time>%d</transition_time>\n"
" <delay_bound>%d</delay_bound>\n"
" <error_mask>%d</error_mask>\n"
" <file_delete_state>%d</file_delete_state>\n"
" <assimilate_state>%d</assimilate_state>\n"
" <hr_class>%d</hr_class>\n"
" <opaque>%f</opaque>\n"
" <min_quorum>%d</min_quorum>\n"
" <target_nresults>%d</target_nresults>\n"
" <max_error_results>%d</max_error_results>\n"
" <max_total_results>%d</max_total_results>\n"
" <max_success_results>%d</max_success_results>\n"
" <result_template_file>%s</result_template_file>\n"
" <priority>%d</priority>\n"
" <mod_time>%s</mod_time>\n",
wu.create_time,
wu.appid,
wu.name,
wu.xml_doc,
wu.batch,
wu.rsc_fpops_est,
wu.rsc_fpops_bound,
wu.rsc_memory_bound,
wu.rsc_disk_bound,
wu.need_validate,
wu.canonical_resultid,
wu.canonical_credit,
wu.transition_time,
wu.delay_bound,
wu.error_mask,
wu.file_delete_state,
wu.assimilate_state,
wu.hr_class,
wu.opaque,
wu.min_quorum,
wu.target_nresults,
wu.max_error_results,
wu.max_total_results,
wu.max_success_results,
wu.result_template_file,
wu.priority,
wu.mod_time
);
fprintf(wu_stream,
"</workunit_archive>\n"
);
fprintf(wu_index_stream,
"%d %d\n",
wu.id, time_int
);
return 0;
}
int purge_and_archive_results(DB_WORKUNIT& wu, int& number_results) {
int retval= 0;
DB_RESULT result;
char buf[256];
number_results=0;
sprintf(buf, "where workunitid=%d", wu.id);
while (!result.enumerate(buf)) {
retval= archive_result(result);
if (retval) return retval;
log_messages.printf(SCHED_MSG_LOG::DEBUG,"Archived result [%d] to a file\n", result.id);
retval= result.delete_from_db();
if (retval) return retval;
log_messages.printf(SCHED_MSG_LOG::DEBUG,"Purged result [%d] from database\n", result.id);
number_results++;
}
return 0;
}
// return nonzer if did anything
//
bool do_pass() {
int retval= 0;
// The number of workunits/results purged in a single pass of
// do_pass(). Since do_pass() may be invoked multiple times,
// corresponding global variables store global totals.
//
int do_pass_purged_workunits = 0;
int do_pass_purged_results = 0;
// check to see if we got a stop signal. Note that if we do catch
// a stop signal here, we call an exit handler that closes [and
// optionally compresses] files before returning to the OS.
//
check_stop_daemons();
bool did_something = false;
DB_WORKUNIT wu;
char buf[256];
// select all workunits with file_delete_state='DONE'
//
sprintf(buf, "where file_delete_state=%d limit %d", FILE_DELETE_DONE,
DB_QUERY_LIMIT);
int n=0; // cleared in purge_and_archive_results, but avoid
// compiler complaints about uninitialized
while (!wu.enumerate(buf)) {
did_something = true;
// if archives have not already been opened, then open them.
if (!wu_stream)
open_all_archives();
retval = purge_and_archive_results(wu, n);
do_pass_purged_results += n;
retval= archive_wu(wu);
if (retval) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,
"Failed to write to XML file workunit:%d\n", wu.id
);
exit(5);
}
log_messages.printf(SCHED_MSG_LOG::DEBUG,"Archived workunit [%d] to a file\n", wu.id);
//purge workunit from DB
retval= wu.delete_from_db();
if (retval) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,"Can't delete workunit [%d] from database:%d\n", wu.id, retval);
exit(6);
}
log_messages.printf(SCHED_MSG_LOG::DEBUG,"Purged workunit [%d] from database\n", wu.id);
purged_workunits++;
do_pass_purged_workunits++;
wu_stored_in_file++;
// flush the various output files.
fflush(NULL);
// if file has got max # of workunits, close and compress it.
// This sets file pointers to NULL
if (max_wu_per_file && wu_stored_in_file>=max_wu_per_file) {
close_all_archives();
wu_stored_in_file=0;
}
if (max_number_workunits_to_purge>=0 && purged_workunits >= max_number_workunits_to_purge)
break;
}
log_messages.printf(SCHED_MSG_LOG::NORMAL,
"Archived %d workunits and %d results\n",
do_pass_purged_workunits,do_pass_purged_results
);
if (wu_stored_in_file>0) {
log_messages.printf(SCHED_MSG_LOG::DEBUG,
"Currently open archive files contain %d workunits\n",
wu_stored_in_file);
}
return did_something;
}
int main(int argc, char** argv) {
int retval;
bool asynch = false, one_pass = false;
int i;
check_stop_daemons();
for (i=1; i<argc; i++) {
if (!strcmp(argv[i], "-asynch")) {
asynch = true;
} else if (!strcmp(argv[i], "-one_pass")) {
one_pass = true;
} else if (!strcmp(argv[i], "-d")) {
log_messages.set_debug_level(atoi(argv[++i]));
} else if (!strcmp(argv[i], "-max")) {
// This is the limit on the maximum number of workunits to
// purge. Since all results associated with those WU are
// also purged, it indirectly controls that number as
// well.
max_number_workunits_to_purge= atoi(argv[++i]);
} else if (!strcmp(argv[i], "-zip")) {
// compress output files using zip. If used with
// -max_wu_per_file then the files get compressed after
// being closed. In any case the files are compressed
// when db_purge exits on a signal.
compress=COMPRESSION_ZIP;
// compress output files using gzip. If used with
// -max_wu_per_file then the files get compressed after
// being closed. In any case the files are compressed
// when db_purge exits on a signal.
} else if (!strcmp(argv[i], "-gzip")) {
compress=COMPRESSION_GZIP;
} else if (!strcmp(argv[i], "-max_wu_per_file")) {
// This is the limit on the maximum number of workunits to
// purge into a given file. The file is then closed and
// another file is opened. This can be used to get a
// series of bite-sized files instead of one huge file.
max_wu_per_file = atoi(argv[++i]);
} else {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,
"Unrecognized arg: %s\n",
argv[i]
);
}
}
retval = config.parse_file("..");
if (retval) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL,
"Can't parse config file\n"
);
exit(1);
}
if (asynch) {
if (fork()) {
exit(0);
}
}
log_messages.printf(SCHED_MSG_LOG::NORMAL, "Starting\n");
retval = boinc_db.open(config.db_name, config.db_host, config.db_user,
config.db_passwd);
if (retval) {
log_messages.printf(SCHED_MSG_LOG::CRITICAL, "Can't open DB\n");
exit(2);
}
install_stop_signal_handler();
//time_t now;
//struct tm tim;
//size_t i;
//now= time(0);
//tim= *(localtime(&now));
//i= strftime(time_int,30,"%b%d_%Y_%H:%M:%S", &tim);
mkdir("../archives", 0777);
// on exit, either via the check_stop_daemons signal handler, or
// through a regular call to exit, these functions will be called
// in the opposite order of registration.
atexit(close_db_exit_handler);
atexit(close_all_archives);
if (one_pass) {
do_pass();
} else {
while (1) {
if (max_number_workunits_to_purge>=0 && purged_workunits >= max_number_workunits_to_purge)
break;
if (!do_pass())
sleep(10);
}
}
// files and database are closed by exit handler
exit(0);
}
#ifdef __GNUC__
static volatile const char __attribute__((unused)) *BOINCrcsid="$Id$";
#else
static volatile const char *BOINCrcsid="$Id$";
#endif