watchdogs

svn path=/trunk/boinc/; revision=990
This commit is contained in:
David Anderson 2003-02-28 22:39:36 +00:00
parent 73d1cf466e
commit bd4dfc7df1
6 changed files with 130 additions and 6 deletions

View File

@ -54,7 +54,8 @@ PROJECT::PROJECT() {
master_fetch_failures = 0;
resource_debt = 0;
debt_order = 0;
master_url_fetch_pending = 0;
master_url_fetch_pending = false;
sched_rpc_pending = false;
}
PROJECT::~PROJECT() {

View File

@ -128,8 +128,7 @@ PROJECT* CLIENT_STATE::next_project(PROJECT* old) {
pbest = 0;
for (i=0; i<projects.size(); i++) {
p = projects[i];
if (p->master_url_fetch_pending)
continue;
if (p->master_url_fetch_pending) continue;
if (p->min_rpc_time > now ) continue;
if (old && p->debt_order <= old->debt_order) continue;
if (p->debt_order < best) {

View File

@ -309,6 +309,7 @@ bool SCHEDULER_OP::poll() {
if (http_op.http_op_state == HTTP_STATE_DONE) {
action = true;
project->master_url_fetch_pending = false;
gstate.set_client_state_dirty("master URL fetch done");
http_ops->remove(&http_op);
if (http_op.http_op_retval == 0) {
if (log_flags.sched_op_debug) {

View File

@ -45,7 +45,23 @@ using namespace std;
// Return true if the WU can be executed on the host.
// The WU's disk and memory requirements are compared against the host's
// free disk (d_free) and memory size (m_nbytes).
// A host value of zero skips the corresponding check
// (NOTE(review): presumably zero means "not reported" - confirm).
// Each rejection is recorded with write_log() before returning false.
//
bool wu_is_feasible(WORKUNIT& wu, HOST& host) {
    char buf[256];

    if (host.d_free && wu.rsc_disk > host.d_free) {
        // snprintf: a large %f value must not overrun buf
        snprintf(buf, sizeof(buf), "WU %d needs %f disk; host %d has %f\n",
            wu.id, wu.rsc_disk, host.id, host.d_free
        );
        write_log(buf);
        return false;
    }
    if (host.m_nbytes && wu.rsc_memory > host.m_nbytes) {
        snprintf(buf, sizeof(buf), "WU %d needs %f mem; host %d has %f\n",
            wu.id, wu.rsc_memory, host.id, host.m_nbytes
        );
        write_log(buf);
        return false;
    }
    return true;
}
// estimate the time that a WU will take on a host
@ -177,7 +193,8 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
);
strcpy(reply.message_priority, "low");
reply.request_delay = 120;
write_log("bad authenticator\n");
sprintf(buf, "Bad authenticator: %s\n", sreq.authenticator);
write_log(buf);
return -1;
}
@ -202,7 +219,8 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
"Visit this project's web site to get an authenticator.");
strcpy(reply.message_priority, "low");
reply.request_delay = 120;
write_log("bad authenticator\n");
sprintf(buf, "Bad authenticator: %s\n", sreq.authenticator);
write_log(buf);
return -1;
}
new_host:

40
sched/wd.php Executable file
View File

@ -0,0 +1,40 @@
#! /usr/local/bin/php
<?php
// General-purpose watchdog script.
// Run this from crontab.
// We use the mod time of a file "watchdog_exec_time"
// to keep track of the last time we ran.
// BOINC uses a number of "error log files".
// If any error log file has been updated since the last time we ran,
// sound the alarm.
// TODO: to detect file system full errors,
// have this program attempt to create/read a file.
// Report a problem.
// $x: text describing the problem.
// The report is written to standard output as "alarm: <text>";
// notification by email is currently disabled.
function sound_alarm($x) {
    print("alarm: ".$x."\n");
    //email("davea@ssl.berkeley.edu", "BOINC problem", $x);
}
// Sound the alarm if a log file is missing,
// or if it has been modified since the last watchdog run.
// $file: path of the log file
// $last_time: time of the previous run (Unix timestamp)
// When the file has been modified, its last line is used as the alarm text.
function check_log_file($file, $last_time) {
    // check existence first so a missing file doesn't produce a PHP warning
    $t = file_exists($file) ? filemtime($file) : false;

    // strict comparison: a mod time of 0 is a valid time, not "missing"
    if ($t === false) {
        sound_alarm("log file ".$file." missing");
        return;
    }
    if ($t <= $last_time) {
        return;
    }

    $lines = file($file);
    if ($lines === false) {
        sound_alarm("log file ".$file." updated but unreadable");
    } else if (count($lines) == 0) {
        // modified but holds no lines; there is no last line to report
        sound_alarm("log file ".$file." updated but empty");
    } else {
        sound_alarm($lines[count($lines)-1]);
    }
}
// The time of the previous run is the mod time of "watchdog_exec_time".
// If that file is missing (e.g. on the first run) report it
// and use time 0, so that every existing log file counts as updated.
$last_time = file_exists("watchdog_exec_time") ? filemtime("watchdog_exec_time") : false;
if (!$last_time) {
    sound_alarm("watchdog_exec_time missing or unreadable");
    $last_time = 0;
}

// Record this run before scanning:
// a log written during the scan is then newer than the marker
// and will still be seen by the next run.
touch("watchdog_exec_time");

check_log_file("error_log", $last_time);
?>

65
sched/wd_nresults_changing.php Executable file
View File

@ -0,0 +1,65 @@
#! /usr/local/bin/php
<?php
// watchdog script to ensure that the number of result records
// increases at least every X seconds (X = crontab period)
include_once("util.inc");
// Record a failure and terminate the script.
// $x: text describing the failure.
// Appends "[HH:MM:SS MM/DD/YY] <text>" to the file "error_log".
// This function never returns: the script exits with status 1
// even if "error_log" can't be opened.
function fail($x) {
    // one entry per line, so that entries don't run together in the log
    if (substr($x, -1) != "\n") {
        $x .= "\n";
    }

    $f = fopen("error_log", "a");
    if ($f) {
        // date("H:i:s m/d/y") yields the same text as strftime("%T %D")
        fputs($f, "[".date("H:i:s m/d/y")."] ");
        fputs($f, $x);
        fclose($f);
    } else {
        // can't log the failure; at least show it in the script output
        echo "can't open error_log: ".$x;
    }
    exit(1);
}
// Get the result count saved by the previous run.
// Returns the first line of the file "nresults" as a string,
// or false if the file doesn't exist, can't be read, or is empty.
function read_count_file() {
    $contents = file_exists("nresults") ? file("nresults") : false;
    return $contents ? $contents[0] : false;
}
// Save a result count for the next run.
// $n: the count; it is written to the file "nresults" as a decimal integer,
// replacing the previous contents.
// Returns true on success, false if the file can't be opened.
function write_count_file($n) {
    $handle = fopen("nresults", "w");
    if ($handle) {
        fwrite($handle, sprintf("%d", $n));
        fclose($handle);
        return true;
    }
    return false;
}
// Ask the database how many records the "result" table holds.
// Returns the first column of the first row of the query result,
// or false if the query fails.
function get_count_from_db() {
    $res = mysql_query("select count(*) from result");
    if ($res) {
        $row = mysql_fetch_array($res);
        mysql_free_result($res);
        return $row[0];
    }
    return false;
}
// NOTE(review): assumes db_init() (from util.inc) returns a true value
// on success - confirm.
$retval = db_init();
if (!$retval) {
    fail("Can't open database\n");
    exit(1);    // in case fail() returned
}

// $m: current number of result records
$m = get_count_from_db();
echo "Count: $m\n";
// strict test: a count of 0 is a valid answer, not a DB failure
if ($m === false || $m === null) {
    fail("Can't get result count from DB\n");
    exit(1);    // in case fail() returned
}

// $n: number of result records seen by the previous run
$n = read_count_file();
echo "Count: $n\n";
// No usable saved count (first run, or unreadable/garbled file):
// save the current count as the baseline and stop.
// A saved count of 0 is a valid baseline.
if ($n === false || !is_numeric(trim($n))) {
    write_count_file($m);
    exit();
}

// compare as integers so stray whitespace in the file doesn't matter
if ((int)trim($n) === (int)$m) {
    echo "fail\n";
    fail("Result count hasn't changed\n");
    exit(1);    // in case fail() returned
}
write_count_file($m);
?>