diff --git a/checkin_notes b/checkin_notes index fa534a85fe..1428abfea4 100755 --- a/checkin_notes +++ b/checkin_notes @@ -5257,3 +5257,22 @@ David 31 May 2006 boinc_db.C sched/ sched_config.C + +Bruce 31 May 2006 + + - scheduler: do a better job of accepting results from host machines + which may modify an EXISTING & OVER result in the database. + + - file_upload_handler: check for 'stop_upload' trigger file in the + project root directory. If so, return ERR_TRANSIENT to hosts which + attempt uploads. This is the equivalent of 'stop_sched' for the + scheduler. (Eric, you may want to check that this is FCGI compatible). + + db/ + boinc_db.C + + sched/ + handle_request.C + file_upload_handler.C + sched_config.C + diff --git a/db/boinc_db.C b/db/boinc_db.C index d9c0ec1e17..c1fe94b918 100644 --- a/db/boinc_db.C +++ b/db/boinc_db.C @@ -1308,6 +1308,7 @@ void SCHED_RESULT_ITEM::parse(MYSQL_ROW& r) { userid = atoi(r[i++]); sent_time = atoi(r[i++]); received_time = atoi(r[i++]); + validate_state = atoi(r[i++]); } int DB_SCHED_RESULT_ITEM_SET::add_result(char* result_name) { @@ -1336,7 +1337,8 @@ int DB_SCHED_RESULT_ITEM_SET::enumerate() { " hostid, " " userid, " " sent_time, " - " received_time " + " received_time, " + " validate_state " "FROM " " result " "WHERE " diff --git a/sched/file_upload_handler.C b/sched/file_upload_handler.C index 7f94c41127..98fdcaac36 100644 --- a/sched/file_upload_handler.C +++ b/sched/file_upload_handler.C @@ -37,6 +37,7 @@ #include "parse.h" #include "util.h" #include "error_numbers.h" +#include "filesys.h" #include "sched_config.h" #include "sched_util.h" @@ -582,6 +583,11 @@ int main() { log_messages.pid = getpid(); log_messages.set_debug_level(DEBUG_LEVEL); + if (boinc_file_exists("../stop_upload")) { + return_error(ERR_TRANSIENT, "Maintenance underway: file uploads are temporarily disabled."); + exit(1); + } + retval = config.parse_file(".."); if (retval) { exit(1); diff --git a/sched/handle_request.C b/sched/handle_request.C index 40e704240f..3d4459c9b9 100644 --- a/sched/handle_request.C +++ b/sched/handle_request.C @@ -568,7 +568,7 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { // read results from database into "result_handler". // Quantities that must be read from the DB are those // where srip (see below) appears as an rval. - // These are: id, name, server_state, received_time, hostid. + // These are: id, name, server_state, received_time, hostid, validate_state. // Quantities that must be written to the DB are those for // which srip appears as an lval. These are: // hostid, teamid, received_time, client_state, cpu_time, exit_status, @@ -616,18 +616,64 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { // If one of them fails, set srip->id = 0, // which suppresses the DB update later on // - if (srip->server_state == RESULT_SERVER_STATE_OVER && srip->outcome != RESULT_OUTCOME_NO_REPLY) { - log_messages.printf( - SCHED_MSG_LOG::MSG_CRITICAL, - "[HOST#%d] [RESULT#%d %s] result already over\n", - reply.host.id, srip->id, srip->name - ); - // perhaps also send a message to the user saying - // that this result was already over? - srip->id = 0; - reply.result_acks.push_back(std::string(rp->name)); - continue; - } + + // If result is ALREADY over, do we replace it?? + // + if (srip->server_state == RESULT_SERVER_STATE_OVER) { + char *dont_replace_result = NULL; + switch (srip->outcome) { + case RESULT_OUTCOME_INIT: + // should never happen! + dont_replace_result = "server shows no record of having sent this work"; + break; + case RESULT_OUTCOME_SUCCESS: + // don't replace a successful result! + dont_replace_result = "successful result already reported for this result"; + break; + case RESULT_OUTCOME_COULDNT_SEND: + // should never happen! + dont_replace_result = "server records show that this work was not sent (couldn't send)"; + break; + case RESULT_OUTCOME_CLIENT_ERROR: + // result was previously cancelled on server side. + // keep this new, real result ONLY if validator has + // not already been invoked. + if (srip->validate_state != VALIDATE_STATE_INIT) { + dont_replace_result = "previous result reported as error, or canceled on server"; + } + break; + case RESULT_OUTCOME_NO_REPLY: + // result is late in arriving, but keep it anyhow + break; + case RESULT_OUTCOME_DIDNT_NEED: + // should never happen + dont_replace_result = "server records show that this work was not sent (not needed)"; + break; + case RESULT_OUTCOME_VALIDATE_ERROR: + // we already passed through the validator, so + // don't keep the new result + dont_replace_result = "server records show that an invalid result was already returned"; + break; + default: + dont_replace_result = "server logic bug; please alert BOINC developers"; + break; + } + if (dont_replace_result) { + char buf[256]; + log_messages.printf( + SCHED_MSG_LOG::MSG_CRITICAL, + "[HOST#%d] [RESULT#%d %s] result already over [outcome=%d validate_state=%d]\n", + reply.host.id, srip->id, srip->name, srip->outcome, srip->validate_state + ); + sprintf(buf, "Completed result %s refused: %s", srip->name, dont_replace_result); + USER_MESSAGE um(buf, "high"); + reply.insert_message(um); + srip->id = 0; + reply.result_acks.push_back(std::string(rp->name)); + continue; + } + } + if (srip->server_state == RESULT_SERVER_STATE_UNSENT) { log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL, diff --git a/sched/sched_config.C b/sched/sched_config.C index 307bab91ae..16c56a3ae9 100644 --- a/sched/sched_config.C +++ b/sched/sched_config.C @@ -185,12 +185,12 @@ int SCHED_CONFIG::parse(char* buf) { #endif int SCHED_CONFIG::parse_file(const char* dir) { - char* p; char path[256]; int retval; sprintf(path, "%s/%s", dir, CONFIG_FILE); #if 0 + char* p; retval = read_file_malloc(path, p); if (retval) return retval; retval = parse(p);