// This file is part of BOINC. // http://boinc.berkeley.edu // Copyright (C) 2019 University of California // // BOINC is free software; you can redistribute it and/or modify it // under the terms of the GNU Lesser General Public License // as published by the Free Software Foundation, // either version 3 of the License, or (at your option) any later version. // // BOINC is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . // transitioner - handle transitions in the state of a WU // - a result has become DONE (via timeout or client reply) // - the WU error mask is set (e.g. by validater) // - assimilation is finished // // cmdline: // [ --one_pass ] do one pass, then exit // [ --d x ] debug level x // [ --mod n i ] process only WUs with (id mod n) == i // [ --sleep_interval x ] sleep x seconds if nothing to do // [ --wu_id n ] transition WU n (debugging) #include "config.h" #include #include #include #include #include #include #include #include #include #include "backend_lib.h" #include "boinc_db.h" #include "common_defs.h" #include "error_numbers.h" #include "filesys.h" #include "str_util.h" #include "svn_version.h" #include "util.h" #include "sched_config.h" #include "credit.h" #include "sched_util.h" #include "sched_msgs.h" #ifdef GCL_SIMULATOR #include "gcl_simulator.h" #endif #define LOCKFILE "transitioner.out" #define PIDFILE "transitioner.pid" #define SELECT_LIMIT 1000 #define DEFAULT_SLEEP_INTERVAL 5 int startup_time; R_RSA_PRIVATE_KEY key; int mod_n, mod_i; bool do_mod = false; bool one_pass = false; int sleep_interval = DEFAULT_SLEEP_INTERVAL; int wu_id = 0; void signal_handler(int) { log_messages.printf(MSG_NORMAL, "Signaled by simulator\n"); } int result_suffix(char* name) { char* p = strrchr(name, '_'); if (p) return atoi(p+1); return 0; } // A result timed out; penalize the corresponding host_app_version // static int result_timed_out( TRANSITIONER_ITEM res_item, TRANSITIONER_ITEM& wu_item ) { DB_HOST_APP_VERSION hav; char query[512], clause[512]; DB_ID_TYPE gavid = generalized_app_version_id( res_item.res_app_version_id, wu_item.appid ); int retval = hav_lookup(hav, res_item.res_hostid, gavid); if (retval) { log_messages.printf(MSG_NORMAL, "result_timed_out(): hav_lookup failed: %s\n", boincerror(retval) ); return 0; } hav.turnaround.update_var( (double)wu_item.delay_bound, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT ); int n = hav.max_jobs_per_day; if (n == 0) { n = config.daily_result_quota; } if (n > config.daily_result_quota) { n = config.daily_result_quota; } n -= 1; if (n < 1) { n = 1; } if (config.debug_quota) { log_messages.printf(MSG_NORMAL, "[quota] max_jobs_per_day for %ld; %d->%d\n", gavid, hav.max_jobs_per_day, n ); } hav.max_jobs_per_day = n; hav.consecutive_valid = 0; sprintf(query, "turnaround_n=%.15e, turnaround_avg=%.15e, turnaround_var=%.15e, turnaround_q=%.15e, max_jobs_per_day=%d, consecutive_valid=%d", hav.turnaround.n, hav.turnaround.avg, hav.turnaround.var, hav.turnaround.q, hav.max_jobs_per_day, hav.consecutive_valid ); sprintf(clause, "host_id=%lu and app_version_id=%lu", hav.host_id, hav.app_version_id ); retval = hav.update_fields_noid(query, clause); if (retval) { log_messages.printf(MSG_CRITICAL, "CRITICAL result_timed_out(): hav updated failed: %s\n", boincerror(retval) ); } return 0; } int handle_wu( DB_TRANSITIONER_ITEM_SET& transitioner, std::vector& items ) { int ntotal, nerrors, retval, ninprogress, nsuccess; int nunsent, ncouldnt_send, nover, ndidnt_need, nno_reply; int canonical_result_index, j; char suffix[256]; time_t now = time(0), x; bool all_over_and_validated, have_new_result_to_validate, do_delete; unsigned int i; TRANSITIONER_ITEM& wu_item = items[0]; TRANSITIONER_ITEM wu_item_original = wu_item; // count up the number of results in various states, // and check for timed-out results // ntotal = 0; nunsent = 0; // including INACTIVE ninprogress = 0; nover = 0; nerrors = 0; nsuccess = 0; // not counting invalid results!!!! ncouldnt_send = 0; nno_reply = 0; ndidnt_need = 0; have_new_result_to_validate = false; int rs, max_result_suffix = -1; // Scan the WU's results, and find the canonical result if there is one // canonical_result_index = -1; if (wu_item.canonical_resultid) { for (i=0; i= 0) { TRANSITIONER_ITEM& cr = items[canonical_result_index]; if (cr.res_file_delete_state == FILE_DELETE_DONE) { canonical_result_files_deleted = true; } } // Scan this WU's results, and // 1) count those in various server states; // 2) identify timed-out results and update their server state and outcome // 3) find the max result suffix (in case need to generate new ones) // 4) see if we have a new result to validate // (outcome SUCCESS and validate_state INIT) // for (i=0; i max_result_suffix) max_result_suffix = rs; switch (res_item.res_server_state) { case RESULT_SERVER_STATE_INACTIVE: case RESULT_SERVER_STATE_UNSENT: nunsent++; break; case RESULT_SERVER_STATE_IN_PROGRESS: if (res_item.res_report_deadline < now) { log_messages.printf(MSG_NORMAL, "[WU#%lu %s] [RESULT#%lu %s] result timed out (%d < %d) server_state:IN_PROGRESS=>OVER; outcome:NO_REPLY\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, res_item.res_report_deadline, (int)now ); res_item.res_server_state = RESULT_SERVER_STATE_OVER; res_item.res_outcome = RESULT_OUTCOME_NO_REPLY; retval = transitioner.update_result(res_item); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] [RESULT#%lu %s] update_result(): %s\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, boincerror(retval) ); } retval = result_timed_out(res_item, wu_item); if (retval) { log_messages.printf(MSG_CRITICAL, "result_timed_out() error: %s\n", boincerror(retval) ); exit(1); } nover++; nno_reply++; } else { ninprogress++; } break; case RESULT_SERVER_STATE_OVER: nover++; switch (res_item.res_outcome) { case RESULT_OUTCOME_COULDNT_SEND: log_messages.printf(MSG_NORMAL, "[WU#%lu %s] [RESULT#%lu %s] result couldn't be sent\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name ); ncouldnt_send++; break; case RESULT_OUTCOME_SUCCESS: if (res_item.res_validate_state == VALIDATE_STATE_INIT) { if (canonical_result_files_deleted) { res_item.res_validate_state = VALIDATE_STATE_TOO_LATE; retval = transitioner.update_result(res_item); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] [RESULT#%lu %s] update_result(): %s\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, boincerror(retval) ); } else { log_messages.printf(MSG_NORMAL, "[WU#%lu %s] [RESULT#%lu %s] validate_state:INIT=>TOO_LATE\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name ); } } else { have_new_result_to_validate = true; } } // don't count invalid results as successful // if (res_item.res_validate_state != VALIDATE_STATE_INVALID) { nsuccess++; } break; case RESULT_OUTCOME_CLIENT_ERROR: // is user aborted job, don't count it as an error // if (res_item.res_exit_status == EXIT_ABORTED_VIA_GUI) { nno_reply++; } else { nerrors++; } break; case RESULT_OUTCOME_VALIDATE_ERROR: nerrors++; break; case RESULT_OUTCOME_CLIENT_DETACHED: case RESULT_OUTCOME_NO_REPLY: nno_reply++; break; case RESULT_OUTCOME_DIDNT_NEED: ndidnt_need++; break; } break; } } log_messages.printf(MSG_DEBUG, "[WU#%lu %s] %d results: unsent %d, in_progress %d, over %d (success %d, error %d, couldnt_send %d, no_reply %d, didnt_need %d)\n", wu_item.id, wu_item.name, ntotal, nunsent, ninprogress, nover, nsuccess, nerrors, ncouldnt_send, nno_reply, ndidnt_need ); // if there's a new result to validate, trigger validation // if (have_new_result_to_validate && (nsuccess >= wu_item.min_quorum)) { wu_item.need_validate = true; log_messages.printf(MSG_NORMAL, "[WU#%lu %s] need_validate:=>true\n", wu_item.id, wu_item.name ); } // check for WU error conditions // NOTE: check on max # of success results is done in validater // if (ncouldnt_send > 0) { wu_item.error_mask |= WU_ERROR_COULDNT_SEND_RESULT; } // if WU has results with errors and there are no results that are // - successful // - in progress // - timed out (but could still be returned) // reset homogeneous redundancy class to give other platforms a try; // also reset app version ID if using HAV // if (nerrors && !(nsuccess || ninprogress || nno_reply)) { if (!config.hr_class_static) { wu_item.hr_class = 0; wu_item.app_version_id = 0; } } if (nerrors > wu_item.max_error_results) { log_messages.printf(MSG_NORMAL, "[WU#%lu %s] WU has too many errors (%d errors for %d results)\n", wu_item.id, wu_item.name, nerrors, ntotal ); wu_item.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS; } // see how many new results we need to make // int n_new_results_needed = wu_item.target_nresults - nunsent - ninprogress - nsuccess; if (n_new_results_needed < 0) n_new_results_needed = 0; int n_new_results_allowed = wu_item.max_total_results - ntotal; // if we're already at the limit and need more, error out the WU // bool too_many = false; if (n_new_results_allowed < 0) { too_many = true; } else if (n_new_results_allowed == 0) { if (n_new_results_needed > 0) { too_many = true; } } else { if (n_new_results_needed > n_new_results_allowed) { n_new_results_needed = n_new_results_allowed; } } if (too_many) { log_messages.printf(MSG_NORMAL, "[WU#%lu %s] WU has too many total results (%d)\n", wu_item.id, wu_item.name, ntotal ); wu_item.error_mask |= WU_ERROR_TOO_MANY_TOTAL_RESULTS; } // if this WU had an error, don't send any unsent results, // and trigger assimilation if needed // if (wu_item.error_mask) { for (i=0; iOVER; outcome:=>DIDNT_NEED\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name ); res_item.res_server_state = RESULT_SERVER_STATE_OVER; res_item.res_outcome = RESULT_OUTCOME_DIDNT_NEED; update_result = true; break; case RESULT_SERVER_STATE_OVER: switch (res_item.res_outcome) { case RESULT_OUTCOME_SUCCESS: switch(res_item.res_validate_state) { case VALIDATE_STATE_INIT: case VALIDATE_STATE_INCONCLUSIVE: res_item.res_validate_state = VALIDATE_STATE_NO_CHECK; update_result = true; break; } } } if (update_result) { retval = transitioner.update_result(res_item); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] [RESULT#%lu %s] result.update(): %s\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, boincerror(retval) ); } } } if (wu_item.assimilate_state == ASSIMILATE_INIT) { wu_item.assimilate_state = ASSIMILATE_READY; log_messages.printf(MSG_NORMAL, "[WU#%lu %s] error_mask:%d assimilate_state:INIT=>READY\n", wu_item.id, wu_item.name, wu_item.error_mask ); } } else if (wu_item.canonical_resultid == 0) { // Here if no WU-level error. // Generate new results if needed. // std::string values; char value_buf[MAX_QUERY_LEN]; if (wu_item.transitioner_flags != TRANSITION_NO_NEW_RESULTS && n_new_results_needed > 0 ) { log_messages.printf( MSG_NORMAL, "[WU#%lu %s] Generating %d more results (%d target - %d unsent - %d in progress - %d success)\n", wu_item.id, wu_item.name, n_new_results_needed, wu_item.target_nresults, nunsent, ninprogress, nsuccess ); for (j=0; j most_recently_returned) { most_recently_returned = res_item.res_received_time; } if (res_item.res_outcome == RESULT_OUTCOME_SUCCESS) { if (res_item.res_validate_state == VALIDATE_STATE_INIT) { all_over_and_validated = false; all_over_and_ready_to_assimilate = false; } } else if (res_item.res_outcome == RESULT_OUTCOME_NO_REPLY) { if (now < res_item.res_report_deadline) { all_over_and_validated = false; } } } else { all_over_and_validated = false; all_over_and_ready_to_assimilate = false; } } // If we're deferring assimilation until all results are over and validated, // when that happens make sure that WU state is advanced to assimilate ready // the items.size is a kludge // if (all_over_and_ready_to_assimilate && wu_item.assimilate_state == ASSIMILATE_INIT && items.size() > 0 && wu_item.canonical_resultid > 0 ) { wu_item.assimilate_state = ASSIMILATE_READY; log_messages.printf(MSG_NORMAL, "[WU#%lu %s] Deferred assimilation now set to ASSIMILATE_STATE_READY\n", wu_item.id, wu_item.name ); } // if WU is assimilated, trigger file deletion // double deferred_file_delete_time = 0; if (wu_item.assimilate_state == ASSIMILATE_DONE) { if (now >= (most_recently_returned + config.delete_delay)) { // can delete input files if all results OVER // if (all_over_and_validated && wu_item.file_delete_state == FILE_DELETE_INIT) { wu_item.file_delete_state = FILE_DELETE_READY; log_messages.printf(MSG_DEBUG, "[WU#%lu %s] ASSIMILATE_DONE: file_delete_state:=>READY\n", wu_item.id, wu_item.name ); } // output of error results can be deleted immediately; // output of success results can be deleted if validated // for (i=0; iREADY\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name ); res_item.res_file_delete_state = FILE_DELETE_READY; retval = transitioner.update_result(res_item); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] [RESULT#%lu %s] result.update(): %s\n", wu_item.id, wu_item.name, res_item.res_id, res_item.res_name, boincerror(retval) ); } } } } else { deferred_file_delete_time = most_recently_returned + config.delete_delay; log_messages.printf(MSG_DEBUG, "[WU#%lu %s] deferring file deletion for %.0f seconds\n", wu_item.id, wu_item.name, deferred_file_delete_time - now ); } } // Compute next transition time. // This is the min of // - timeouts of in-progress results // - deferred file deletion time // - safety net // // It is then adjusted to deal with transitioner congestion // if (wu_item.canonical_resultid || wu_item.error_mask) { wu_item.transition_time = INT_MAX; } else { // Safety net: if there is no canonical result and no WU-level error, // make sure that the transitioner will process this WU again. // In principle this is not needed, // but it makes the BOINC back-end more robust. // const int ten_days = 10*86400; int long_delay = (int)(1.5*wu_item.delay_bound); wu_item.transition_time = (long_delay > ten_days) ? long_delay : ten_days; wu_item.transition_time += time(0); } // handle timeout of in-progress results // for (i=0; i 86400) extra_delay = 86400; log_messages.printf(MSG_DEBUG, "[WU#%lu %s] transition time in past: adding extra delay %d sec\n", wu_item.id, wu_item.name, extra_delay ); wu_item.transition_time = now + extra_delay; } log_messages.printf(MSG_DEBUG, "[WU#%lu %s] setting transition_time to %d\n", wu_item.id, wu_item.name, wu_item.transition_time ); retval = transitioner.update_workunit(wu_item, wu_item_original); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] workunit.update(): %s\n", wu_item.id, wu_item.name, boincerror(retval) ); return retval; } return 0; } bool do_pass() { int retval; DB_TRANSITIONER_ITEM_SET transitioner; std::vector items; bool did_something = false; if (!one_pass) check_stop_daemons(); // loop over entries that are due to be checked // while (1) { if (wu_id) { // kludge to tell enumerate to return a given WU mod_n = 1; mod_i = wu_id; } retval = transitioner.enumerate( (int)time(0), SELECT_LIMIT, mod_n, mod_i, items ); if (retval) { if (retval != ERR_DB_NOT_FOUND) { log_messages.printf(MSG_CRITICAL, "WU enum error: %s; exiting\n", boincerror(retval) ); exit(1); } break; } did_something = true; TRANSITIONER_ITEM& wu_item = items[0]; retval = handle_wu(transitioner, items); if (retval) { log_messages.printf(MSG_CRITICAL, "[WU#%lu %s] handle_wu: %s; quitting\n", wu_item.id, wu_item.name, boincerror(retval) ); // probably better to exit here. // Whatever cause this WU to fail (and it could be temporary) // might cause ALL WUs to fail // exit(1); } if (!one_pass) check_stop_daemons(); if (wu_id) break; } return did_something; } void main_loop() { int retval; retval = boinc_db.open(config.db_name, config.db_host, config.db_user, config.db_passwd); if (retval) { log_messages.printf(MSG_CRITICAL, "boinc_db.open failed: %s\n", boinc_db.error_string() ); exit(1); } while (1) { log_messages.printf(MSG_DEBUG, "doing a pass\n"); if (1) { bool did_something = do_pass(); if (one_pass) break; if (did_something) continue; #ifdef GCL_SIMULATOR continue_simulation("transitioner"); signal(SIGUSR2, simulator_signal_handler); pause(); #else log_messages.printf(MSG_DEBUG, "sleeping %d\n", sleep_interval); daemon_sleep(sleep_interval); #endif } } } void usage(char *name) { fprintf(stderr, "Handles transitions in the state of a WU\n" " - a result has become DONE (via timeout or client reply)\n" " - the WU error mask is set (e.g. by validater)\n" " - assimilation is finished\n\n" "Usage: %s [OPTION]...\n\n" "Options: \n" " [ --one_pass ] do one pass, then exit\n" " [ --d x ] debug level x\n" " [ --mod n i ] process only WUs with (id mod n) == i\n" " [ --sleep_interval x ] sleep x seconds if nothing to do\n" " [ -h | --help ] Show this help text.\n" " [ -v | --version ] Shows version information.\n", name ); } int main(int argc, char** argv) { int i, retval; char path[MAXPATHLEN]; startup_time = time(0); for (i=1; i