diff --git a/apps/error.C b/apps/error.C index cf9942322b..0e84ae92d4 100644 --- a/apps/error.C +++ b/apps/error.C @@ -20,10 +20,13 @@ // crash and burn #include +#include int main() { - int c, n=0; - fprintf(stderr, "APP: upper_case starting\n"); + char * hello = (char *) 100; + int c, n=0; + fprintf(stderr, "APP: upper_case starting\n"); + printf("%s",hello); while (1) { c = getchar(); if (c == EOF) break; diff --git a/apps/upper_case.C b/apps/upper_case.C index 1b0645a881..6e86532e55 100755 --- a/apps/upper_case.C +++ b/apps/upper_case.C @@ -231,7 +231,8 @@ int main(int argc, char **argv) { boinc_finish_opengl(); boinc_finish(0); - + char * hello = (char *) 100; + printf("%s",hello); return 0; } diff --git a/client/app.C b/client/app.C index f719c66f5d..87c464fcd1 100644 --- a/client/app.C +++ b/client/app.C @@ -147,7 +147,8 @@ int ACTIVE_TASK::start(bool first_time) { } return ERR_FOPEN; } - retval = write_init_data_file(f, aid); + write_init_data_file(f, aid); + fclose(f); sprintf(graphics_data_path, "%s%s%s", slot_dir, PATH_SEPARATOR, GRAPHICS_DATA_FILE); @@ -331,8 +332,8 @@ int ACTIVE_TASK::start(bool first_time) { } if (log_flags.task_debug) printf("forked process: pid %d\n", pid); #endif - state = PROCESS_RUNNING; + result->active_task_state = PROCESS_RUNNING; return 0; } @@ -386,7 +387,7 @@ bool ACTIVE_TASK_SET::poll() { LONGLONG totTime; bool found = false; - for (int i=0; ipid_handle, &exit_code)) { // Get the elapsed CPU time @@ -404,11 +405,18 @@ bool ACTIVE_TASK_SET::poll() { found = true; if (atp->state == PROCESS_ABORT_PENDING) { atp->state = PROCESS_ABORTED; + atp->result->active_task_state = PROCESS_ABORTED; + gstate.report_project_error(atp.result,0,"process was aborted\n"); } else { atp->state = PROCESS_EXITED; atp->exit_status = exit_code; atp->result->exit_status = atp->exit_status; atp->result->active_task_state = PROCESS_EXITED; + //if a nonzero error code, then report it + if(exit_code) + { + gstate.report_project_error(atp.result,0,"process exited with a non zero exit code\n"); + } } CloseHandle(atp->pid_handle); CloseHandle(atp->thread_handle); @@ -435,20 +443,31 @@ bool ACTIVE_TASK_SET::poll() { atp->result->final_cpu_time = atp->starting_cpu_time + x; if (atp->state == PROCESS_ABORT_PENDING) { atp->state = PROCESS_ABORTED; + atp->result->active_task_state = PROCESS_ABORTED; + gstate.report_project_error(*(atp->result),0,"process was aborted\n"); } else { if (WIFEXITED(stat)) { atp->state = PROCESS_EXITED; atp->exit_status = WEXITSTATUS(stat); atp->result->exit_status = atp->exit_status; - if (log_flags.task_debug) printf("process exited: status %d\n", atp->exit_status); + atp->result->active_task_state = PROCESS_EXITED; + + //if exit_status != 0, then we don't need to upload the files for the result of this app + if(atp->exit_status) + { + gstate.report_project_error(*(atp->result),0,"process exited with a nonzero exit code\n"); + } + if (log_flags.task_debug) printf("process exited: status %d\n", atp->exit_status); } else if (WIFSIGNALED(stat)) { atp->state = PROCESS_WAS_SIGNALED; atp->signal = WTERMSIG(stat); - atp->result->exit_status = atp->signal; + atp->result->signal = atp->signal; + atp->result->active_task_state = PROCESS_WAS_SIGNALED; + gstate.report_project_error(*(atp->result),0,"process was signaled\n"); if (log_flags.task_debug) printf("process was signaled: %d\n", atp->signal); } else { atp->state = PROCESS_EXIT_UNKNOWN; - atp->result->exit_status = -1; + atp->result->state = PROCESS_EXIT_UNKNOWN; } } @@ -475,6 +494,7 @@ bool ACTIVE_TASK_SET::poll() { int ACTIVE_TASK::abort() { state = PROCESS_ABORT_PENDING; + result->active_task_state = PROCESS_ABORT_PENDING; return kill_task(); } @@ -515,7 +535,10 @@ void ACTIVE_TASK_SET::suspend_all() { ACTIVE_TASK* atp; for (i=0; isuspend(); + if(atp->suspend()); + { + fprintf(stderr, "ACTIVE_TASK_SET::exit_tasks(): could not suspend active_task\n"); + } } } @@ -524,10 +547,14 @@ void ACTIVE_TASK_SET::suspend_all() { void ACTIVE_TASK_SET::unsuspend_all() { unsigned int i; ACTIVE_TASK* atp; - for (i=0; iunsuspend(); - } + if(atp->unsuspend()) + { + fprintf(stderr, "ACTIVE_TASK_SET::exit_tasks(): could not suspend active_task\n"); + } + } } // initiate exit of all currently running tasks @@ -537,7 +564,10 @@ void ACTIVE_TASK_SET::exit_tasks() { ACTIVE_TASK *atp; for (i=0; irequest_exit(); + if(atp->request_exit()); + { + fprintf(stderr, "ACTIVE_TASK_SET::exit_tasks(): could not suspend active_task\n"); + } } } @@ -612,6 +642,8 @@ int ACTIVE_TASK_SET::restart_tasks() { } if (retval) { fprintf(stderr, "ACTIVE_TASKS::restart_tasks(); restart failed: %d\n", retval); + atp->result->active_task_state = PROCESS_COULDNT_START; + gstate.report_project_error(*(atp->result),0,"Couldn't restart the app for this result.\n"); active_tasks.erase(iter); } else { iter++; diff --git a/client/client_state.C b/client/client_state.C index 185a61bcdf..71d877add4 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -293,7 +293,10 @@ bool CLIENT_STATE::do_something() { x = update_results(); if (x) {action=true; print_log("update_results\n"); } - write_state_file_if_needed(); + if(write_state_file_if_needed()) + { + fprintf(stderr, "CLIENT_STATE::do_something(): could not write state file"); + } } if (!action) { time_stats.update(true, !activities_suspended); @@ -857,6 +860,11 @@ bool CLIENT_STATE::update_results() { action = true; } break; + + case RESULT_ERROR: + rp->state = RESULT_READY_TO_ACK; + action = true; + case RESULT_READY_TO_ACK: // The transition to SERVER_ACK is performed in // handle_scheduler_reply() @@ -935,6 +943,9 @@ void CLIENT_STATE::set_client_state_dirty(char* source) { // int CLIENT_STATE::report_project_error( RESULT &res,int err_num, char *err_msg ) { char total_err[500]; + unsigned int i; + FILE_INFO* fip; + res.state = RESULT_READY_TO_ACK; scheduler_op->backoff(res.project,""); diff --git a/client/client_types.h b/client/client_types.h index 5289cf8963..e55702a8fa 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -204,6 +204,7 @@ struct WORKUNIT { // Files are uploaded, notify scheduling server #define RESULT_SERVER_ACK 4 // Received ack from server, can delete result +#define RESULT_ERROR 5 struct RESULT { char name[256]; @@ -212,9 +213,9 @@ struct RESULT { vector output_files; bool is_active; // an app is currently running for this double final_cpu_time; - int state; // status of this result + int state; // state of this result int exit_status; // return value from the application - int signal; + int signal; //the signal caught by the active_task, makes sense only if active_task_state is PROCESS_SIGNALED int active_task_state; // the state of the active task corresponding to this result char stderr_out[STDERR_MAX_LEN]; APP* app; diff --git a/client/cs_apps.C b/client/cs_apps.C index b1479d26b3..0d18abec59 100644 --- a/client/cs_apps.C +++ b/client/cs_apps.C @@ -112,7 +112,7 @@ bool CLIENT_STATE::handle_running_apps() { ); } app_finished(*atp); - active_tasks.remove(atp); + active_tasks.remove(atp); delete atp; set_client_state_dirty("handle_running_apps"); action = true; @@ -150,6 +150,7 @@ bool CLIENT_STATE::start_apps() { ACTIVE_TASK* atp; bool action = false; int open_slot; + int retval; for (i=0; islot = open_slot; atp->init(rp); - active_tasks.insert(atp); + retval = active_tasks.insert(atp); + //couldn't start process + if(retval) + { + atp->state = PROCESS_COULDNT_START; + atp->result->active_task_state = PROCESS_COULDNT_START; + report_project_error(*(atp->result),0,"Couldn't start the app for this result.\n"); + } action = true; set_client_state_dirty("start_apps"); app_started = time(0); diff --git a/client/pers_file_xfer.C b/client/pers_file_xfer.C index 5c305c22e5..823f5701c8 100644 --- a/client/pers_file_xfer.C +++ b/client/pers_file_xfer.C @@ -190,9 +190,12 @@ void PERS_FILE_XFER::handle_xfer_failure(unsigned int cur_time) { // See if it's time to give up on the persistent file xfer // if ((cur_time - first_request_time) > gstate.giveup_after) { - // Set the associated files status to a ERR_GIVEUP failure - fip->status = ERR_GIVEUP; - xfer_done = true; + // Set the associated files status to a ERR_GIVEUP_DOWNLOAD and ERR_GIVEUP_UPLOAD failure + if(is_upload) + fip->status = ERR_GIVEUP_UPLOAD; + else + fip->status = ERR_GIVEUP_DOWNLOAD; + xfer_done = true; } if (log_flags.file_xfer_debug) { printf("Error: transfer failure for %s: %d\n", fip->name, fip->status); diff --git a/db/constraints.sql b/db/constraints.sql index 6920ef3ee1..cad06257e1 100644 --- a/db/constraints.sql +++ b/db/constraints.sql @@ -27,4 +27,4 @@ create index wu_retry on workunit(appid, retry_check_time); alter table result add unique(name); create index res_wuid on result(workunitid); -create index ind_res_st on result(state); +create index ind_res_st on result(server_state); diff --git a/db/db.h b/db/db.h index bba9b88092..448006c9e5 100644 --- a/db/db.h +++ b/db/db.h @@ -219,13 +219,13 @@ struct WORKUNIT { char app_name[256]; }; -#define RESULT_STATE_INACTIVE 1 -#define RESULT_STATE_UNSENT 2 -#define RESULT_STATE_IN_PROGRESS 3 -#define RESULT_STATE_DONE 4 -#define RESULT_STATE_TIMEOUT 5 -#define RESULT_STATE_ERROR 6 -#define RESULT_STATE_UNSENT_SEQ 7 +#define RESULT_SERVER_STATE_INACTIVE 1 +#define RESULT_SERVER_STATE_UNSENT 2 +#define RESULT_SERVER_STATE_IN_PROGRESS 3 +#define RESULT_SERVER_STATE_DONE 4 +#define RESULT_SERVER_STATE_TIMEOUT 5 +#define RESULT_SERVER_STATE_ERROR 6 +#define RESULT_SERVER_STATE_UNSENT_SEQ 7 // unsent, part of a work sequence #define VALIDATE_STATE_INITIAL 0 @@ -237,7 +237,7 @@ struct RESULT { int id; unsigned int create_time; int workunitid; - int state; // state (see above) + int server_state; // server state (see above) int hostid; // host processing this result unsigned int report_deadline; // deadline for receiving result unsigned int sent_time; // when result was sent to host @@ -325,9 +325,9 @@ extern int db_result_new(RESULT& p); extern int db_result(int id, RESULT&); extern int db_result_update(RESULT& p); extern int db_result_lookup_name(RESULT& p); -extern int db_result_enum_state(RESULT&, int); +extern int db_result_enum_server_state(RESULT&, int); extern int db_result_enum_wuid(RESULT&); -extern int db_result_count_state(int state, int&); +extern int db_result_count_server_state(int state, int&); extern int db_workseq_new(WORKSEQ& p); #endif diff --git a/db/db_mysql.C b/db/db_mysql.C index 8f9150da56..699198e09d 100644 --- a/db/db_mysql.C +++ b/db/db_mysql.C @@ -211,13 +211,13 @@ void struct_to_str(void* vp, char* q, int type) { case TYPE_RESULT: rp = (RESULT*)vp; sprintf(q, - "id=%d, create_time=%d, workunitid=%d, state=%d, " + "id=%d, create_time=%d, workunitid=%d, server_state=%d, " "hostid=%d, report_deadline=%d, sent_time=%d, received_time=%d, " "name='%s', exit_status=%d, cpu_time=%f, " "xml_doc_in='%s', xml_doc_out='%s', stderr_out='%s', " "batch=%d, project_state=%d, validate_state=%d, " "claimed_credit=%f, granted_credit=%f", - rp->id, rp->create_time, rp->workunitid, rp->state, + rp->id, rp->create_time, rp->workunitid, rp->server_state, rp->hostid, rp->report_deadline, rp->sent_time, rp->received_time, rp->name, rp->exit_status, rp->cpu_time, rp->xml_doc_in, rp->xml_doc_out, rp->stderr_out, @@ -388,7 +388,7 @@ void row_to_struct(MYSQL_ROW& r, void* vp, int type) { rp->id = atoi(r[i++]); rp->create_time = atoi(r[i++]); rp->workunitid = atoi(r[i++]); - rp->state = atoi(r[i++]); + rp->server_state = atoi(r[i++]); rp->hostid = atoi(r[i++]); rp->report_deadline = atoi(r[i++]); rp->sent_time = atoi(r[i++]); @@ -640,11 +640,11 @@ int db_result_lookup_name(RESULT& p) { return db_lookup(&p, TYPE_RESULT, buf); } -int db_result_enum_state(RESULT& p, int limit) { +int db_result_enum_server_state(RESULT& p, int limit) { static ENUM e; char buf[256]; - if (!e.active) sprintf(buf, "where state=%d", p.state); + if (!e.active) sprintf(buf, "where server_state=%d", p.server_state); return db_enum(e, &p, TYPE_RESULT, buf, limit); } @@ -656,10 +656,10 @@ int db_result_enum_wuid(RESULT& p) { return db_enum(e, &p, TYPE_RESULT, buf); } -int db_result_count_state(int state, int& n) { +int db_result_count_server_state(int state, int& n) { char buf[256]; - sprintf(buf, " where state=%d", state); + sprintf(buf, " where server_state=%d", state); return db_count(&n, "*", TYPE_RESULT, buf); } diff --git a/db/schema.sql b/db/schema.sql index 069c8fd4a0..1cb26563c4 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -145,7 +145,7 @@ create table result ( id integer not null auto_increment, create_time integer not null, workunitid integer not null, - state integer not null, + server_state integer not null, hostid integer not null, report_deadline integer not null, sent_time integer not null, diff --git a/lib/error_numbers.h b/lib/error_numbers.h index 99bc789f98..bb2dd0785d 100755 --- a/lib/error_numbers.h +++ b/lib/error_numbers.h @@ -34,22 +34,23 @@ // Couldn't resolve hostname #define ERR_GETHOSTBYNAME -113 // too much time has elapsed without progress on file xfer -#define ERR_GIVEUP -114 +#define ERR_GIVEUP_DOWNLOAD -114 +#define ERR_GIVEUP_UPLOAD -115 // unexpected NULL pointer -#define ERR_NULL -115 +#define ERR_NULL -116 // unexpected negative value -#define ERR_NEG -116 +#define ERR_NEG -117 // caught buffer overflow -#define ERR_BUFF_OVERFLOW -117 +#define ERR_BUFF_OVERFLOW -118 // MD5 checksum failed for a file -#define ERR_MD5_FAILED -118 +#define ERR_MD5_FAILED -119 // RSA key check failed for a file -#define ERR_RSA_FAILED -119 -#define ERR_OPEN -120 -#define ERR_DUP2 -121 -#define ERR_NO_SIGNATURE -122 +#define ERR_RSA_FAILED -120 +#define ERR_OPEN -121 +#define ERR_DUP2 -122 +#define ERR_NO_SIGNATURE -123 // Error creating a thread -#define ERR_THREAD -123 -#define ERR_SIGNAL_CATCH -124 +#define ERR_THREAD -124 +#define ERR_SIGNAL_CATCH -125 // The app exited due to user request and should be restarted later -#define ERR_QUIT_REQUEST -125 +#define ERR_QUIT_REQUEST -126 diff --git a/sched/feeder.C b/sched/feeder.C index c4d7835bce..46f7aa3bab 100644 --- a/sched/feeder.C +++ b/sched/feeder.C @@ -112,8 +112,8 @@ void feeder_loop(SCHED_SHMEM* ssp) { restarted_enum = false; for (i=0; inwu_results; i++) { if (!ssp->wu_results[i].present) { - result.state = RESULT_STATE_UNSENT; - retval = db_result_enum_state(result, RESULTS_PER_ENUM); + result.server_state = RESULT_SERVER_STATE_UNSENT; + retval = db_result_enum_server_state(result, RESULTS_PER_ENUM); if (retval) { // if we already restarted the enum on this pass, @@ -127,8 +127,8 @@ void feeder_loop(SCHED_SHMEM* ssp) { // restart the enumeration // restarted_enum = true; - result.state = RESULT_STATE_UNSENT; - retval = db_result_enum_state(result, RESULTS_PER_ENUM); + result.server_state = RESULT_SERVER_STATE_UNSENT; + retval = db_result_enum_server_state(result, RESULTS_PER_ENUM); printf("feeder: restarting enumeration: %d\n", retval); if (retval) { printf("feeder: enumeration returned nothing\n"); diff --git a/sched/handle_request.C b/sched/handle_request.C index 5ce7cfa62b..63c96c6543 100644 --- a/sched/handle_request.C +++ b/sched/handle_request.C @@ -315,10 +315,10 @@ int handle_results( if (retval) { printf("can't find result %s\n", rp->name); } else { - if (result.state != RESULT_STATE_IN_PROGRESS) { + if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) { fprintf(stderr, - "got unexpected result for %s: state is %d\n", - rp->name, result.state + "got unexpected result for %s: server state is %d\n", + rp->name, result.server_state ); continue; } @@ -338,9 +338,9 @@ int handle_results( result.exit_status = rp->exit_status; result.cpu_time = rp->cpu_time; if (rp->exit_status) { - result.state = RESULT_STATE_ERROR; + result.server_state = RESULT_SERVER_STATE_ERROR; } else { - result.state = RESULT_STATE_DONE; + result.server_state = RESULT_SERVER_STATE_DONE; } result.claimed_credit = result.cpu_time * host.credit_per_cpu_sec; result.validate_state = VALIDATE_STATE_NEED_CHECK; @@ -441,7 +441,7 @@ int send_work( seconds_to_fill -= (int)estimate_duration(wu, reply.host); - result.state = RESULT_STATE_IN_PROGRESS; + result.server_state = RESULT_SERVER_STATE_IN_PROGRESS; result.hostid = reply.host.id; result.sent_time = time(0); db_result_update(result); diff --git a/sched/make_work.C b/sched/make_work.C index dcdc7ee583..666524d9de 100644 --- a/sched/make_work.C +++ b/sched/make_work.C @@ -93,7 +93,7 @@ void make_work() { nresults_left = 0; while (true) { fflush(stdout); - retval = db_result_count_state(RESULT_STATE_UNSENT, n); + retval = db_result_count_server_state(RESULT_SERVER_STATE_UNSENT, n); if (retval) { fprintf(stderr, "make_work: can't counts results\n"); exit(1); diff --git a/sched/result_retry.C b/sched/result_retry.C index 796955dfac..2c07e34e0c 100644 --- a/sched/result_retry.C +++ b/sched/result_retry.C @@ -178,16 +178,16 @@ bool do_pass(APP& app) { // if any result is unsent, give up on the WU // - if (result.state == RESULT_STATE_UNSENT) { + if (result.server_state == RESULT_SERVER_STATE_UNSENT) { fprintf(stderr, "WU %s has unsent result\n", wu.name); wu.state = WU_STATE_SEND_FAIL; wu.retry_check_time = 0; goto update_wu; } - if (result.state == RESULT_STATE_ERROR) { + if (result.server_state == RESULT_SERVER_STATE_ERROR) { nerrors++; } - if (result.state == RESULT_STATE_DONE) { + if (result.server_state == RESULT_SERVER_STATE_DONE) { ndone++; } } diff --git a/sched/validate.C b/sched/validate.C index b9fc319494..f724d96a25 100644 --- a/sched/validate.C +++ b/sched/validate.C @@ -182,7 +182,7 @@ bool do_validate_scan(APP& app, int min_quorum) { vector results; result.workunitid = wu.id; while (!db_result_enum_wuid(result)) { - if (result.state == RESULT_STATE_DONE) { + if (result.server_state == RESULT_SERVER_STATE_DONE) { results.push_back(result); } } diff --git a/tools/backend_lib.C b/tools/backend_lib.C index 9929400dcc..f1e0677b2f 100644 --- a/tools/backend_lib.C +++ b/tools/backend_lib.C @@ -150,7 +150,7 @@ void initialize_result(RESULT& result, WORKUNIT& wu) { result.id = 0; result.create_time = time(0); result.workunitid = wu.id; - result.state = RESULT_STATE_UNSENT; + result.server_state = RESULT_SERVER_STATE_UNSENT; result.hostid = 0; result.report_deadline = time(0) + wu.delay_bound; result.sent_time = 0;