From 54bb40beef3a78bbbb404eb5e7e50ae5d8cb6a44 Mon Sep 17 00:00:00 2001 From: Hamid Aghdaee Date: Mon, 9 Dec 2002 22:36:46 +0000 Subject: [PATCH] *** empty log message *** svn path=/trunk/boinc/; revision=713 --- checkin_notes | 27 ++++++++++++++ client/app.C | 28 ++++++++------- client/client_state.C | 79 +++++++++++++++++++++++++++++++++++------ client/client_state.h | 15 +++++++- client/client_types.C | 9 +++-- client/client_types.h | 11 +++++- client/cs_apps.C | 8 ++--- client/pers_file_xfer.h | 2 +- 8 files changed, 147 insertions(+), 32 deletions(-) diff --git a/checkin_notes b/checkin_notes index 0a7d373e94..70f671059f 100755 --- a/checkin_notes +++ b/checkin_notes @@ -2573,3 +2573,30 @@ Eric Dec 8, 2002 pers_file_xfer.C scheduler_op.C +Hamid Dec 9,2002 + - added field client_state to result, which is what is reported back to the sched server as the state of the client in regards to this result when the RPC was made + + - added some more functionality to report_project_error() + It now adds the following info to stderr_out of the result: + + message if any passed + %d + %d + %d + if download had failures + "\ + " %s + " %d\n" + " \n" + + + if upload had failures + same as above + + if couldn't start active task for result (in which case err_num should be set) + "%d\n" + + + - The error mechanism right now is such that any failures regarding file_infos (download, upload or whatever) are recorded in the status of the file_infos and then garbage collect calls report_project_error. However errors regarding starting and ending active tasks are reported from app.C and cs_apps.C. 
+ + \ No newline at end of file diff --git a/client/app.C b/client/app.C index 976f420d9b..c81955fdca 100644 --- a/client/app.C +++ b/client/app.C @@ -258,6 +258,7 @@ int ACTIVE_TASK::start(bool first_time) { STARTUPINFO startup_info; char slotdirpath[256]; char cmd_line[512]; + int win_error; memset( &process_info, 0, sizeof( process_info ) ); memset( &startup_info, 0, sizeof( startup_info ) ); @@ -283,15 +284,15 @@ int ACTIVE_TASK::start(bool first_time) { &startup_info, &process_info )) { - state = GetLastError(); + win_error = GetLastError(); char *errorargs[] = {app_version->app_name,"","","",""}; LPVOID lpMsgBuf; FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM, - NULL, state, 0, (LPTSTR)&lpMsgBuf, 0, errorargs); + NULL, win_error, 0, (LPTSTR)&lpMsgBuf, 0, errorargs); // check for an error; if there is one, set error information for the currect result - if(state) { - gstate.report_project_error(*result, state, (LPTSTR)&lpMsgBuf); + if(win_error) { + gstate.report_project_error(*result, win_error, (LPTSTR)&lpMsgBuf); LocalFree(lpMsgBuf); return -1; } @@ -334,6 +335,7 @@ int ACTIVE_TASK::start(bool first_time) { #endif state = PROCESS_RUNNING; result->active_task_state = PROCESS_RUNNING; + result->client_state = CLIENT_COMPUTING; return 0; } @@ -406,7 +408,7 @@ bool ACTIVE_TASK_SET::poll() { if (atp->state == PROCESS_ABORT_PENDING) { atp->state = PROCESS_ABORTED; atp->result->active_task_state = PROCESS_ABORTED; - gstate.report_project_error(*(atp->result),0,"process was aborted\n"); + gstate.report_project_error(*(atp->result), 0, "process was aborted\n",CLIENT_COMPUTING); } else { atp->state = PROCESS_EXITED; atp->exit_status = exit_code; @@ -415,7 +417,7 @@ bool ACTIVE_TASK_SET::poll() { //if a nonzero error code, then report it if(exit_code) { - gstate.report_project_error(*(atp->result),0,"process exited with a non zero exit code\n"); + gstate.report_project_error(*(atp->result),0,"process exited with a non zero exit 
code\n",CLIENT_COMPUTING); } } CloseHandle(atp->pid_handle); @@ -444,7 +446,7 @@ bool ACTIVE_TASK_SET::poll() { if (atp->state == PROCESS_ABORT_PENDING) { atp->state = PROCESS_ABORTED; atp->result->active_task_state = PROCESS_ABORTED; - gstate.report_project_error(*(atp->result),0,"process was aborted\n"); + gstate.report_project_error(*(atp->result),0,"process was aborted\n",CLIENT_COMPUTING); } else { if (WIFEXITED(stat)) { atp->state = PROCESS_EXITED; @@ -455,7 +457,7 @@ bool ACTIVE_TASK_SET::poll() { //if exit_status != 0, then we don't need to upload the files for the result of this app if(atp->exit_status) { - gstate.report_project_error(*(atp->result),0,"process exited with a nonzero exit code\n"); + gstate.report_project_error(*(atp->result),0,"process exited with a nonzero exit code\n",CLIENT_COMPUTING); } if (log_flags.task_debug) printf("process exited: status %d\n", atp->exit_status); } else if (WIFSIGNALED(stat)) { @@ -463,11 +465,11 @@ bool ACTIVE_TASK_SET::poll() { atp->signal = WTERMSIG(stat); atp->result->signal = atp->signal; atp->result->active_task_state = PROCESS_WAS_SIGNALED; - gstate.report_project_error(*(atp->result),0,"process was signaled\n"); + gstate.report_project_error(*(atp->result),0,"process was signaled\n",CLIENT_COMPUTING); if (log_flags.task_debug) printf("process was signaled: %d\n", atp->signal); } else { - atp->state = PROCESS_EXIT_UNKNOWN; - atp->result->state = PROCESS_EXIT_UNKNOWN; + atp->state = PROCESS_EXIT_UNKNOWN; + atp->result->state = PROCESS_EXIT_UNKNOWN; } } @@ -643,7 +645,9 @@ int ACTIVE_TASK_SET::restart_tasks() { if (retval) { fprintf(stderr, "ACTIVE_TASKS::restart_tasks(); restart failed: %d\n", retval); atp->result->active_task_state = PROCESS_COULDNT_START; - gstate.report_project_error(*(atp->result),0,"Couldn't restart the app for this result.\n"); + atp->result->client_state = CLIENT_COMPUTING; + gstate.report_project_error(*(atp->result),retval,"Couldn't restart the app for this 
result.\n",CLIENT_COMPUTING); + active_tasks.erase(iter); } else { iter++; diff --git a/client/client_state.C b/client/client_state.C index e5075d871a..07849453a5 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -762,7 +762,8 @@ bool CLIENT_STATE::garbage_collect() { if(rp->wup->had_failure(failnum)) { // If we don't already have an error for this file if (rp->state < RESULT_READY_TO_ACK) { - report_project_error(*rp,failnum,"The work_unit corresponding to this result had an error"); + // the result wu corresponding to this result had an error downloading some input file(s). + report_project_error(*rp,0,"The work_unit corresponding to this result had an error",CLIENT_DOWNLOADING); } } else { rp->wup->ref_cnt++; @@ -774,8 +775,10 @@ bool CLIENT_STATE::garbage_collect() { // will be cleaned up after the server is notified // if(rp->output_files[i].file_info->had_failure(failnum)) { - if (rp->state < RESULT_READY_TO_ACK) { - report_project_error(*rp,failnum,"The outputfile corresponding to this result had an error"); + if (rp->state < RESULT_READY_TO_ACK) { + // had an error uploading a file for this result + rp->client_state = CLIENT_UPLOADING; + report_project_error(*rp,0,"The outputfile corresponding to this result had an error",CLIENT_UPLOADING); } } else { rp->output_files[i].file_info->ref_cnt++; @@ -860,6 +863,7 @@ bool CLIENT_STATE::update_results() { // before moving on if (rp->is_upload_done()) { rp->state = RESULT_READY_TO_ACK; + rp->client_state = CLIENT_DONE; action = true; } break; @@ -952,19 +956,74 @@ void CLIENT_STATE::set_client_state_dirty(char* source) { // Report error back to project, setting result state to finished and backing // off on the project. The error will appear in the stderr_out field of // the result -// -int CLIENT_STATE::report_project_error( RESULT &res,int err_num, char *err_msg ) { +// state is the desired client_state of the result after the call to this function is made. 
+int CLIENT_STATE::report_project_error( RESULT &res,int err_num, char *err_msg , int state) { char total_err[500]; unsigned int i; - FILE_INFO* fip; - - res.state = RESULT_READY_TO_ACK; + int failnum; + + //if this result is already in a state of error, then do nothing + if(res.state == RESULT_ERROR) + { + return 0; + } + + res.state = RESULT_ERROR; scheduler_op->backoff(res.project,""); - sprintf( total_err, "BOINC Core Client: Err %d: %s\n%d\n%d\n%d\n", err_num, err_msg,res.active_task_state,res.exit_status,res.signal ); - + res.client_state = state; + sprintf( total_err, + "%s\n" + "%d\n" + "%d\n" + "%d\n", + err_msg, + res.active_task_state, + res.exit_status, + res.signal ); + if( strlen(res.stderr_out)+strlen(total_err) < STDERR_MAX_LEN ) { + strcat(res.stderr_out, total_err ); + } + + if((res.client_state == CLIENT_COMPUTING) && (err_num)) + { + sprintf(total_err,"%d\n",err_num); + } + + if(res.client_state == CLIENT_DOWNLOADING) + { + for (i=0;iinput_files.size();i++) { + if(res.wup->input_files[i].file_info->had_failure(failnum)) + { + sprintf(total_err, + "\n" + " %s\n" + " %d\n" + "\n" + ,res.wup->input_files[i].file_info->name,failnum); + } + } + } + + if(res.client_state == CLIENT_UPLOADING) + { + for (i=0; ihad_failure(failnum)) { + + sprintf(total_err, + "\n" + " %s\n" + " %d\n" + "\n" + ,res.output_files[i].file_info->name,failnum); + } + } + } + + if(strlen(res.stderr_out)+strlen(total_err) < STDERR_MAX_LEN ) { strcat( res.stderr_out, total_err ); } diff --git a/client/client_state.h b/client/client_state.h index 4a86fe4502..399ff6a16f 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -126,7 +126,20 @@ public: int make_scheduler_request(PROJECT*, double); void handle_scheduler_reply(PROJECT*, char* scheduler_url); void set_client_state_dirty(char*); - int report_project_error( RESULT &res, int err_num, char *err_msg ); + int report_project_error( RESULT &res,int err_num, char *err_msg, int state); + //state corresponds to the 
client_state that the result should be in after the call + //This function gets called anytime there is an error regarding a result. + //It appends all the error information necessary for the backend to stderr_out for res. + //It goes through all input and output files for this result and prints necessary error codes + //In the case of failure to start or restart an Active_task, err_num should be set. + //This function is called in the following situations: + //1.When the active_task could not start or restart, in which case + //err_num should be set to the appropriate error_code. + //2.It is called when we fail in downloading an input file for the work unit of res or uploading + //the output files for res, in which case err_num and err_msg are irrelevant; the function will take care of reporting these + //3.When the active_task exits with a non_zero error code or it gets signaled, relevant info + //is printed to stderr_out of res. + int add_project(char* master_url, char* authenticator); int change_project(int index, char* master_url, char* authenticator); int quit_project(int index); diff --git a/client/client_types.C b/client/client_types.C index a5eedbd541..8a96d3111f 100644 --- a/client/client_types.C +++ b/client/client_types.C @@ -599,6 +599,7 @@ void RESULT::clear() { exit_status = 0; active_task_state = 0; signal = 0; + client_state = CLIENT_UNINITIALIZED; strcpy(stderr_out, ""); app = NULL; wup = NULL; @@ -664,13 +665,15 @@ int RESULT::write(FILE* out, bool to_server) { FILE_INFO* fip; int n; + //if we didn't have an error with this result, (in which case we would have called report_project_error()): + fprintf(out, "\n" " %s\n" - " %d\n" - " %f\n", + " %d\n" + " %f\n", name, - exit_status, + client_state, final_cpu_time ); n = strlen(stderr_out); diff --git a/client/client_types.h b/client/client_types.h index e55702a8fa..16c90eb466 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -1,4 +1,4 @@ -// The contents of this file are subject to 
the Mozilla Public License + // The contents of this file are subject to the Mozilla Public License // Version 1.0 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at // http://www.mozilla.org/MPL/ @@ -206,6 +206,13 @@ struct WORKUNIT { // Received ack from server, can delete result #define RESULT_ERROR 5 +//The following are the states that the client is in according to the result. +#define CLIENT_UNINITIALIZED 0 +#define CLIENT_DOWNLOADING 1 +#define CLIENT_COMPUTING 2 +#define CLIENT_UPLOADING 3 +#define CLIENT_DONE 4 + struct RESULT { char name[256]; char wu_name[256]; @@ -218,6 +225,8 @@ struct RESULT { int signal; //the signal caught by the active_task, makes sense only if active_task_state is PROCESS_SIGNALED int active_task_state; // the state of the active task corresponding to this result char stderr_out[STDERR_MAX_LEN]; + int client_state; //the state of the client according to this result, as defined above. this information is passed back with the scheduler RPC to the scheduler server when there is any error related to the result or when the result is done. 
+ APP* app; WORKUNIT* wup; PROJECT* project; diff --git a/client/cs_apps.C b/client/cs_apps.C index 0d18abec59..869cab6594 100644 --- a/client/cs_apps.C +++ b/client/cs_apps.C @@ -180,12 +180,12 @@ bool CLIENT_STATE::start_apps() { atp->init(rp); retval = active_tasks.insert(atp); //couldn't start process - if(retval) - { + if(retval) { atp->state = PROCESS_COULDNT_START; atp->result->active_task_state = PROCESS_COULDNT_START; - report_project_error(*(atp->result),0,"Couldn't start the app for this result.\n"); - } + + report_project_error(*(atp->result),retval,"Couldn't start the app for this result.\n",CLIENT_COMPUTING); + } action = true; set_client_state_dirty("start_apps"); app_started = time(0); diff --git a/client/pers_file_xfer.h b/client/pers_file_xfer.h index 5271768699..6fe60c1552 100644 --- a/client/pers_file_xfer.h +++ b/client/pers_file_xfer.h @@ -37,7 +37,7 @@ #define PERS_GIVEUP (60*60*24*7*2) // 2 weeks */ #define PERS_RETRY_DELAY_MIN 1 #define PERS_RETRY_DELAY_MAX 30 -#define PERS_GIVEUP 50 +#define PERS_GIVEUP 30 // give up on xfer if this time elapses since last byte xferred class PERS_FILE_XFER {