*** empty log message ***

svn path=/trunk/boinc/; revision=713
This commit is contained in:
Hamid Aghdaee 2002-12-09 22:36:46 +00:00
parent 799a502f0a
commit 54bb40beef
8 changed files with 147 additions and 32 deletions

View File

@ -2573,3 +2573,30 @@ Eric Dec 8, 2002
pers_file_xfer.C
scheduler_op.C
Hamid Dec 9,2002
- added field client_state to result, which is what is reported back to the sched server as the state of the client in regards to this result when the RPC was made
- added some more functionality to report_project_error()
It now adds the following info to stderr_out of the result:
<message>message if any passed</message>
<active_task_state>%d</active_task_state>
<exit_status>%d</exit_status>
<signal>%d</signal>
if download had failures
"<download_error>\
" <file_name>%s</file_name>
" <error_code>%d</error_code>\n"
" </download_error>\n"
if upload had failures
same as above
if couldn't start active task for result (in which case err_num should be set)
"<couldnt_start>%d</couldnt_start>\n"
- The error mechanism right now is such that any failures regarding file_infos (download, upload or whatever) are recorded in the status of the file_infos, and then garbage_collect() calls report_project_error(). However, errors regarding starting and ending active tasks are reported from app.C and cs_apps.C.

View File

@ -258,6 +258,7 @@ int ACTIVE_TASK::start(bool first_time) {
STARTUPINFO startup_info;
char slotdirpath[256];
char cmd_line[512];
int win_error;
memset( &process_info, 0, sizeof( process_info ) );
memset( &startup_info, 0, sizeof( startup_info ) );
@ -283,15 +284,15 @@ int ACTIVE_TASK::start(bool first_time) {
&startup_info,
&process_info
)) {
state = GetLastError();
win_error = GetLastError();
char *errorargs[] = {app_version->app_name,"","","",""};
LPVOID lpMsgBuf;
FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
NULL, state, 0, (LPTSTR)&lpMsgBuf, 0, errorargs);
NULL, win_error, 0, (LPTSTR)&lpMsgBuf, 0, errorargs);
// check for an error; if there is one, set error information for the current result
if(state) {
gstate.report_project_error(*result, state, (LPTSTR)&lpMsgBuf);
if(win_error) {
gstate.report_project_error(*result, win_error, (LPTSTR)&lpMsgBuf);
LocalFree(lpMsgBuf);
return -1;
}
@ -334,6 +335,7 @@ int ACTIVE_TASK::start(bool first_time) {
#endif
state = PROCESS_RUNNING;
result->active_task_state = PROCESS_RUNNING;
result->client_state = CLIENT_COMPUTING;
return 0;
}
@ -406,7 +408,7 @@ bool ACTIVE_TASK_SET::poll() {
if (atp->state == PROCESS_ABORT_PENDING) {
atp->state = PROCESS_ABORTED;
atp->result->active_task_state = PROCESS_ABORTED;
gstate.report_project_error(*(atp->result),0,"process was aborted\n");
gstate.report_project_error(*(atp->result), 0, "process was aborted\n",CLIENT_COMPUTING);
} else {
atp->state = PROCESS_EXITED;
atp->exit_status = exit_code;
@ -415,7 +417,7 @@ bool ACTIVE_TASK_SET::poll() {
//if a nonzero error code, then report it
if(exit_code)
{
gstate.report_project_error(*(atp->result),0,"process exited with a non zero exit code\n");
gstate.report_project_error(*(atp->result),0,"process exited with a non zero exit code\n",CLIENT_COMPUTING);
}
}
CloseHandle(atp->pid_handle);
@ -444,7 +446,7 @@ bool ACTIVE_TASK_SET::poll() {
if (atp->state == PROCESS_ABORT_PENDING) {
atp->state = PROCESS_ABORTED;
atp->result->active_task_state = PROCESS_ABORTED;
gstate.report_project_error(*(atp->result),0,"process was aborted\n");
gstate.report_project_error(*(atp->result),0,"process was aborted\n",CLIENT_COMPUTING);
} else {
if (WIFEXITED(stat)) {
atp->state = PROCESS_EXITED;
@ -455,7 +457,7 @@ bool ACTIVE_TASK_SET::poll() {
//if exit_status != 0, then we don't need to upload the files for the result of this app
if(atp->exit_status)
{
gstate.report_project_error(*(atp->result),0,"process exited with a nonzero exit code\n");
gstate.report_project_error(*(atp->result),0,"process exited with a nonzero exit code\n",CLIENT_COMPUTING);
}
if (log_flags.task_debug) printf("process exited: status %d\n", atp->exit_status);
} else if (WIFSIGNALED(stat)) {
@ -463,11 +465,11 @@ bool ACTIVE_TASK_SET::poll() {
atp->signal = WTERMSIG(stat);
atp->result->signal = atp->signal;
atp->result->active_task_state = PROCESS_WAS_SIGNALED;
gstate.report_project_error(*(atp->result),0,"process was signaled\n");
gstate.report_project_error(*(atp->result),0,"process was signaled\n",CLIENT_COMPUTING);
if (log_flags.task_debug) printf("process was signaled: %d\n", atp->signal);
} else {
atp->state = PROCESS_EXIT_UNKNOWN;
atp->result->state = PROCESS_EXIT_UNKNOWN;
atp->state = PROCESS_EXIT_UNKNOWN;
atp->result->state = PROCESS_EXIT_UNKNOWN;
}
}
@ -643,7 +645,9 @@ int ACTIVE_TASK_SET::restart_tasks() {
if (retval) {
fprintf(stderr, "ACTIVE_TASKS::restart_tasks(); restart failed: %d\n", retval);
atp->result->active_task_state = PROCESS_COULDNT_START;
gstate.report_project_error(*(atp->result),0,"Couldn't restart the app for this result.\n");
atp->result->client_state = CLIENT_COMPUTING;
gstate.report_project_error(*(atp->result),retval,"Couldn't restart the app for this result.\n",CLIENT_COMPUTING);
active_tasks.erase(iter);
} else {
iter++;

View File

@ -762,7 +762,8 @@ bool CLIENT_STATE::garbage_collect() {
if(rp->wup->had_failure(failnum)) {
// If we don't already have an error for this file
if (rp->state < RESULT_READY_TO_ACK) {
report_project_error(*rp,failnum,"The work_unit corresponding to this result had an error");
// the result wu corresponding to this result had an error downloading some input file(s).
report_project_error(*rp,0,"The work_unit corresponding to this result had an error",CLIENT_DOWNLOADING);
}
} else {
rp->wup->ref_cnt++;
@ -774,8 +775,10 @@ bool CLIENT_STATE::garbage_collect() {
// will be cleaned up after the server is notified
//
if(rp->output_files[i].file_info->had_failure(failnum)) {
if (rp->state < RESULT_READY_TO_ACK) {
report_project_error(*rp,failnum,"The outputfile corresponding to this result had an error");
if (rp->state < RESULT_READY_TO_ACK) {
// had an error uploading a file for this result
rp->client_state = CLIENT_UPLOADING;
report_project_error(*rp,0,"The outputfile corresponding to this result had an error",CLIENT_UPLOADING);
}
} else {
rp->output_files[i].file_info->ref_cnt++;
@ -860,6 +863,7 @@ bool CLIENT_STATE::update_results() {
// before moving on
if (rp->is_upload_done()) {
rp->state = RESULT_READY_TO_ACK;
rp->client_state = CLIENT_DONE;
action = true;
}
break;
@ -952,19 +956,74 @@ void CLIENT_STATE::set_client_state_dirty(char* source) {
// Report error back to project, setting result state to finished and backing
// off on the project. The error will appear in the stderr_out field of
// the result
//
int CLIENT_STATE::report_project_error( RESULT &res,int err_num, char *err_msg ) {
// state is the desired client_state of the result after the call to this function is made.
int CLIENT_STATE::report_project_error( RESULT &res,int err_num, char *err_msg , int state) {
char total_err[500];
unsigned int i;
FILE_INFO* fip;
res.state = RESULT_READY_TO_ACK;
int failnum;
//if this result is already in a state of error, then do nothing
if(res.state == RESULT_ERROR)
{
return 0;
}
res.state = RESULT_ERROR;
scheduler_op->backoff(res.project,"");
sprintf( total_err, "BOINC Core Client: Err %d: %s\n<active_task_state>%d</active_task_state>\n<exit_status>%d</exit_status>\n<signal>%d</signal>\n", err_num, err_msg,res.active_task_state,res.exit_status,res.signal );
res.client_state = state;
sprintf( total_err,
"<message>%s</message>\n"
"<active_task_state>%d</active_task_state>\n"
"<exit_status>%d</exit_status>\n"
"<signal>%d</signal>\n",
err_msg,
res.active_task_state,
res.exit_status,
res.signal );
if( strlen(res.stderr_out)+strlen(total_err) < STDERR_MAX_LEN ) {
strcat(res.stderr_out, total_err );
}
if((res.client_state == CLIENT_COMPUTING) && (err_num))
{
sprintf(total_err,"<couldnt_start>%d</couldnt_start>\n",err_num);
}
if(res.client_state == CLIENT_DOWNLOADING)
{
for (i=0;i<res.wup->input_files.size();i++) {
if(res.wup->input_files[i].file_info->had_failure(failnum))
{
sprintf(total_err,
"<download_error>\n"
" <file_name>%s</file_name>\n"
" <error_code>%d</error_code>\n"
"</download_error>\n"
,res.wup->input_files[i].file_info->name,failnum);
}
}
}
if(res.client_state == CLIENT_UPLOADING)
{
for (i=0; i<res.output_files.size(); i++) {
// If one of the file infos had a failure,
if(res.output_files[i].file_info->had_failure(failnum)) {
sprintf(total_err,
"<upload_error>\n"
" <file_name>%s</file_name>\n"
" <error_code>%d</error_code>\n"
"</upload_error>\n"
,res.output_files[i].file_info->name,failnum);
}
}
}
if(strlen(res.stderr_out)+strlen(total_err) < STDERR_MAX_LEN ) {
strcat( res.stderr_out, total_err );
}

View File

@ -126,7 +126,20 @@ public:
int make_scheduler_request(PROJECT*, double);
void handle_scheduler_reply(PROJECT*, char* scheduler_url);
void set_client_state_dirty(char*);
int report_project_error( RESULT &res, int err_num, char *err_msg );
int report_project_error( RESULT &res,int err_num, char *err_msg, int state);
//state corresponds to the client_state that the result should be in after the call
//This function gets called anytime there is an error regarding a result.
//It appends all the error information necessary for the backend to stderr_out for res.
//It goes through all input and output files for this result and prints necessary error codes
//In the case of failure to start or restart an Active_task, err_num should be set.
//This function is called in the following situations:
//1.When the active_task could not start or restart , in which case
//err_num should be set to the appropriate error_code.
//2.It is called when we fail in downloading an input file for the work unit of res or uploading
//the output files for res, in which case err_num and err_msg are irrelevant; the function will take care of reporting these
//3.When the active_task exits with a non-zero error code or it gets signaled, relevant info
//is printed to stderr_out of res.
int add_project(char* master_url, char* authenticator);
int change_project(int index, char* master_url, char* authenticator);
int quit_project(int index);

View File

@ -599,6 +599,7 @@ void RESULT::clear() {
exit_status = 0;
active_task_state = 0;
signal = 0;
client_state = CLIENT_UNINITIALIZED;
strcpy(stderr_out, "");
app = NULL;
wup = NULL;
@ -664,13 +665,15 @@ int RESULT::write(FILE* out, bool to_server) {
FILE_INFO* fip;
int n;
//if we didn't have an error with this result, (in which case we would have called report_project_error():
fprintf(out,
"<result>\n"
" <name>%s</name>\n"
" <exit_status>%d</exit_status>\n"
" <final_cpu_time>%f</final_cpu_time>\n",
" <client_state>%d</client_state>\n"
" <final_cpu_time>%f</final_cpu_time>\n",
name,
exit_status,
client_state,
final_cpu_time
);
n = strlen(stderr_out);

View File

@ -1,4 +1,4 @@
// The contents of this file are subject to the Mozilla Public License
// The contents of this file are subject to the Mozilla Public License
// Version 1.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://www.mozilla.org/MPL/
@ -206,6 +206,13 @@ struct WORKUNIT {
// Received ack from server, can delete result
#define RESULT_ERROR 5
//The following are the states that the client is in according to the result.
#define CLIENT_UNINITIALIZED 0
#define CLIENT_DOWNLOADING 1
#define CLIENT_COMPUTING 2
#define CLIENT_UPLOADING 3
#define CLIENT_DONE 4
struct RESULT {
char name[256];
char wu_name[256];
@ -218,6 +225,8 @@ struct RESULT {
int signal; //the signal caught by the active_task, makes sense only if active_task_state is PROCESS_SIGNALED
int active_task_state; // the state of the active task corresponding to this result
char stderr_out[STDERR_MAX_LEN];
int client_state; //the state of the client according to this result, as defined above. this information is passed back with the scheduler RPC to the scheduler server when there is any error related to the result or when the result is done.
APP* app;
WORKUNIT* wup;
PROJECT* project;

View File

@ -180,12 +180,12 @@ bool CLIENT_STATE::start_apps() {
atp->init(rp);
retval = active_tasks.insert(atp);
//couldn't start process
if(retval)
{
if(retval) {
atp->state = PROCESS_COULDNT_START;
atp->result->active_task_state = PROCESS_COULDNT_START;
report_project_error(*(atp->result),0,"Couldn't start the app for this result.\n");
}
report_project_error(*(atp->result),retval,"Couldn't start the app for this result.\n",CLIENT_COMPUTING);
}
action = true;
set_client_state_dirty("start_apps");
app_started = time(0);

View File

@ -37,7 +37,7 @@
#define PERS_GIVEUP (60*60*24*7*2) // 2 weeks */
#define PERS_RETRY_DELAY_MIN 1
#define PERS_RETRY_DELAY_MAX 30
#define PERS_GIVEUP 50
#define PERS_GIVEUP 30
// give up on xfer if this time elapses since last byte xferred
class PERS_FILE_XFER {