*** empty log message ***

svn path=/trunk/boinc/; revision=4302
This commit is contained in:
David Anderson 2004-10-07 19:18:37 +00:00
parent b02a64d34e
commit 81406ae66d
14 changed files with 159 additions and 158 deletions

View File

@ -18134,8 +18134,41 @@ David 6 Oct 2004
lib/
crypt_prog.C
David 6 Oct 2004
- boinc_resolve_filename() fix bug where link file is empty
(from Markku Degerholm)
lib/
app_ipc.C
David 7 Oct 2004
- return formatted msg in file_upload_handler
sched/
file_upload_handler.C
David 7 Oct 2004
- Housecleaning of result_report_error() and related code:
- removed RESULT::active_task_state field
- removed RESULT::signal field
- removed "errnum" argument from result_report_error()
- return XML text for upload errors
- clean up logic of result_report_error()
- changed FILE_INFO::had_failure() to optionally return error text
- changed RESULT_COMPUTE_DONE to RESULT_COMPUTE_ERROR
- renamed APP_VERSION::had_failure() to had_download_failure()
(same for WORKUNIT)
- schedule_cpus(): if any start or resume fails,
set must_schedule_cpus so we'll try something else
client/
app_control.C
app_start.C
client_state.C,h
client_types.C,h
cs_apps.C
cs_scheduler.C
cs_statefile.C
lib/
prefs.C
result_state.h

View File

@ -199,7 +199,6 @@ bool ACTIVE_TASK::handle_exited_app(unsigned long exit_code) {
result->final_cpu_time = checkpoint_cpu_time;
if (state == PROCESS_ABORT_PENDING) {
state = PROCESS_ABORTED;
result->active_task_state = PROCESS_ABORTED;
} else {
state = PROCESS_EXITED;
exit_status = exit_code;
@ -207,7 +206,7 @@ bool ACTIVE_TASK::handle_exited_app(unsigned long exit_code) {
if (exit_code) {
char szError[1024];
gstate.report_result_error(
*result, 0,
*result,
"%s - exit code %d (0x%x)",
windows_format_error_string(exit_code, szError, sizeof(szError)),
exit_code, exit_code
@ -220,19 +219,14 @@ bool ACTIVE_TASK::handle_exited_app(unsigned long exit_code) {
return true;
}
if (!finish_file_present()) {
#if 0
state = PROCESS_IN_LIMBO;
#else
scheduler_state = CPU_SCHED_PREEMPTED;
state = PROCESS_UNINITIALIZED;
close_process_handles();
#endif
limbo_message(*this);
return true;
}
}
result->exit_status = exit_status;
result->active_task_state = PROCESS_EXITED;
}
if (app_client_shm.shm) {
@ -253,7 +247,6 @@ bool ACTIVE_TASK::handle_exited_app(int stat) {
result->final_cpu_time = checkpoint_cpu_time;
if (state == PROCESS_ABORT_PENDING) {
state = PROCESS_ABORTED;
result->active_task_state = PROCESS_ABORTED;
} else {
if (WIFEXITED(stat)) {
state = PROCESS_EXITED;
@ -261,7 +254,7 @@ bool ACTIVE_TASK::handle_exited_app(int stat) {
if (exit_status) {
gstate.report_result_error(
*result, 0,
*result,
"process exited with code %d (0x%x)",
exit_status, exit_status
);
@ -298,17 +291,16 @@ bool ACTIVE_TASK::handle_exited_app(int stat) {
}
}
result->exit_status = exit_status;
result->active_task_state = PROCESS_EXITED;
scope_messages.printf(
"ACTIVE_TASK::handle_exited_app(): process exited: status %d\n",
exit_status
);
} else if (WIFSIGNALED(stat)) {
int signal = WTERMSIG(stat);
int got_signal = WTERMSIG(stat);
// if the process was externally killed, allow it to restart.
//
switch(signal) {
switch(got_signal) {
case SIGHUP:
case SIGINT:
case SIGQUIT:
@ -322,11 +314,9 @@ bool ACTIVE_TASK::handle_exited_app(int stat) {
exit_status = stat;
result->exit_status = exit_status;
state = PROCESS_WAS_SIGNALED;
signal = signal;
result->signal = signal;
result->active_task_state = PROCESS_WAS_SIGNALED;
signal = got_signal;
gstate.report_result_error(
*result, 0, "process got signal %d", signal
*result, "process got signal %d", signal
);
scope_messages.printf("ACTIVE_TASK::handle_exited_app(): process got signal %d\n", signal);
} else {
@ -547,12 +537,11 @@ bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() {
int ACTIVE_TASK::abort_task(char* msg) {
if (state == PROCESS_EXECUTING || state == PROCESS_SUSPENDED) {
state = PROCESS_ABORT_PENDING;
result->active_task_state = PROCESS_ABORT_PENDING;
kill_task();
} else {
state = PROCESS_ABORTED;
}
gstate.report_result_error(*result, ERR_RSC_LIMIT_EXCEEDED, msg);
gstate.report_result_error(*result, msg);
return 0;
}

View File

@ -225,6 +225,8 @@ static int setup_file(
//
// Current dir is top-level BOINC dir
//
// postcondition: ACTIVE_TASK::state is set correctly
//
int ACTIVE_TASK::start(bool first_time) {
char exec_name[256], file_path[256], buf[256], exec_path[256];
unsigned int i;
@ -364,8 +366,7 @@ int ACTIVE_TASK::start(bool first_time) {
windows_error_string(szError, sizeof(szError));
state = PROCESS_COULDNT_START;
result->active_task_state = PROCESS_COULDNT_START;
gstate.report_result_error(*result, ERR_EXEC, "CreateProcess() failed - %s", szError);
gstate.report_result_error(*result, "CreateProcess() failed - %s", szError);
msg_printf(wup->project, MSG_ERROR, "CreateProcess() failed - %s", szError);
return ERR_EXEC;
}
@ -393,9 +394,8 @@ int ACTIVE_TASK::start(bool first_time) {
pid = fork();
if (pid == -1) {
state = PROCESS_COULDNT_START;
result->active_task_state = PROCESS_COULDNT_START;
gstate.report_result_error(*result, -1, "fork(): %s", strerror(errno));
msg_printf(wup->project, MSG_ERROR, "fork(): %s", strerror(errno));
gstate.report_result_error(*result, "fork() failed: %s", strerror(errno));
msg_printf(wup->project, MSG_ERROR, "fork() failed: %s", strerror(errno));
return ERR_FORK;
}
if (pid == 0) {
@ -438,12 +438,11 @@ int ACTIVE_TASK::start(bool first_time) {
#endif
state = PROCESS_EXECUTING;
result->active_task_state = PROCESS_EXECUTING;
return 0;
}
// Resume the task if it was previously running
// Otherwise, start it
// Resume the task if it was previously running; otherwise start it
// Postcondition: "state" is set correctly
//
int ACTIVE_TASK::resume_or_start() {
char* str = "??";
@ -462,7 +461,10 @@ int ACTIVE_TASK::resume_or_start() {
retval = start(false);
str = "Restarting";
}
if (retval) return retval;
if (retval) {
state = PROCESS_COULDNT_START;
return retval;
}
break;
case PROCESS_SUSPENDED:
retval = unsuspend();
@ -472,6 +474,7 @@ int ACTIVE_TASK::resume_or_start() {
MSG_ERROR,
"ACTIVE_TASK::resume_or_start(): could not unsuspend active_task"
);
state = PROCESS_COULDNT_START;
return retval;
}
str = "Resuming";
@ -515,9 +518,8 @@ int ACTIVE_TASK_SET::restart_tasks(int max_tasks) {
get_slot_dir(atp->slot, atp->slot_dir);
if (!gstate.input_files_available(result)) {
msg_printf(atp->wup->project, MSG_ERROR, "ACTIVE_TASKS::restart_tasks(); missing files\n");
atp->result->active_task_state = PROCESS_COULDNT_START;
gstate.report_result_error(
*(atp->result), ERR_FILE_MISSING,
*(atp->result),
"One or more missing files"
);
iter = active_tasks.erase(iter);
@ -548,9 +550,8 @@ int ACTIVE_TASK_SET::restart_tasks(int max_tasks) {
if (retval) {
msg_printf(atp->wup->project, MSG_ERROR, "ACTIVE_TASKS::restart_tasks(); restart failed: %d\n", retval);
atp->result->active_task_state = PROCESS_COULDNT_START;
gstate.report_result_error(
*(atp->result), retval,
*(atp->result),
"Couldn't restart the app for this result: %d", retval
);
iter = active_tasks.erase(iter);

View File

@ -723,6 +723,7 @@ bool CLIENT_STATE::garbage_collect() {
bool action = false, found;
string error_msgs;
PROJECT* project;
char buf[1024];
SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_STATE);
@ -769,43 +770,42 @@ bool CLIENT_STATE::garbage_collect() {
}
// See if the files for this result's workunit had
// any errors (download failure, MD5, RSA, etc)
// and we don't already have an error for this file
// and we don't already have an error for this result
//
if (!rp->ready_to_report) {
wup = rp->wup;
if (wup->had_failure(failnum)) {
if (wup->had_download_failure(failnum)) {
wup->get_file_errors(error_msgs);
report_result_error(
*rp, 0, "WU download error: %s", error_msgs.c_str()
*rp, "WU download error: %s", error_msgs.c_str()
);
} else if (wup->avp && wup->avp->had_failure(failnum)) {
} else if (wup->avp && wup->avp->had_download_failure(failnum)) {
wup->avp->get_file_errors(error_msgs);
report_result_error(
*rp, 0, "app_version download error: %s", error_msgs.c_str()
*rp, "app_version download error: %s", error_msgs.c_str()
);
}
}
bool found_error = false;
std::string error_str;
for (i=0; i<rp->output_files.size(); i++) {
// If one of the output files had an upload failure,
// mark the result as done and report the error.
// The result, workunits, and file infos
// will be cleaned up after the server is notified
//
if (rp->output_files[i].file_info->had_failure(failnum)) {
if (!rp->ready_to_report) {
// had an error uploading a file for this result
//
switch (failnum) {
case ERR_FILE_TOO_BIG:
report_result_error(*rp, 0, "Output file exceeded size limit");
break;
default:
report_result_error(*rp, 0, "Output file error: %d", failnum);
}
if (!rp->ready_to_report) {
fip = rp->output_files[i].file_info;
if (fip->had_failure(failnum, buf)) {
found_error = true;
error_str += buf;
}
}
rp->output_files[i].file_info->ref_cnt++;
}
if (found_error) {
report_result_error(*rp, error_str.c_str());
}
rp->wup->ref_cnt++;
result_iter++;
}
@ -907,15 +907,11 @@ bool CLIENT_STATE::update_results() {
vector<RESULT*>::iterator result_iter;
bool action = false;
// delete RESULTs that have been finished and reported;
// reference-count files referred to by other results
//
result_iter = results.begin();
while (result_iter != results.end()) {
rp = *result_iter;
// The result has been acked by the scheduling server.
// It will be deleted on the next garbage collection,
// which we trigger by setting action to true
if (rp->got_server_ack) {
action = true;
}
@ -932,21 +928,7 @@ bool CLIENT_STATE::update_results() {
action = true;
}
break;
// app_finished() transitions to either RESULT_COMPUTE_DONE or
// RESULT_FILES_UPLOADING. RESULT_COMPUTE_DONE is a dead-end state
// indicating we had an error at the end of computation.
// case RESULT_FILES_DOWNLOADED:
// break;
// case RESULT_COMPUTE_DONE:
// rp->state = RESULT_FILES_UPLOADING;
// action = true;
// break;
case RESULT_FILES_UPLOADING:
// Once the computation has been done, check that the necessary
// files have been uploaded before moving on
//
if (rp->is_upload_done()) {
rp->ready_to_report = true;
rp->state = RESULT_FILES_UPLOADED;
@ -981,22 +963,12 @@ bool CLIENT_STATE::time_to_exit() {
}
// Call this when a result has a nonrecoverable error.
// Append a description of the error to the stderr_out field of the result.
//
// Go through the input and output files for this result
// and generates error messages for upload/download failures.
//
// This function is called in the following situations:
// 1. When the active_task could not start or restart,
// in which case err_num is set to an OS-specific error_code.
// and err_msg has an OS-supplied string.
// 2. when we fail in downloading an input file or uploading an output file,
// in which case err_num and err_msg are zero.
// 3. When the active_task exits with a non_zero error code
// or it gets signaled.
// - back off on contacting the project's scheduler
// (so we don't crash over and over)
// - Append a description of the error to result.stderr_out
//
int CLIENT_STATE::report_result_error(
RESULT& res, int err_num, const char* format, ...
RESULT& res, const char* format, ...
) {
char buf[MAX_BLOB_LEN], err_msg[MAX_BLOB_LEN];
unsigned int i;
@ -1018,44 +990,39 @@ int CLIENT_STATE::report_result_error(
sprintf(buf, "Unrecoverable error for result %s (%s)", res.name, err_msg);
scheduler_op->backoff(res.project, buf);
sprintf(
buf,
"<message>%s\n</message>\n"
"<active_task_state>%d</active_task_state>\n"
"<signal>%d</signal>\n",
err_msg,
res.active_task_state,
res.signal
);
sprintf( buf, "<message>%s\n</message>\n", err_msg);
res.stderr_out.append(buf);
if ((res.state == RESULT_FILES_DOWNLOADED) && err_num) {
sprintf(buf,"<couldnt_start>%d</couldnt_start>\n", err_num);
res.stderr_out.append(buf);
if (!res.exit_status) {
res.exit_status = ERR_RESULT_START;
}
}
if (res.state == RESULT_NEW) {
for (i=0;i<res.wup->input_files.size();i++) {
if (res.wup->input_files[i].file_info->had_failure(failnum)) {
sprintf(buf,
"<download_error>\n"
" <file_name>%s</file_name>\n"
" <error_code>%d</error_code>\n"
"</download_error>\n",
res.wup->input_files[i].file_info->name, failnum
);
res.stderr_out.append(buf);
}
}
switch(res.state) {
case RESULT_NEW:
case RESULT_FILES_DOWNLOADING:
// called from:
// CLIENT_STATE::garbage_collect()
// if WU or app_version had a download failure
//
if (!res.exit_status) {
res.exit_status = ERR_RESULT_DOWNLOAD;
}
}
break;
if (res.state == RESULT_COMPUTE_DONE) {
case RESULT_FILES_DOWNLOADED:
// called from:
// ACTIVE_TASK::start (if couldn't start app)
// ACTIVE_TASK_SET::restart_tasks (if files missing)
// ACTIVE_TASK_SET::restart_tasks (catch other error returns)
// ACTIVE_TASK::handle_exited_app (on nonzero exit or signal)
// ACTIVE_TASK::abort_task (if exceeded resource limit)
// CLIENT_STATE::schedule_cpus (catch-all for resume/start errors)
//
if (!res.exit_status) {
res.exit_status = ERR_RESULT_START;
}
break;
case RESULT_FILES_UPLOADING:
// called from
// CLIENT_STATE::garbage_collect() if result had an upload error
//
for (i=0; i<res.output_files.size(); i++) {
if (res.output_files[i].file_info->had_failure(failnum)) {
sprintf(buf,
@ -1071,6 +1038,12 @@ int CLIENT_STATE::report_result_error(
if (!res.exit_status) {
res.exit_status = ERR_RESULT_UPLOAD;
}
break;
case RESULT_COMPUTE_ERROR:
break;
case RESULT_FILES_UPLOADED:
msg_printf(res.project, MSG_ERROR, "report_result_error() called unexpectedly");
break;
}
res.stderr_out = res.stderr_out.substr(0,MAX_BLOB_LEN-1);

View File

@ -165,8 +165,7 @@ public:
APP_VERSION* lookup_app_version(APP*, int);
ACTIVE_TASK* lookup_active_task_by_result(RESULT*);
int detach_project(PROJECT*);
int report_result_error(RESULT &res, int err_num, const char *format, ...);
// flag a result as having an error
int report_result_error(RESULT&, const char *format, ...);
int reset_project(PROJECT*);
private:
int link_app(PROJECT*, APP*);

View File

@ -758,9 +758,21 @@ int FILE_INFO::merge_info(FILE_INFO& new_info) {
// Returns true if the file had an unrecoverable error
// (couldn't download, RSA/MD5 check failed, etc)
//
bool FILE_INFO::had_failure(int& failnum) {
bool FILE_INFO::had_failure(int& failnum, char* buf) {
if (status != FILE_NOT_PRESENT && status != FILE_PRESENT) {
failnum = status;
if (buf) {
sprintf(buf,
"<file_xfer_error>\n"
" <file_name>%s</file_name>\n"
" <error_code>%d</error_code>\n"
" <error_message>%s</error_message>\n"
"</file_xfer_error>\n",
name,
status,
error_msg.c_str()
);
}
return true;
}
return false;
@ -821,7 +833,7 @@ int APP_VERSION::write(MIOFILE& out) {
return 0;
}
bool APP_VERSION::had_failure(int& failnum) {
bool APP_VERSION::had_download_failure(int& failnum) {
unsigned int i;
for (i=0; i<app_files.size();i++) {
@ -833,14 +845,16 @@ bool APP_VERSION::had_failure(int& failnum) {
}
void APP_VERSION::get_file_errors(string& str) {
int x;
int errnum;
unsigned int i;
FILE_INFO* fip;
char buf[1024];
str = "couldn't get input files:\n";
for (i=0; i<app_files.size();i++) {
fip = app_files[i].file_info;
if (fip->had_failure(x)) {
str = str + fip->name + ": " + fip->error_msg + "\n";
if (fip->had_failure(errnum, buf)) {
str = str + buf;
}
}
}
@ -973,7 +987,7 @@ int WORKUNIT::write(MIOFILE& out) {
return 0;
}
bool WORKUNIT::had_failure(int& failnum) {
bool WORKUNIT::had_download_failure(int& failnum) {
unsigned int i;
for (i=0;i<input_files.size();i++) {
@ -988,11 +1002,13 @@ void WORKUNIT::get_file_errors(string& str) {
int x;
unsigned int i;
FILE_INFO* fip;
char buf[1024];
str = "couldn't get input files:\n";
for (i=0;i<input_files.size();i++) {
fip = input_files[i].file_info;
if (fip->had_failure(x)) {
str = str + fip->name + ": " + fip->error_msg + "\n";
if (fip->had_failure(x, buf)) {
str = str + buf;
}
}
}
@ -1021,8 +1037,6 @@ void RESULT::clear() {
got_server_ack = false;
final_cpu_time = 0;
exit_status = 0;
active_task_state = 0;
signal = 0;
stderr_out = "";
app = NULL;
wup = NULL;
@ -1188,7 +1202,6 @@ int RESULT::write_gui(MIOFILE& out) {
return 0;
}
// this is called after the result state is RESULT_COMPUTE_DONE.
// Returns true if the result's output files are all either
// successfully uploaded or have unrecoverable errors
//

View File

@ -105,7 +105,7 @@ public:
char* get_next_url(bool);
char* get_current_url(bool);
bool is_correct_url_type(bool, STRING256);
bool had_failure(int& failnum);
bool had_failure(int& failnum, char* buf=0);
bool verify_existing_file();
int merge_info(FILE_INFO&);
int verify_downloaded_file();
@ -259,7 +259,7 @@ struct APP_VERSION {
int parse(MIOFILE&);
int write(MIOFILE&);
bool had_failure(int& failnum);
bool had_download_failure(int& failnum);
void get_file_errors(std::string&);
void clear_errors();
};
@ -284,7 +284,7 @@ struct WORKUNIT {
int parse(MIOFILE&);
int write(MIOFILE&);
bool had_failure(int& failnum);
bool had_download_failure(int& failnum);
void get_file_errors(std::string&);
};
@ -300,17 +300,13 @@ struct RESULT {
bool got_server_ack;
// we've received the ack for this result from the server
double final_cpu_time;
int state; // state of this result, see lib/result_state.h
int state; // state of this result: see lib/result_state.h
int exit_status; // return value from the application
int signal; // the signal caught by the active_task,
// defined only if active_task_state is PROCESS_SIGNALED
int active_task_state; // the state of the active task corresponding to this result
std::string stderr_out;
// the concatenation of:
//
// - if report_result_error() is called for this result:
// <message>x</message>
// <active_task_state>x</active_task_state>
// <exit_status>x</exit_status>
// <signal>x</signal>
// - if called in FILES_DOWNLOADED state:

View File

@ -121,11 +121,8 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) {
}
if (had_error) {
// dead-end state indicating we had an error at end of computation;
// do not move to RESULT_FILES_UPLOADING
rp->state = RESULT_COMPUTE_DONE;
rp->state = RESULT_COMPUTE_ERROR;
} else {
// can now upload files.
rp->state = RESULT_FILES_UPLOADING;
}
PROJECT* p = rp->project;
@ -178,8 +175,9 @@ bool CLIENT_STATE::handle_finished_apps() {
return action;
}
// Returns true if all the input files for a result are available
// locally, false otherwise
// Returns true if all the input files for a result are present
// (both WU and app version)
// false otherwise
//
bool CLIENT_STATE::input_files_available(RESULT* rp) {
WORKUNIT* wup = rp->wup;
@ -308,14 +306,6 @@ bool CLIENT_STATE::schedule_largest_debt_project(double expected_pay_off) {
PROJECT* p = projects[i];
if (!p->next_runnable_result) continue;
if (p->non_cpu_intensive) continue;
if (!input_files_available(projects[i]->next_runnable_result)) {
report_result_error(
*(p->next_runnable_result), ERR_FILE_MISSING,
"One or more missing files"
);
p->next_runnable_result = NULL;
continue;
}
if (first || p->anticipated_debt > best_debt) {
first = false;
best_project = p;
@ -488,12 +478,13 @@ bool CLIENT_STATE::schedule_cpus() {
) {
retval = atp->resume_or_start();
if (retval) {
atp->state = PROCESS_COULDNT_START;
atp->result->active_task_state = PROCESS_COULDNT_START;
report_result_error(
*(atp->result), retval,
"Couldn't start the app for this result: error %d", retval
*(atp->result), "Couldn't start or resume: %d", retval
);
// if we couldn't run something, reschedule
//
must_schedule_cpus = true;
continue;
}
atp->scheduler_state = CPU_SCHED_SCHEDULED;

View File

@ -365,7 +365,7 @@ double CLIENT_STATE::ettprc(PROJECT *p, int k) {
) {
RESULT *rp = *iter;
if (rp->project != p
|| rp->state >= RESULT_COMPUTE_DONE
|| rp->state > RESULT_FILES_DOWNLOADED
|| rp->ready_to_report
) continue;
if (num_results_to_skip > 0) {

View File

@ -145,6 +145,7 @@ int CLIENT_STATE::parse_state_file() {
file_infos.push_back(fip);
// If the file had a failure before,
// don't start another file transfer
//
if (fip->had_failure(failnum)) {
if (fip->pers_file_xfer) {
delete fip->pers_file_xfer;

View File

@ -27,6 +27,7 @@ show_name("Brian Boshes");
show_name("Karl Chen");
show_name("Pietro Cicotti");
show_name("Seth Cooper");
show_name("Markku Degerholm");
show_name("James Drews");
show_name("Michael Gary");
show_name("Gary Gibson");

View File

@ -258,6 +258,9 @@ int boinc_resolve_filename(const char *virtual_name, char *physical_name, int le
fp = boinc_fopen(virtual_name, "r");
if (!fp) return ERR_FOPEN;
// must initialize buf since fgets() on an empty file won't do anything
//
buf[0] = 0;
fgets(buf, 512, fp);
fclose(fp);

View File

@ -260,4 +260,5 @@ int GLOBAL_PREFS::write(FILE* f) {
max_bytes_sec_down
);
return 0;
}
}

View File

@ -25,17 +25,17 @@
// (because of the >= comparison in current_work_buf_days())
//
#define RESULT_NEW 0
// New result, files may still need to be downloaded
// New result
#define RESULT_FILES_DOWNLOADING 1
// Input files for result are being downloaded
// Input files for result (WU, app version) are being downloaded
#define RESULT_FILES_DOWNLOADED 2
// Files are downloaded, result can be computed
#define RESULT_COMPUTE_DONE 3
// Computation is done, if no error then files need to be uploaded
// Files are downloaded, result can be (or is being) computed
#define RESULT_COMPUTE_ERROR 3
// computation failed; no file upload
#define RESULT_FILES_UPLOADING 4
// Output files for result are being uploaded
#define RESULT_FILES_UPLOADED 5
// Files are uploaded, notify scheduling server
// Files are uploaded, notify scheduling server at some point
#endif