mirror of https://github.com/BOINC/boinc.git
parent
d3dba8aa55
commit
727214d3fa
136
client/app.C
136
client/app.C
|
@ -111,7 +111,7 @@ int ACTIVE_TASK::init(RESULT* rp) {
|
|||
// Current dir is top-level BOINC dir
|
||||
//
|
||||
int ACTIVE_TASK::start(bool first_time) {
|
||||
char exec_name[256], file_path[256], link_path[256], temp[256], exec_path[256];
|
||||
char exec_name[256], file_path[256], link_path[256], buf[256], exec_path[256];
|
||||
unsigned int i;
|
||||
FILE_REF file_ref;
|
||||
FILE_INFO* fip;
|
||||
|
@ -156,9 +156,11 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
sprintf(init_data_path, "%s%s%s", slot_dir, PATH_SEPARATOR, INIT_DATA_FILE);
|
||||
f = fopen(init_data_path, "w");
|
||||
if (!f) {
|
||||
if (log_flags.task_debug) {
|
||||
printf("Failed to open core to app prefs file %s.\n", init_data_path);
|
||||
}
|
||||
sprintf(
|
||||
buf, "Failed to open core-to-app prefs file %s.\n",
|
||||
init_data_path
|
||||
);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
return ERR_FOPEN;
|
||||
}
|
||||
retval = write_init_data_file(f, aid);
|
||||
|
@ -169,9 +171,11 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
sprintf(graphics_data_path, "%s%s%s", slot_dir, PATH_SEPARATOR, GRAPHICS_DATA_FILE);
|
||||
f = fopen(graphics_data_path, "w");
|
||||
if (!f) {
|
||||
if (log_flags.task_debug) {
|
||||
printf("Failed to open core to app graphics prefs file %s.\n", graphics_data_path);
|
||||
}
|
||||
sprintf(
|
||||
buf, "Failed to open core-to-app graphics prefs file %s.\n",
|
||||
graphics_data_path
|
||||
);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
return ERR_FOPEN;
|
||||
}
|
||||
retval = write_graphics_file(f, &gi);
|
||||
|
@ -180,9 +184,8 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
sprintf(fd_init_path, "%s%s%s", slot_dir, PATH_SEPARATOR, FD_INIT_FILE);
|
||||
f = fopen(fd_init_path, "w");
|
||||
if (!f) {
|
||||
if(log_flags.task_debug) {
|
||||
printf("Failed to open init file %s.\n", fd_init_path);
|
||||
}
|
||||
sprintf(buf, "Failed to open init file %s.\n", fd_init_path);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
return ERR_FOPEN;
|
||||
}
|
||||
|
||||
|
@ -197,13 +200,14 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
}
|
||||
if (first_time) {
|
||||
sprintf(link_path, "%s%s%s", slot_dir, PATH_SEPARATOR, fip->name);
|
||||
sprintf(temp, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path );
|
||||
retval = boinc_link( temp, link_path);
|
||||
sprintf(buf, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path );
|
||||
retval = boinc_link( buf, link_path);
|
||||
if (log_flags.task_debug) {
|
||||
printf("link %s to %s\n", file_path, link_path);
|
||||
}
|
||||
if (retval) {
|
||||
perror("link");
|
||||
sprintf(buf, "Can't link %s to %s\n", file_path, link_path);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
fclose(f);
|
||||
return retval;
|
||||
}
|
||||
|
@ -218,20 +222,21 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
if (strlen(file_ref.open_name)) {
|
||||
if (first_time) {
|
||||
sprintf(link_path, "%s%s%s", slot_dir, PATH_SEPARATOR, file_ref.open_name);
|
||||
sprintf(temp, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path );
|
||||
sprintf(buf, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path );
|
||||
if (log_flags.task_debug) {
|
||||
printf("link %s to %s\n", file_path, link_path);
|
||||
}
|
||||
retval = boinc_link(temp, link_path);
|
||||
retval = boinc_link(buf, link_path);
|
||||
if (retval) {
|
||||
perror("link");
|
||||
sprintf(buf, "Can't link %s to %s\n", file_path, link_path);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
fclose(f);
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sprintf(temp, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path);
|
||||
retval = write_fd_init_file(f, temp, file_ref.fd, 1);
|
||||
sprintf(buf, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path);
|
||||
retval = write_fd_init_file(f, buf, file_ref.fd, 1);
|
||||
if (retval) return retval;
|
||||
}
|
||||
}
|
||||
|
@ -243,24 +248,22 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
get_pathname(file_ref.file_info, file_path);
|
||||
if (strlen(file_ref.open_name)) {
|
||||
if (first_time) {
|
||||
// the following is a relic of using hard links. not needed
|
||||
//int fd = creat(file_path, 0660);
|
||||
//close(fd);
|
||||
sprintf(link_path, "%s%s%s", slot_dir, PATH_SEPARATOR, file_ref.open_name);
|
||||
sprintf(temp, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path );
|
||||
sprintf(buf, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path );
|
||||
if (log_flags.task_debug) {
|
||||
printf("link %s to %s\n", file_path, link_path);
|
||||
}
|
||||
retval = boinc_link(temp, link_path);
|
||||
retval = boinc_link(buf, link_path);
|
||||
if (retval) {
|
||||
sprintf(buf, "Can't link %s to %s\n", file_path, link_path);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
fclose(f);
|
||||
perror("link");
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
sprintf(temp, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path);
|
||||
retval = write_fd_init_file(f, temp, file_ref.fd, 0);
|
||||
sprintf(buf, "..%s..%s%s", PATH_SEPARATOR, PATH_SEPARATOR, file_path);
|
||||
retval = write_fd_init_file(f, buf, file_ref.fd, 0);
|
||||
if (retval) return retval;
|
||||
}
|
||||
}
|
||||
|
@ -284,10 +287,7 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
|
||||
// NOTE: in Windows, stderr is redirected within boinc_init();
|
||||
|
||||
sprintf( cmd_line, "%s %s", exec_path, wup->command_line );
|
||||
// Need to condense argv into a single string
|
||||
//if (log_flags.task_debug) print_argv(argv);
|
||||
//
|
||||
sprintf(cmd_line, "%s %s", exec_path, wup->command_line);
|
||||
full_path(slot_dir, slotdirpath);
|
||||
if (!CreateProcess(exec_path,
|
||||
cmd_line,
|
||||
|
@ -307,13 +307,13 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
NULL, win_error, 0, (LPTSTR)&lpMsgBuf, 0, errorargs
|
||||
);
|
||||
|
||||
// check for an error; if there is one, set error information for the current result
|
||||
if (win_error) {
|
||||
gstate.report_project_error(*result, win_error, (LPTSTR)&lpMsgBuf);
|
||||
gstate.report_result_error(*result, win_error, (LPTSTR)&lpMsgBuf);
|
||||
LocalFree(lpMsgBuf);
|
||||
return -1;
|
||||
}
|
||||
fprintf(stdout, "CreateProcess: %s\n", (LPCTSTR)lpMsgBuf);
|
||||
sprintf(buf, "CreateProcess: %s\n", (LPCTSTR)lpMsgBuf);
|
||||
show_message(wup->project, buf, MSG_ERROR);
|
||||
LocalFree(lpMsgBuf);
|
||||
}
|
||||
pid = process_info.dwProcessId;
|
||||
|
@ -343,8 +343,8 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
argv[0] = exec_name;
|
||||
parse_command_line(wup->command_line, argv+1);
|
||||
if (log_flags.task_debug) print_argv(argv);
|
||||
boinc_resolve_filename(exec_name, temp, sizeof(temp));
|
||||
retval = execv(temp, argv);
|
||||
boinc_resolve_filename(exec_name, buf, sizeof(buf));
|
||||
retval = execv(buf, argv);
|
||||
fprintf(stderr, "execv failed: %d\n", retval);
|
||||
perror("execv");
|
||||
exit(1);
|
||||
|
@ -423,8 +423,12 @@ bool ACTIVE_TASK_SET::poll() {
|
|||
for (int i=0; i<active_tasks.size(); i++) {
|
||||
atp = active_tasks[i];
|
||||
if (GetExitCodeProcess(atp->pid_handle, &exit_code)) {
|
||||
//
|
||||
// Get the elapsed CPU time
|
||||
if (GetProcessTimes(atp->pid_handle, &creation_time, &exit_time, &kernel_time, &user_time)) {
|
||||
if (GetProcessTimes(
|
||||
atp->pid_handle, &creation_time, &exit_time,
|
||||
&kernel_time, &user_time
|
||||
)) {
|
||||
tKernel.LowPart = kernel_time.dwLowDateTime;
|
||||
tKernel.HighPart = kernel_time.dwHighDateTime;
|
||||
tUser.LowPart = user_time.dwLowDateTime;
|
||||
|
@ -439,7 +443,7 @@ bool ACTIVE_TASK_SET::poll() {
|
|||
if (atp->state == PROCESS_ABORT_PENDING) {
|
||||
atp->state = PROCESS_ABORTED;
|
||||
atp->result->active_task_state = PROCESS_ABORTED;
|
||||
gstate.report_project_error(
|
||||
gstate.report_result_error(
|
||||
*(atp->result), 0, "process was aborted\n"
|
||||
);
|
||||
} else {
|
||||
|
@ -449,7 +453,7 @@ bool ACTIVE_TASK_SET::poll() {
|
|||
atp->result->active_task_state = PROCESS_EXITED;
|
||||
//if a nonzero error code, then report it
|
||||
if (exit_code) {
|
||||
gstate.report_project_error(
|
||||
gstate.report_result_error(
|
||||
*(atp->result), 0,
|
||||
"process exited with a non zero exit code\n"
|
||||
);
|
||||
|
@ -482,7 +486,7 @@ bool ACTIVE_TASK_SET::poll() {
|
|||
if (atp->state == PROCESS_ABORT_PENDING) {
|
||||
atp->state = PROCESS_ABORTED;
|
||||
atp->result->active_task_state = PROCESS_ABORTED;
|
||||
gstate.report_project_error(
|
||||
gstate.report_result_error(
|
||||
*(atp->result), 0, "process was aborted\n"
|
||||
);
|
||||
} else {
|
||||
|
@ -491,24 +495,30 @@ bool ACTIVE_TASK_SET::poll() {
|
|||
atp->exit_status = WEXITSTATUS(stat);
|
||||
atp->result->exit_status = atp->exit_status;
|
||||
atp->result->active_task_state = PROCESS_EXITED;
|
||||
// If exit_status != 0, then we don't need to upload the
|
||||
// files for the result of this app
|
||||
|
||||
// If exit_status is nonzero, then we don't need to upload the
|
||||
// output files
|
||||
//
|
||||
if(atp->exit_status) {
|
||||
gstate.report_project_error(
|
||||
gstate.report_result_error(
|
||||
*(atp->result), 0,
|
||||
"process exited with a nonzero exit code\n"
|
||||
);
|
||||
}
|
||||
if (log_flags.task_debug) printf("process exited: status %d\n", atp->exit_status);
|
||||
if (log_flags.task_debug) {
|
||||
printf("process exited: status %d\n", atp->exit_status);
|
||||
}
|
||||
} else if (WIFSIGNALED(stat)) {
|
||||
atp->state = PROCESS_WAS_SIGNALED;
|
||||
atp->signal = WTERMSIG(stat);
|
||||
atp->result->signal = atp->signal;
|
||||
atp->result->active_task_state = PROCESS_WAS_SIGNALED;
|
||||
gstate.report_project_error(
|
||||
gstate.report_result_error(
|
||||
*(atp->result), 0, "process was signaled\n"
|
||||
);
|
||||
if (log_flags.task_debug) printf("process was signaled: %d\n", atp->signal);
|
||||
if (log_flags.task_debug) {
|
||||
printf("process was signaled: %d\n", atp->signal);
|
||||
}
|
||||
} else {
|
||||
atp->state = PROCESS_EXIT_UNKNOWN;
|
||||
atp->result->state = PROCESS_EXIT_UNKNOWN;
|
||||
|
@ -562,15 +572,15 @@ bool ACTIVE_TASK::read_stderr_file() {
|
|||
// Wait up to wait_time seconds for all processes in this set to exit
|
||||
//
|
||||
int ACTIVE_TASK_SET::wait_for_exit(double wait_time) {
|
||||
bool all_exited;
|
||||
unsigned int i,n;
|
||||
ACTIVE_TASK *atp;
|
||||
bool all_exited;
|
||||
unsigned int i,n;
|
||||
ACTIVE_TASK *atp;
|
||||
|
||||
for( i=0;i<10;i++ ) {
|
||||
for (i=0; i<10; i++) {
|
||||
boinc_sleep(wait_time/10.0);
|
||||
all_exited = true;
|
||||
|
||||
for (n=0;n<active_tasks.size();n++) {
|
||||
for (n=0; n<active_tasks.size(); n++) {
|
||||
atp = active_tasks[n];
|
||||
if (!atp->task_exited()) {
|
||||
all_exited = false;
|
||||
|
@ -606,8 +616,12 @@ void ACTIVE_TASK_SET::suspend_all() {
|
|||
ACTIVE_TASK* atp;
|
||||
for (i=0; i<active_tasks.size(); i++) {
|
||||
atp = active_tasks[i];
|
||||
if(atp->suspend()) {
|
||||
fprintf(stderr, "ACTIVE_TASK_SET::suspend_all(): could not suspend active_task\n");
|
||||
if (atp->suspend()) {
|
||||
show_message(
|
||||
atp->wup->project,
|
||||
"ACTIVE_TASK_SET::suspend_all(): could not suspend active_task\n",
|
||||
MSG_ERROR
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -619,8 +633,12 @@ void ACTIVE_TASK_SET::unsuspend_all() {
|
|||
ACTIVE_TASK* atp;
|
||||
for (i=0; i<active_tasks.size(); i++) {
|
||||
atp = active_tasks[i];
|
||||
if(atp->unsuspend()) {
|
||||
fprintf(stderr, "ACTIVE_TASK_SET::unsuspend_all(): could not unsuspend active_task\n");
|
||||
if (atp->unsuspend()) {
|
||||
show_message(
|
||||
atp->wup->project,
|
||||
"ACTIVE_TASK_SET::unsuspend_all(): could not unsuspend active_task\n",
|
||||
MSG_ERROR
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -633,7 +651,10 @@ void ACTIVE_TASK_SET::request_tasks_exit() {
|
|||
for (i=0; i<active_tasks.size(); i++) {
|
||||
atp = active_tasks[i];
|
||||
if(atp->request_exit()) {
|
||||
fprintf(stderr, "ACTIVE_TASK_SET::request_tasks_exit(): could not exit active_task\n");
|
||||
show_message(atp->wup->project,
|
||||
"ACTIVE_TASK_SET::exit_tasks(): could not suspend active_task\n",
|
||||
MSG_ERROR
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -705,13 +726,14 @@ int ACTIVE_TASK_SET::restart_tasks() {
|
|||
atp->result->is_active = true;
|
||||
retval = atp->start(false);
|
||||
if (log_flags.task) {
|
||||
printf("restarting application for result %s\n", atp->result->name);
|
||||
sprintf(buf, "restarting computation for result %s\n", atp->result->name);
|
||||
show_message(atp->wup->project, buf, MSG_INFO);
|
||||
}
|
||||
if (retval) {
|
||||
sprintf(buf, "ACTIVE_TASKS::restart_tasks(); restart failed: %d\n", retval);
|
||||
show_message(atp->wup->project, buf, MSG_ERROR);
|
||||
atp->result->active_task_state = PROCESS_COULDNT_START;
|
||||
gstate.report_project_error(
|
||||
gstate.report_result_error(
|
||||
*(atp->result), retval,
|
||||
"Couldn't restart the app for this result.\n"
|
||||
);
|
||||
|
|
|
@ -1006,16 +1006,13 @@ bool CLIENT_STATE::garbage_collect() {
|
|||
// See if the files for this result's workunit had
|
||||
// any errors (MD5, RSA, etc)
|
||||
//
|
||||
if(rp->wup->had_failure(failnum)) {
|
||||
if (rp->wup->had_failure(failnum)) {
|
||||
// If we don't already have an error for this file
|
||||
if (!rp->ready_to_ack) {
|
||||
// the wu corresponding to this result
|
||||
// had an error downloading some input file(s).
|
||||
//
|
||||
report_project_error(
|
||||
*rp,0,
|
||||
"The work_unit corresponding to this result had an error"
|
||||
);
|
||||
report_result_error(*rp, 0, "Couldn't get input files");
|
||||
}
|
||||
} else {
|
||||
rp->wup->ref_cnt++;
|
||||
|
@ -1030,9 +1027,7 @@ bool CLIENT_STATE::garbage_collect() {
|
|||
if (!rp->ready_to_ack) {
|
||||
// had an error uploading a file for this result
|
||||
//
|
||||
report_project_error(*rp,0,
|
||||
"An output file of this result had an error"
|
||||
);
|
||||
report_result_error(*rp, 0, "Couldn't upload files");
|
||||
}
|
||||
} else {
|
||||
rp->output_files[i].file_info->ref_cnt++;
|
||||
|
@ -1277,37 +1272,39 @@ void CLIENT_STATE::set_client_state_dirty(char* source) {
|
|||
}
|
||||
|
||||
// Call this when a result has a nonrecoverable error.
|
||||
// The error will appear in the stderr_out field of the result.
|
||||
// Append a description of the error to the stderr_out field of the result.
|
||||
//
|
||||
// It goes through the input and output files for this result
|
||||
// Go through the input and output files for this result
|
||||
// and generates error messages for upload/download failures.
|
||||
//
|
||||
// This function is called in the following situations:
|
||||
// 1. When the active_task could not start or restart,
|
||||
// in which case err_num is set to the an OS-specific error_code.
|
||||
// in which case err_num is set to an OS-specific error_code.
|
||||
// and err_msg has an OS-supplied string.
|
||||
// 2. when we fail in downloading an input file or uploading an output file,
|
||||
// in which case err_num is zero and err_msg is a short description of the failure.
|
||||
// 3. When the active_task exits with a nonzero exit code
|
||||
// or it gets signaled, relevant info is printed to stderr_out of res.
|
||||
// or it gets signaled.
|
||||
//
|
||||
int CLIENT_STATE::report_project_error(
|
||||
int CLIENT_STATE::report_result_error(
|
||||
RESULT& res, int err_num, char *err_msg
|
||||
) {
|
||||
char total_err[MAX_BLOB_LEN];
|
||||
char buf[MAX_BLOB_LEN];
|
||||
unsigned int i;
|
||||
int failnum;
|
||||
|
||||
// only do this once per result
|
||||
//
|
||||
if (res.ready_to_ack) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
res.ready_to_ack = true;
|
||||
|
||||
scheduler_op->backoff(res.project, "");
|
||||
scheduler_op->backoff(res.project, "Backing off because a result failed");
|
||||
|
||||
sprintf(
|
||||
total_err,
|
||||
buf,
|
||||
"<message>%s</message>\n"
|
||||
"<active_task_state>%d</active_task_state>\n"
|
||||
"<exit_status>%d</exit_status>\n"
|
||||
|
@ -1318,29 +1315,29 @@ int CLIENT_STATE::report_project_error(
|
|||
res.signal
|
||||
);
|
||||
|
||||
if (strlen(res.stderr_out)+strlen(total_err) < MAX_BLOB_LEN) {
|
||||
strcat(res.stderr_out, total_err );
|
||||
if (strlen(res.stderr_out) + strlen(buf) < MAX_BLOB_LEN) {
|
||||
strcat(res.stderr_out, buf );
|
||||
}
|
||||
|
||||
if ((res.state == RESULT_FILES_DOWNLOADED) && err_num) {
|
||||
sprintf(total_err,"<couldnt_start>%d</couldnt_start>\n", err_num);
|
||||
if (strlen(res.stderr_out)+strlen(total_err) < MAX_BLOB_LEN) {
|
||||
strcat(res.stderr_out, total_err );
|
||||
sprintf(buf,"<couldnt_start>%d</couldnt_start>\n", err_num);
|
||||
if (strlen(res.stderr_out) + strlen(buf) < MAX_BLOB_LEN) {
|
||||
strcat(res.stderr_out, buf );
|
||||
}
|
||||
}
|
||||
|
||||
if (res.state == RESULT_NEW) {
|
||||
for (i=0;i<res.wup->input_files.size();i++) {
|
||||
if (res.wup->input_files[i].file_info->had_failure(failnum)) {
|
||||
sprintf(total_err,
|
||||
sprintf(buf,
|
||||
"<download_error>\n"
|
||||
" <file_name>%s</file_name>\n"
|
||||
" <error_code>%d</error_code>\n"
|
||||
"</download_error>\n",
|
||||
res.wup->input_files[i].file_info->name, failnum
|
||||
);
|
||||
if (strlen(res.stderr_out)+strlen(total_err) < MAX_BLOB_LEN ) {
|
||||
strcat( res.stderr_out, total_err );
|
||||
if (strlen(res.stderr_out) + strlen(buf) < MAX_BLOB_LEN ) {
|
||||
strcat( res.stderr_out, buf );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1349,15 +1346,15 @@ int CLIENT_STATE::report_project_error(
|
|||
if (res.state == RESULT_COMPUTE_DONE) {
|
||||
for (i=0; i<res.output_files.size(); i++) {
|
||||
if (res.output_files[i].file_info->had_failure(failnum)) {
|
||||
sprintf(total_err,
|
||||
sprintf(buf,
|
||||
"<upload_error>\n"
|
||||
" <file_name>%s</file_name>\n"
|
||||
" <error_code>%d</error_code>\n"
|
||||
"</upload_error>\n",
|
||||
res.output_files[i].file_info->name, failnum
|
||||
);
|
||||
if (strlen(res.stderr_out)+strlen(total_err) < MAX_BLOB_LEN ) {
|
||||
strcat( res.stderr_out, total_err );
|
||||
if (strlen(res.stderr_out) + strlen(buf) < MAX_BLOB_LEN ) {
|
||||
strcat( res.stderr_out, buf );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -156,7 +156,7 @@ public:
|
|||
int make_scheduler_request(PROJECT*, double);
|
||||
int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults);
|
||||
void set_client_state_dirty(char*);
|
||||
int report_project_error( RESULT &res,int err_num, char *err_msg);
|
||||
int report_result_error(RESULT &res, int err_num, char *err_msg);
|
||||
// flag a result as having an error
|
||||
int add_project(char* master_url, char* authenticator);
|
||||
int change_project(int index, char* master_url, char* authenticator);
|
||||
|
|
|
@ -712,9 +712,6 @@ int RESULT::write(FILE* out, bool to_server) {
|
|||
FILE_INFO* fip;
|
||||
int n, retval;
|
||||
|
||||
// If we didn't have an error with this result, (in which case
|
||||
// we would have called report_project_error():
|
||||
|
||||
fprintf(out,
|
||||
"<result>\n"
|
||||
" <name>%s</name>\n"
|
||||
|
|
|
@ -112,10 +112,13 @@ bool CLIENT_STATE::handle_running_apps() {
|
|||
unsigned int i;
|
||||
ACTIVE_TASK* atp;
|
||||
bool action = false;
|
||||
char buf[256];
|
||||
|
||||
for (i=0; i<active_tasks.active_tasks.size(); i++) {
|
||||
atp = active_tasks.active_tasks[i];
|
||||
if (atp->state != PROCESS_RUNNING) {
|
||||
sprintf(buf, "computation for result %s finished\n", atp->wup->name);
|
||||
show_message(atp->wup->project, buf, MSG_INFO);
|
||||
if (log_flags.task_debug) {
|
||||
printf(
|
||||
"task finished; pid %d, status %d\n",
|
||||
|
@ -160,8 +163,8 @@ bool CLIENT_STATE::start_apps() {
|
|||
RESULT* rp;
|
||||
ACTIVE_TASK* atp;
|
||||
bool action = false;
|
||||
int open_slot;
|
||||
int retval;
|
||||
int open_slot, retval;
|
||||
char buf[256];
|
||||
|
||||
for (i=0; i<results.size(); i++) {
|
||||
|
||||
|
@ -179,8 +182,9 @@ bool CLIENT_STATE::start_apps() {
|
|||
// 3) all the input files for the result are locally available
|
||||
//
|
||||
if (rp->state == RESULT_FILES_DOWNLOADED && !rp->is_active ) {
|
||||
if (log_flags.task_debug) {
|
||||
printf("starting application for result %s\n", rp->name);
|
||||
if (log_flags.task) {
|
||||
sprintf(buf, "starting computation for result %s\n", rp->name);
|
||||
show_message(rp->project, buf, MSG_INFO);
|
||||
}
|
||||
rp->is_active = true;
|
||||
atp = new ACTIVE_TASK;
|
||||
|
@ -193,7 +197,7 @@ bool CLIENT_STATE::start_apps() {
|
|||
if (retval) {
|
||||
atp->state = PROCESS_COULDNT_START;
|
||||
atp->result->active_task_state = PROCESS_COULDNT_START;
|
||||
report_project_error(
|
||||
report_result_error(
|
||||
*(atp->result), retval,
|
||||
"Couldn't start the app for this result.\n"
|
||||
);
|
||||
|
|
|
@ -44,7 +44,6 @@ int FILE_XFER::init_download(FILE_INFO& file_info) {
|
|||
|
||||
fip = &file_info;
|
||||
get_pathname(fip, pathname);
|
||||
// Check the current file size
|
||||
if (file_size(pathname, f_size)) {
|
||||
f_size = 0;
|
||||
}
|
||||
|
@ -59,8 +58,7 @@ int FILE_XFER::init_download(FILE_INFO& file_info) {
|
|||
int FILE_XFER::init_upload(FILE_INFO& file_info) {
|
||||
// If upload_offset < 0, we need to query the upload handler
|
||||
// for the offset information
|
||||
// TODO: give priority to URL of unfinished upload if there
|
||||
// are multiple choices
|
||||
// TODO: give priority to unfinished upload if there are multiple choices
|
||||
//
|
||||
fip = &file_info;
|
||||
get_pathname(fip, pathname);
|
||||
|
|
|
@ -30,9 +30,22 @@
|
|||
LOG_FLAGS log_flags;
|
||||
|
||||
// Give every logging flag a defined default:
// informational categories start enabled, debugging categories start disabled.
// The tag handlers visible in LOG_FLAGS::parse() only ever turn a flag on,
// so these defaults apply to any flag the parsed file does not mention.
//
LOG_FLAGS::LOG_FLAGS() {
    // informational output is on by default
    //
    task = true;
    file_xfer = true;
    sched_ops = true;

    // debugging output is off by default
    //
    state_debug = false;
    task_debug = false;
    file_xfer_debug = false;
    sched_op_debug = false;
    http_debug = false;
    time_debug = false;
    net_xfer_debug = false;
    measurement_debug = false;

    // parse() sets poll_debug on a <poll_debug/> tag, so it needs a default too;
    // without this a non-static LOG_FLAGS object would read it uninitialized.
    // NOTE(review): assumes poll_debug is a LOG_FLAGS member like the flags
    // above - confirm against the class declaration.
    poll_debug = false;
}
|
||||
|
||||
|
@ -48,48 +61,37 @@ int LOG_FLAGS::parse(FILE* in) {
|
|||
else if (match_tag(buf, "<task/>")) {
|
||||
task = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<file_xfer/>")) {
|
||||
} else if (match_tag(buf, "<file_xfer/>")) {
|
||||
file_xfer = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<sched_ops/>")) {
|
||||
} else if (match_tag(buf, "<sched_ops/>")) {
|
||||
sched_ops = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<state_debug/>")) {
|
||||
} else if (match_tag(buf, "<state_debug/>")) {
|
||||
state_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<task_debug/>")) {
|
||||
} else if (match_tag(buf, "<task_debug/>")) {
|
||||
task_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<file_xfer_debug/>")) {
|
||||
} else if (match_tag(buf, "<file_xfer_debug/>")) {
|
||||
file_xfer_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<sched_op_debug/>")) {
|
||||
} else if (match_tag(buf, "<sched_op_debug/>")) {
|
||||
sched_op_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<http_debug/>")) {
|
||||
} else if (match_tag(buf, "<http_debug/>")) {
|
||||
http_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<time_debug/>")) {
|
||||
} else if (match_tag(buf, "<time_debug/>")) {
|
||||
time_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<net_xfer_debug/>")) {
|
||||
} else if (match_tag(buf, "<net_xfer_debug/>")) {
|
||||
net_xfer_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<measurement_debug/>")) {
|
||||
} else if (match_tag(buf, "<measurement_debug/>")) {
|
||||
measurement_debug = true;
|
||||
continue;
|
||||
}
|
||||
else if (match_tag(buf, "<poll_debug/>")) {
|
||||
} else if (match_tag(buf, "<poll_debug/>")) {
|
||||
poll_debug = true;
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -65,6 +65,7 @@ bool PERS_FILE_XFER::start_xfer() {
|
|||
int retval;
|
||||
struct tm *newtime;
|
||||
time_t now;
|
||||
char buf[256];
|
||||
|
||||
now = time(0);
|
||||
|
||||
|
@ -80,7 +81,10 @@ bool PERS_FILE_XFER::start_xfer() {
|
|||
file_xfer = new FILE_XFER;
|
||||
if (gstate.use_http_proxy) {
|
||||
file_xfer->use_http_proxy = true;
|
||||
safe_strncpy(file_xfer->proxy_server_name, gstate.proxy_server_name, sizeof(file_xfer->proxy_server_name));
|
||||
safe_strncpy(
|
||||
file_xfer->proxy_server_name, gstate.proxy_server_name,
|
||||
sizeof(file_xfer->proxy_server_name)
|
||||
);
|
||||
file_xfer->proxy_server_port = gstate.proxy_server_port;
|
||||
}
|
||||
if (is_upload) {
|
||||
|
@ -89,16 +93,18 @@ bool PERS_FILE_XFER::start_xfer() {
|
|||
retval = file_xfer->init_download(*fip);
|
||||
}
|
||||
if (retval) {
|
||||
fprintf(
|
||||
stderr, "couldn't start %s for %s: error %d\n",
|
||||
sprintf(buf,
|
||||
"couldn't start %s for %s: error %d\n",
|
||||
(is_upload ? "upload" : "download"), fip->get_url(), retval
|
||||
);
|
||||
show_message(fip->project, buf, MSG_ERROR);
|
||||
// TODO: do we need to do anything here?
|
||||
} else {
|
||||
retval = gstate.file_xfers->insert(file_xfer);
|
||||
fxp = file_xfer;
|
||||
if (retval) {
|
||||
if (log_flags.file_xfer) {
|
||||
printf( "file_xfer insert failed\n" );
|
||||
show_message(fip->project, "file_xfer insert failed\n", MSG_ERROR);
|
||||
}
|
||||
fxp->file_xfer_retval = retval;
|
||||
handle_xfer_failure(now);
|
||||
|
@ -108,11 +114,12 @@ bool PERS_FILE_XFER::start_xfer() {
|
|||
if (log_flags.file_xfer) {
|
||||
now = time(0);
|
||||
newtime = localtime(&now);
|
||||
printf(
|
||||
sprintf(buf,
|
||||
"started %s of %s to %s at time: %s\n",
|
||||
(is_upload ? "upload" : "download"), fip->name, fip->get_url(),
|
||||
asctime(newtime)
|
||||
);
|
||||
show_message(fip->project, buf, MSG_INFO);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -125,7 +132,7 @@ bool PERS_FILE_XFER::start_xfer() {
|
|||
//
|
||||
bool PERS_FILE_XFER::poll(unsigned int now) {
|
||||
int retval;
|
||||
char pathname[256];
|
||||
char pathname[256], buf[256];
|
||||
|
||||
if (xfer_done) {
|
||||
return false;
|
||||
|
@ -151,10 +158,12 @@ bool PERS_FILE_XFER::poll(unsigned int now) {
|
|||
|
||||
if (fxp->file_xfer_done) {
|
||||
if (log_flags.file_xfer) {
|
||||
printf(
|
||||
"file transfer done for %s; retval %d\n",
|
||||
sprintf(
|
||||
buf,
|
||||
"file transfer done for %s; error code %d\n",
|
||||
fip->get_url(), fxp->file_xfer_retval
|
||||
);
|
||||
show_message(fip->project, buf, MSG_INFO);
|
||||
}
|
||||
if (fxp->file_xfer_retval == 0) {
|
||||
// The transfer finished with no errors.
|
||||
|
@ -174,11 +183,13 @@ bool PERS_FILE_XFER::poll(unsigned int now) {
|
|||
get_pathname(fip, pathname);
|
||||
retval = verify_downloaded_file(pathname, *fip);
|
||||
if (retval) {
|
||||
printf("checksum or signature error for %s\n", fip->name);
|
||||
sprintf(buf, "checksum or signature error for %s\n", fip->name);
|
||||
show_message(fip->project, buf, MSG_ERROR);
|
||||
fip->status = retval;
|
||||
} else {
|
||||
if (log_flags.file_xfer_debug) {
|
||||
printf("MD5 checksum validated for %s\n", pathname);
|
||||
sprintf(buf, "MD5 checksum validated for %s\n", pathname);
|
||||
show_message(fip->project, buf, MSG_INFO);
|
||||
}
|
||||
// Set the appropriate permissions depending on whether
|
||||
// it's an executable or normal file
|
||||
|
@ -206,12 +217,15 @@ bool PERS_FILE_XFER::poll(unsigned int now) {
|
|||
// Handle a transfer failure
|
||||
//
|
||||
void PERS_FILE_XFER::handle_xfer_failure(unsigned int cur_time) {
|
||||
char buf[256];
|
||||
|
||||
// If it was a bad range request, delete the file and start over
|
||||
//
|
||||
if (fxp->file_xfer_retval == HTTP_STATUS_RANGE_REQUEST_ERROR) {
|
||||
fip->delete_file();
|
||||
}
|
||||
|
||||
retry_and_backoff(cur_time);
|
||||
retry_or_backoff(cur_time);
|
||||
|
||||
// See if it's time to give up on the persistent file xfer
|
||||
//
|
||||
|
@ -222,19 +236,19 @@ void PERS_FILE_XFER::handle_xfer_failure(unsigned int cur_time) {
|
|||
fip->status = ERR_GIVEUP_DOWNLOAD;
|
||||
}
|
||||
xfer_done = true;
|
||||
}
|
||||
if (log_flags.file_xfer_debug) {
|
||||
printf("Error: transfer failure for %s: %d\n", fip->name, fip->status);
|
||||
sprintf(buf, "Giving up on file transfer for %s: %d\n", fip->name, fip->status);
|
||||
show_message(fip->project, buf, MSG_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
// Cycle to the next URL, or if we've hit all URLs in this cycle,
|
||||
// backoff and try again later
|
||||
//
|
||||
void PERS_FILE_XFER::retry_and_backoff(unsigned int cur_time) {
|
||||
void PERS_FILE_XFER::retry_or_backoff(unsigned int cur_time) {
|
||||
double exp_backoff;
|
||||
struct tm *newtime;
|
||||
time_t aclock;
|
||||
char buf[256];
|
||||
|
||||
time( &aclock ); /* Get time in seconds */
|
||||
|
||||
|
@ -258,17 +272,18 @@ void PERS_FILE_XFER::retry_and_backoff(unsigned int cur_time) {
|
|||
next_request_time = cur_time+(int)max(PERS_RETRY_DELAY_MIN,min(PERS_RETRY_DELAY_MAX,exp_backoff));
|
||||
}
|
||||
if (log_flags.file_xfer_debug) {
|
||||
printf(
|
||||
sprintf(buf,
|
||||
"exponential back off is %d, current_time is %s\n",
|
||||
(int) exp_backoff,asctime(newtime)
|
||||
);
|
||||
show_message(fip->project, buf, MSG_INFO);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse XML information about a single persistent file transfer
|
||||
//
|
||||
int PERS_FILE_XFER::parse(FILE* fin) {
|
||||
char buf[256];
|
||||
char buf[256], buf2[256];
|
||||
|
||||
while (fgets(buf, 256, fin)) {
|
||||
if (match_tag(buf, "</persistent_file_xfer>")) return 0;
|
||||
|
@ -276,7 +291,10 @@ int PERS_FILE_XFER::parse(FILE* fin) {
|
|||
else if (parse_int(buf, "<first_request_time>", first_request_time)) continue;
|
||||
else if (parse_int(buf, "<next_request_time>", next_request_time)) continue;
|
||||
else if (parse_double(buf, "<time_so_far>", time_so_far)) continue;
|
||||
else fprintf(stderr, "PERS_FILE_XFER::parse(): unrecognized: %s\n", buf);
|
||||
else {
|
||||
sprintf(buf2, "PERS_FILE_XFER::parse(): unrecognized: %s\n", buf);
|
||||
show_message(fip->project, buf2, MSG_ERROR);
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
@ -339,6 +357,6 @@ int PERS_FILE_XFER_SET::remove(PERS_FILE_XFER* pfx) {
|
|||
}
|
||||
iter++;
|
||||
}
|
||||
fprintf(stderr, "PERS_FILE_XFER_SET::remove(): not found\n");
|
||||
show_message(pfx->fip->project, "PERS_FILE_XFER_SET::remove(): not found\n", MSG_ERROR);
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -61,7 +61,7 @@ public:
|
|||
int init(FILE_INFO*, bool is_file_upload);
|
||||
bool poll(unsigned int now);
|
||||
void handle_xfer_failure(unsigned int cur_time);
|
||||
void retry_and_backoff(unsigned int cur_time);
|
||||
void retry_or_backoff(unsigned int cur_time);
|
||||
int write(FILE* fout);
|
||||
int parse(FILE* fin);
|
||||
bool start_xfer();
|
||||
|
|
|
@ -148,10 +148,8 @@ int SCHEDULER_OP::set_min_rpc_time(PROJECT* p) {
|
|||
|
||||
// Back off on the scheduler and output an error msg if needed
|
||||
//
|
||||
void SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) {
|
||||
if (log_flags.sched_op_debug) {
|
||||
printf(error_msg);
|
||||
}
|
||||
void SCHEDULER_OP::backoff(PROJECT* p, char *error_msg ) {
|
||||
show_message(p, error_msg, MSG_ERROR);
|
||||
|
||||
if (p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) {
|
||||
p->master_url_fetch_pending = true;
|
||||
|
|
Loading…
Reference in New Issue