client: fix bug that caused delay in job cleanup

If a job has an output file marked <copy_file> and <optional>,
and the app doesn't create that file,
then the call to boinc_rename() (to move it to the project dir) fails,
and boinc_rename() backs off and retries, which delays job cleanup.

Solution: in boinc_rename(), if the rename fails,
check whether the source file exists; if it doesn't,
return ERR_FILE_MISSING immediately instead of retrying.

Also:
- when writing client messages, use the actual current time
  (dtime()) rather than client_state.now.
- write log messages when output-file renames fail
This commit is contained in:
David Anderson 2015-06-17 14:14:54 -07:00
parent 35d9b15ed8
commit 753ae6e768
4 changed files with 28 additions and 15 deletions

View File

@ -478,21 +478,29 @@ int ACTIVE_TASK::copy_output_files() {
);
sprintf(slotfile, "%s/%s", slot_dir, open_name);
get_pathname(fip, projfile, sizeof(projfile));
#if 1
boinc_rename(slotfile, projfile);
#else
int retval = boinc_rename(slotfile, projfile);
// this isn't a BOINC error.
// it just means the app didn't create an output file
// that it was supposed to.
// the rename fails if the output file isn't there.
//
if (retval) {
msg_printf(wup->project, MSG_INTERNAL_ERROR,
"Can't rename output file %s to %s: %s",
fip->name, projfile, boincerror(retval)
);
if (retval == ERR_FILE_MISSING) {
if (log_flags.slot_debug) {
msg_printf(wup->project, MSG_INFO,
"[slot] output file %s missing, not copying", slotfile
);
}
} else {
msg_printf(wup->project, MSG_INTERNAL_ERROR,
"Can't rename output file %s to %s: %s",
slotfile, projfile, boincerror(retval)
);
}
} else {
if (log_flags.slot_debug) {
msg_printf(wup->project, MSG_INFO,
"[slot] renamed %s to %s", slotfile, projfile
);
}
}
#endif
}
return 0;
}

View File

@ -56,7 +56,8 @@ void show_message(
) {
const char* x;
char message[1024], event_msg[1024], evt_message[2048];
char* time_string = time_to_string(gstate.now);
double t = dtime();
char* time_string = time_to_string(t);
// Cycle the log files if needed
//
@ -84,7 +85,7 @@ void show_message(
default:
strlcpy(event_msg, message, sizeof(event_msg));
}
message_descs.insert(p, priority, (int)gstate.now, event_msg);
message_descs.insert(p, priority, (int)t, event_msg);
// add a notice
//
@ -105,7 +106,7 @@ void show_message(
if (p) {
safe_strcpy(n.project_name, p->get_project_name());
}
n.create_time = n.arrival_time = gstate.now;
n.create_time = n.arrival_time = t;
safe_strcpy(n.category, (priority==MSG_USER_ALERT)?"client":"scheduler");
notices.append(n);
}

View File

@ -273,7 +273,7 @@ int delete_project_owned_file(const char* path, bool retry) {
if (retval && retry) {
if (log_flags.slot_debug) {
msg_printf(0, MSG_INFO,
"delete of %s failed (%d); retrying", path, retval
"[slot] delete of %s failed (%d); retrying", path, retval
);
}
double start = dtime();

View File

@ -628,6 +628,10 @@ int boinc_rename(const char* old, const char* newf) {
retval = boinc_rename_aux(old, newf);
if (retval) {
// if the rename failed, and the file exists,
// retry a few times
//
if (!boinc_file_exists(old)) return ERR_FILE_MISSING;
double start = dtime();
do {
boinc_sleep(drand()*2); // avoid lockstep