mirror of https://github.com/BOINC/boinc.git
- client: fix bugs in runtime estimation of jobs that
have run before but are not currently running.
Old:
- We maintain the most recent fraction_done in the state file. But for apps that checkpoint seldom or never, this is not the relevant value, and the fraction done may go down when the app runs.
- fraction_done_elapsed_time is not initialized, and can have garbage values for jobs that haven't run yet.
New:
- Record, in the state file, the values of fraction_done and fraction_done_elapsed_time at the most recent checkpoint. When the client starts up, use these values.
svn path=/trunk/boinc/; revision=23455
This commit is contained in:
parent
c4e8e186c1
commit
fa459d780c
|
@ -2524,7 +2524,28 @@ Rom 25 Apr 2011
|
|||
screensaver_win.cpp
|
||||
|
||||
David 25 Apr 2011
|
||||
- validator: fix bug when check_pair() returns retry=true.
|
||||
- validator: fix bug when check_pair() returns retry=true,
|
||||
reported by Travis Desell.
|
||||
|
||||
sched/
|
||||
validator.cpp
|
||||
|
||||
David 25 Apr 2011
|
||||
- client: fix bugs in runtime estimation of jobs that
|
||||
have run before but are not currently running.
|
||||
Old:
|
||||
- We maintain the most recent fraction_done in state file.
|
||||
But for apps that checkpoint seldom or never,
|
||||
this is not the relevant value,
|
||||
and frac done may go down when the app runs.
|
||||
- fraction_done_elapsed_time is not initialized,
|
||||
and can have garbage values for jobs that haven't run yet.
|
||||
New:
|
||||
- Record, in the state file, the values of
|
||||
fraction_done and fraction_done_elapsed_time
|
||||
at the most recent checkpoint.
|
||||
When the client starts up, use these values.
|
||||
|
||||
client/
|
||||
app.cpp,h
|
||||
app_control.cpp
|
||||
|
|
|
@ -84,21 +84,26 @@ ACTIVE_TASK::ACTIVE_TASK() {
|
|||
wup = NULL;
|
||||
app_version = NULL;
|
||||
pid = 0;
|
||||
slot = 0;
|
||||
|
||||
_task_state = PROCESS_UNINITIALIZED;
|
||||
slot = 0;
|
||||
checkpoint_cpu_time = 0;
|
||||
checkpoint_elapsed_time = 0;
|
||||
checkpoint_fraction_done = 0;
|
||||
checkpoint_fraction_done_elapsed_time = 0;
|
||||
current_cpu_time = 0;
|
||||
once_ran_edf = false;
|
||||
|
||||
fraction_done = 0;
|
||||
fraction_done_elapsed_time = 0;
|
||||
scheduler_state = CPU_SCHED_UNINITIALIZED;
|
||||
signal = 0;
|
||||
run_interval_start_wall_time = gstate.now;
|
||||
checkpoint_wall_time = 0;
|
||||
elapsed_time = 0;
|
||||
strcpy(slot_dir, "");
|
||||
graphics_mode_acked = MODE_UNSUPPORTED;
|
||||
graphics_mode_ack_timeout = 0;
|
||||
fraction_done = 0;
|
||||
run_interval_start_wall_time = gstate.now;
|
||||
checkpoint_cpu_time = 0;
|
||||
checkpoint_wall_time = 0;
|
||||
current_cpu_time = 0;
|
||||
once_ran_edf = false;
|
||||
elapsed_time = 0;
|
||||
checkpoint_elapsed_time = 0;
|
||||
have_trickle_down = false;
|
||||
send_upload_file_status = false;
|
||||
too_large = false;
|
||||
|
@ -497,7 +502,8 @@ int ACTIVE_TASK::write(MIOFILE& fout) {
|
|||
" <slot>%d</slot>\n"
|
||||
" <checkpoint_cpu_time>%f</checkpoint_cpu_time>\n"
|
||||
" <checkpoint_elapsed_time>%f</checkpoint_elapsed_time>\n"
|
||||
" <fraction_done>%f</fraction_done>\n"
|
||||
" <checkpoint_fraction_done>%f</checkpoint_fraction_done>\n"
|
||||
" <checkpoint_fraction_done_elapsed_time>%f</checkpoint_fraction_done_elapsed_time>\n"
|
||||
" <current_cpu_time>%f</current_cpu_time>\n"
|
||||
" <once_ran_edf>%d</once_ran_edf>\n"
|
||||
" <swap_size>%f</swap_size>\n"
|
||||
|
@ -511,7 +517,8 @@ int ACTIVE_TASK::write(MIOFILE& fout) {
|
|||
slot,
|
||||
checkpoint_cpu_time,
|
||||
checkpoint_elapsed_time,
|
||||
fraction_done,
|
||||
checkpoint_fraction_done,
|
||||
checkpoint_fraction_done_elapsed_time,
|
||||
current_cpu_time,
|
||||
once_ran_edf?1:0,
|
||||
procinfo.swap_size,
|
||||
|
@ -652,6 +659,18 @@ int ACTIVE_TASK::parse(MIOFILE& fin) {
|
|||
elapsed_time = checkpoint_cpu_time;
|
||||
checkpoint_elapsed_time = elapsed_time;
|
||||
}
|
||||
|
||||
// for 6.12.25-26 transition;
|
||||
// old clients write fraction_done to state file;
|
||||
// new clients don't
|
||||
if (fraction_done) {
|
||||
checkpoint_fraction_done = fraction_done;
|
||||
checkpoint_fraction_done_elapsed_time = checkpoint_elapsed_time;
|
||||
fraction_done_elapsed_time = checkpoint_elapsed_time;
|
||||
} else {
|
||||
fraction_done = checkpoint_fraction_done;
|
||||
fraction_done_elapsed_time = checkpoint_fraction_done_elapsed_time;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
else if (parse_str(buf, "<result_name>", result_name, sizeof(result_name))) continue;
|
||||
|
@ -659,9 +678,12 @@ int ACTIVE_TASK::parse(MIOFILE& fin) {
|
|||
else if (parse_int(buf, "<slot>", slot)) continue;
|
||||
else if (parse_int(buf, "<active_task_state>", dummy)) continue;
|
||||
else if (parse_double(buf, "<checkpoint_cpu_time>", checkpoint_cpu_time)) continue;
|
||||
else if (parse_double(buf, "<checkpoint_elapsed_time>", checkpoint_elapsed_time)) continue;
|
||||
else if (parse_double(buf, "<checkpoint_fraction_done>", checkpoint_fraction_done)) continue;
|
||||
else if (parse_double(buf, "<checkpoint_fraction_done_elapsed_time>", checkpoint_fraction_done_elapsed_time)) continue;
|
||||
else if (parse_bool(buf, "once_ran_edf", once_ran_edf)) continue;
|
||||
else if (parse_double(buf, "<fraction_done>", fraction_done)) continue;
|
||||
else if (parse_double(buf, "<checkpoint_elapsed_time>", checkpoint_elapsed_time)) continue;
|
||||
// deprecated - for backwards compat
|
||||
else if (parse_int(buf, "<app_version_num>", n)) continue;
|
||||
else if (parse_double(buf, "<swap_size>", procinfo.swap_size)) continue;
|
||||
else if (parse_double(buf, "<working_set_size>", procinfo.working_set_size)) continue;
|
||||
|
|
20
client/app.h
20
client/app.h
|
@ -81,18 +81,22 @@ struct ACTIVE_TASK {
|
|||
// in episodes before the current one)
|
||||
double checkpoint_elapsed_time;
|
||||
// elapsed time at last checkpoint
|
||||
double fraction_done;
|
||||
// App's estimate of how much of the work unit is done.
|
||||
// Passed from the application via an API call;
|
||||
// will be zero if the app doesn't use this call
|
||||
double fraction_done_elapsed_time;
|
||||
// elapsed time when fraction done was last reported
|
||||
double checkpoint_fraction_done;
|
||||
// fraction done at last checkpoint
|
||||
double checkpoint_fraction_done_elapsed_time;
|
||||
// fraction done elapsed time at last checkpoint
|
||||
double current_cpu_time;
|
||||
// most recent CPU time reported by app
|
||||
bool once_ran_edf;
|
||||
|
||||
// END OF ITEMS SAVED IN STATE FILE
|
||||
|
||||
double fraction_done;
|
||||
// App's estimate of how much of the work unit is done.
|
||||
// Passed from the application via an API call;
|
||||
// will be zero if the app doesn't use this call
|
||||
double fraction_done_elapsed_time;
|
||||
// elapsed time when fraction done was last reported
|
||||
int scheduler_state;
|
||||
int next_scheduler_state; // temp
|
||||
int signal;
|
||||
|
@ -102,8 +106,6 @@ struct ACTIVE_TASK {
|
|||
// wall time at the last checkpoint
|
||||
double elapsed_time;
|
||||
// current total elapsed (running) time
|
||||
int current_disk_usage(double&);
|
||||
// disk used by output files and temp files of this task
|
||||
char slot_dir[256];
|
||||
// directory where process runs (relative)
|
||||
char slot_path[512];
|
||||
|
@ -181,6 +183,8 @@ struct ACTIVE_TASK {
|
|||
int init(RESULT*);
|
||||
void cleanup_task();
|
||||
|
||||
int current_disk_usage(double&);
|
||||
// disk used by output files and temp files of this task
|
||||
void get_free_slot(RESULT*);
|
||||
int start(bool first_time); // start a process
|
||||
int request_exit();
|
||||
|
|
|
@ -1164,6 +1164,8 @@ void ACTIVE_TASK_SET::get_msgs() {
|
|||
atp->checkpoint_wall_time = gstate.now;
|
||||
atp->premature_exit_count = 0;
|
||||
atp->checkpoint_elapsed_time = atp->elapsed_time;
|
||||
atp->checkpoint_fraction_done = atp->fraction_done;
|
||||
atp->checkpoint_fraction_done_elapsed_time = atp->fraction_done_elapsed_time;
|
||||
if (log_flags.checkpoint_debug) {
|
||||
msg_printf(atp->wup->project, MSG_INFO,
|
||||
"[checkpoint] result %s checkpointed",
|
||||
|
|
Loading…
Reference in New Issue