- client: fix bugs in runtime estimation of jobs that
    have run before but are not currently running.
    Old:
    - We maintain the most recent fraction_done in the state file.
        But for apps that checkpoint seldom or never,
        this is not the relevant value,
        and fraction_done may go down when the app runs again.
    - fraction_done_elapsed_time is not initialized,
        and can have garbage values for jobs that haven't run yet.
    New:
    - Record, in the state file, the values of
        fraction_done and fraction_done_elapsed_time
        at the most recent checkpoint.
        When the client starts up, use these values.


svn path=/trunk/boinc/; revision=23455
This commit is contained in:
David Anderson 2011-04-26 17:02:09 +00:00
parent c4e8e186c1
commit fa459d780c
4 changed files with 70 additions and 21 deletions

View File

@ -2524,7 +2524,28 @@ Rom 25 Apr 2011
screensaver_win.cpp
David 25 Apr 2011
- validator: fix bug when check_pair() returns retry=true.
- validator: fix bug when check_pair() returns retry=true,
reported by Travis Desell.
sched/
validator.cpp
David 25 Apr 2011
- client: fix bugs in runtime estimation of jobs that
have run before but are not currently running.
Old:
- We maintain the most recent fraction_done in state file.
But for apps that checkpoint seldom or never,
this is not the relevant value,
and frac done may go down when the app runs.
- fraction_done_elapsed_time is not initialized,
and can have garbage values for jobs that haven't run yet.
New:
- Record, in the state file, the values of
fraction_done and fraction_done_elapsed_time
at the most recent checkpoint.
When the client starts up, use these values.
client/
app.cpp,h
app_control.cpp

View File

@ -84,21 +84,26 @@ ACTIVE_TASK::ACTIVE_TASK() {
wup = NULL;
app_version = NULL;
pid = 0;
slot = 0;
_task_state = PROCESS_UNINITIALIZED;
slot = 0;
checkpoint_cpu_time = 0;
checkpoint_elapsed_time = 0;
checkpoint_fraction_done = 0;
checkpoint_fraction_done_elapsed_time = 0;
current_cpu_time = 0;
once_ran_edf = false;
fraction_done = 0;
fraction_done_elapsed_time = 0;
scheduler_state = CPU_SCHED_UNINITIALIZED;
signal = 0;
run_interval_start_wall_time = gstate.now;
checkpoint_wall_time = 0;
elapsed_time = 0;
strcpy(slot_dir, "");
graphics_mode_acked = MODE_UNSUPPORTED;
graphics_mode_ack_timeout = 0;
fraction_done = 0;
run_interval_start_wall_time = gstate.now;
checkpoint_cpu_time = 0;
checkpoint_wall_time = 0;
current_cpu_time = 0;
once_ran_edf = false;
elapsed_time = 0;
checkpoint_elapsed_time = 0;
have_trickle_down = false;
send_upload_file_status = false;
too_large = false;
@ -497,7 +502,8 @@ int ACTIVE_TASK::write(MIOFILE& fout) {
" <slot>%d</slot>\n"
" <checkpoint_cpu_time>%f</checkpoint_cpu_time>\n"
" <checkpoint_elapsed_time>%f</checkpoint_elapsed_time>\n"
" <fraction_done>%f</fraction_done>\n"
" <checkpoint_fraction_done>%f</checkpoint_fraction_done>\n"
" <checkpoint_fraction_done_elapsed_time>%f</checkpoint_fraction_done_elapsed_time>\n"
" <current_cpu_time>%f</current_cpu_time>\n"
" <once_ran_edf>%d</once_ran_edf>\n"
" <swap_size>%f</swap_size>\n"
@ -511,7 +517,8 @@ int ACTIVE_TASK::write(MIOFILE& fout) {
slot,
checkpoint_cpu_time,
checkpoint_elapsed_time,
fraction_done,
checkpoint_fraction_done,
checkpoint_fraction_done_elapsed_time,
current_cpu_time,
once_ran_edf?1:0,
procinfo.swap_size,
@ -652,6 +659,18 @@ int ACTIVE_TASK::parse(MIOFILE& fin) {
elapsed_time = checkpoint_cpu_time;
checkpoint_elapsed_time = elapsed_time;
}
// for 6.12.25-26 transition;
// old clients write fraction_done to state file;
// new clients don't
if (fraction_done) {
checkpoint_fraction_done = fraction_done;
checkpoint_fraction_done_elapsed_time = checkpoint_elapsed_time;
fraction_done_elapsed_time = checkpoint_elapsed_time;
} else {
fraction_done = checkpoint_fraction_done;
fraction_done_elapsed_time = checkpoint_fraction_done_elapsed_time;
}
return 0;
}
else if (parse_str(buf, "<result_name>", result_name, sizeof(result_name))) continue;
@ -659,9 +678,12 @@ int ACTIVE_TASK::parse(MIOFILE& fin) {
else if (parse_int(buf, "<slot>", slot)) continue;
else if (parse_int(buf, "<active_task_state>", dummy)) continue;
else if (parse_double(buf, "<checkpoint_cpu_time>", checkpoint_cpu_time)) continue;
else if (parse_double(buf, "<checkpoint_elapsed_time>", checkpoint_elapsed_time)) continue;
else if (parse_double(buf, "<checkpoint_fraction_done>", checkpoint_fraction_done)) continue;
else if (parse_double(buf, "<checkpoint_fraction_done_elapsed_time>", checkpoint_fraction_done_elapsed_time)) continue;
else if (parse_bool(buf, "once_ran_edf", once_ran_edf)) continue;
else if (parse_double(buf, "<fraction_done>", fraction_done)) continue;
else if (parse_double(buf, "<checkpoint_elapsed_time>", checkpoint_elapsed_time)) continue;
// deprecated - for backwards compat
else if (parse_int(buf, "<app_version_num>", n)) continue;
else if (parse_double(buf, "<swap_size>", procinfo.swap_size)) continue;
else if (parse_double(buf, "<working_set_size>", procinfo.working_set_size)) continue;

View File

@ -81,18 +81,22 @@ struct ACTIVE_TASK {
// in episodes before the current one)
double checkpoint_elapsed_time;
// elapsed time at last checkpoint
double fraction_done;
// App's estimate of how much of the work unit is done.
// Passed from the application via an API call;
// will be zero if the app doesn't use this call
double fraction_done_elapsed_time;
// elapsed time when fraction done was last reported
double checkpoint_fraction_done;
// fraction done at last checkpoint
double checkpoint_fraction_done_elapsed_time;
// fraction done elapsed time at last checkpoint
double current_cpu_time;
// most recent CPU time reported by app
bool once_ran_edf;
// END OF ITEMS SAVED IN STATE FILE
double fraction_done;
// App's estimate of how much of the work unit is done.
// Passed from the application via an API call;
// will be zero if the app doesn't use this call
double fraction_done_elapsed_time;
// elapsed time when fraction done was last reported
int scheduler_state;
int next_scheduler_state; // temp
int signal;
@ -102,8 +106,6 @@ struct ACTIVE_TASK {
// wall time at the last checkpoint
double elapsed_time;
// current total elapsed (running) time
int current_disk_usage(double&);
// disk used by output files and temp files of this task
char slot_dir[256];
// directory where process runs (relative)
char slot_path[512];
@ -181,6 +183,8 @@ struct ACTIVE_TASK {
int init(RESULT*);
void cleanup_task();
int current_disk_usage(double&);
// disk used by output files and temp files of this task
void get_free_slot(RESULT*);
int start(bool first_time); // start a process
int request_exit();

View File

@ -1164,6 +1164,8 @@ void ACTIVE_TASK_SET::get_msgs() {
atp->checkpoint_wall_time = gstate.now;
atp->premature_exit_count = 0;
atp->checkpoint_elapsed_time = atp->elapsed_time;
atp->checkpoint_fraction_done = atp->fraction_done;
atp->checkpoint_fraction_done_elapsed_time = atp->fraction_done_elapsed_time;
if (log_flags.checkpoint_debug) {
msg_printf(atp->wup->project, MSG_INFO,
"[checkpoint] result %s checkpointed",