From fa459d780c51e13e8caf5adadfabfc3120b9defa Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 26 Apr 2011 17:02:09 +0000 Subject: [PATCH] - client: fix bugs in runtime estimation of jobs that have run before but are not currently running. Old: - We maintain the most recent fraction_done in state file. But for apps that checkpoint seldom or never, this is not the relevant value, and frac done may go down when the app runs. - fraction_done_elapsed_time is not initialized, and can have garbage values for jobs that haven't run yet. New: - Record, in the state file, the values of fraction_done and fraction_done_elapsed_time at the most recent checkpoint. When the client starts up, use these values. svn path=/trunk/boinc/; revision=23455 --- checkin_notes | 23 ++++++++++++++++++++- client/app.cpp | 46 +++++++++++++++++++++++++++++++----------- client/app.h | 20 ++++++++++-------- client/app_control.cpp | 2 ++ 4 files changed, 70 insertions(+), 21 deletions(-) diff --git a/checkin_notes b/checkin_notes index 2424c1fc87..708a3c638f 100644 --- a/checkin_notes +++ b/checkin_notes @@ -2524,7 +2524,28 @@ Rom 25 Apr 2011 screensaver_win.cpp David 25 Apr 2011 - - validator: fix bug when check_pair() returns retry=true. + - validator: fix bug when check_pair() returns retry=true, + reported by Travis Desell. sched/ validator.cpp + +David 25 Apr 2011 + - client: fix bugs in runtime estimation of jobs that + have run before but are not currently running. + Old: + - We maintain the most recent fraction_done in state file. + But for apps that checkpoint seldom or never, + this is not the relevant value, + and frac done may go down when the app runs. + - fraction_done_elapsed_time is not initialized, + and can have garbage values for jobs that haven't run yet. + New: + - Record, in the state file, the values of + fraction_done and fraction_done_elapsed_time + at the most recent checkpoint. + When the client starts up, use these values. + + client/ + app.cpp,h + app_control.cpp diff --git a/client/app.cpp b/client/app.cpp index 5649acf2c2..16eee9720f 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -84,21 +84,26 @@ ACTIVE_TASK::ACTIVE_TASK() { wup = NULL; app_version = NULL; pid = 0; - slot = 0; + _task_state = PROCESS_UNINITIALIZED; + slot = 0; + checkpoint_cpu_time = 0; + checkpoint_elapsed_time = 0; + checkpoint_fraction_done = 0; + checkpoint_fraction_done_elapsed_time = 0; + current_cpu_time = 0; + once_ran_edf = false; + + fraction_done = 0; + fraction_done_elapsed_time = 0; scheduler_state = CPU_SCHED_UNINITIALIZED; signal = 0; + run_interval_start_wall_time = gstate.now; + checkpoint_wall_time = 0; + elapsed_time = 0; strcpy(slot_dir, ""); graphics_mode_acked = MODE_UNSUPPORTED; graphics_mode_ack_timeout = 0; - fraction_done = 0; - run_interval_start_wall_time = gstate.now; - checkpoint_cpu_time = 0; - checkpoint_wall_time = 0; - current_cpu_time = 0; - once_ran_edf = false; - elapsed_time = 0; - checkpoint_elapsed_time = 0; have_trickle_down = false; send_upload_file_status = false; too_large = false; @@ -497,7 +502,8 @@ int ACTIVE_TASK::write(MIOFILE& fout) { " %d\n" " %f\n" " %f\n" - " %f\n" + " %f\n" + " %f\n" " %f\n" " %d\n" " %f\n" @@ -511,7 +517,8 @@ int ACTIVE_TASK::write(MIOFILE& fout) { slot, checkpoint_cpu_time, checkpoint_elapsed_time, - fraction_done, + checkpoint_fraction_done, + checkpoint_fraction_done_elapsed_time, current_cpu_time, once_ran_edf?1:0, procinfo.swap_size, @@ -652,6 +659,18 @@ int ACTIVE_TASK::parse(MIOFILE& fin) { elapsed_time = checkpoint_cpu_time; checkpoint_elapsed_time = elapsed_time; } + + // for 6.12.25-26 transition; + // old clients write fraction_done to state file; + // new clients don't + if (fraction_done) { + checkpoint_fraction_done = fraction_done; + checkpoint_fraction_done_elapsed_time = checkpoint_elapsed_time; + fraction_done_elapsed_time = checkpoint_elapsed_time; + } else { + fraction_done = checkpoint_fraction_done; + fraction_done_elapsed_time = checkpoint_fraction_done_elapsed_time; + } return 0; } else if (parse_str(buf, "", result_name, sizeof(result_name))) continue; @@ -659,9 +678,12 @@ int ACTIVE_TASK::parse(MIOFILE& fin) { else if (parse_int(buf, "", slot)) continue; else if (parse_int(buf, "", dummy)) continue; else if (parse_double(buf, "", checkpoint_cpu_time)) continue; + else if (parse_double(buf, "", checkpoint_elapsed_time)) continue; + else if (parse_double(buf, "", checkpoint_fraction_done)) continue; + else if (parse_double(buf, "", checkpoint_fraction_done_elapsed_time)) continue; else if (parse_bool(buf, "once_ran_edf", once_ran_edf)) continue; else if (parse_double(buf, "", fraction_done)) continue; - else if (parse_double(buf, "", checkpoint_elapsed_time)) continue; + // deprecated - for backwards compat else if (parse_int(buf, "", n)) continue; else if (parse_double(buf, "", procinfo.swap_size)) continue; else if (parse_double(buf, "", procinfo.working_set_size)) continue; diff --git a/client/app.h b/client/app.h index 8cc7a45beb..07c8b687d8 100644 --- a/client/app.h +++ b/client/app.h @@ -81,18 +81,22 @@ struct ACTIVE_TASK { // in episodes before the current one) double checkpoint_elapsed_time; // elapsed time at last checkpoint - double fraction_done; - // App's estimate of how much of the work unit is done. - // Passed from the application via an API call; - // will be zero if the app doesn't use this call - double fraction_done_elapsed_time; - // elapsed time when fraction done was last reported + double checkpoint_fraction_done; + // fraction done at last checkpoint + double checkpoint_fraction_done_elapsed_time; + // fraction done elapsed time at last checkpoint double current_cpu_time; // most recent CPU time reported by app bool once_ran_edf; // END OF ITEMS SAVED IN STATE FILE + double fraction_done; + // App's estimate of how much of the work unit is done. + // Passed from the application via an API call; + // will be zero if the app doesn't use this call + double fraction_done_elapsed_time; + // elapsed time when fraction done was last reported int scheduler_state; int next_scheduler_state; // temp int signal; @@ -102,8 +106,6 @@ struct ACTIVE_TASK { // wall time at the last checkpoint double elapsed_time; // current total elapsed (running) time - int current_disk_usage(double&); - // disk used by output files and temp files of this task char slot_dir[256]; // directory where process runs (relative) char slot_path[512]; @@ -181,6 +183,8 @@ struct ACTIVE_TASK { int init(RESULT*); void cleanup_task(); + int current_disk_usage(double&); + // disk used by output files and temp files of this task void get_free_slot(RESULT*); int start(bool first_time); // start a process int request_exit(); diff --git a/client/app_control.cpp b/client/app_control.cpp index 0a52243c41..b9b5871309 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -1164,6 +1164,8 @@ void ACTIVE_TASK_SET::get_msgs() { atp->checkpoint_wall_time = gstate.now; atp->premature_exit_count = 0; atp->checkpoint_elapsed_time = atp->elapsed_time; + atp->checkpoint_fraction_done = atp->fraction_done; + atp->checkpoint_fraction_done_elapsed_time = atp->fraction_done_elapsed_time; if (log_flags.checkpoint_debug) { msg_printf(atp->wup->project, MSG_INFO, "[checkpoint] result %s checkpointed",