diff --git a/checkin_notes b/checkin_notes index f0975653e2..c8dee91fc8 100755 --- a/checkin_notes +++ b/checkin_notes @@ -6276,3 +6276,17 @@ Matt 17 June 2006 html/inc forum.inc email.inc + +David 18 June 2006 + - Don't parse ACTIVE_TASK::scheduler_state from the state file. + This isn't carried across runs of BOINC. + - Remove ACTIVE_TASK_SET::restart_tasks() and CLIENT_STATE::restart_tasks(): + just let the CPU scheduler take care of starting tasks. + - If file upload fails because file is missing, don't retry, + and show appropriate error message + + client/ + app.C,h + app_start.C + client_state.C,h + cs_apps.C diff --git a/client/app.C b/client/app.C index f0e50a2c8e..4abac67350 100644 --- a/client/app.C +++ b/client/app.C @@ -177,10 +177,9 @@ int ACTIVE_TASK::init(RESULT* rp) { max_disk_usage = rp->wup->rsc_disk_bound; max_mem_usage = rp->wup->rsc_memory_bound; non_cpu_intensive = rp->project->non_cpu_intensive; - strcpy(process_control_queue.name, rp->name); strcpy(graphics_request_queue.name, rp->name); - + get_slot_dir(slot, slot_dir); return 0; } @@ -385,7 +384,6 @@ int ACTIVE_TASK::parse(MIOFILE& fin) { strcpy(result_name, ""); strcpy(project_master_url, ""); - scheduler_state = CPU_SCHED_SCHEDULED; while (fin.fgets(buf, 256)) { if (match_tag(buf, "")) { @@ -452,7 +450,6 @@ int ACTIVE_TASK::parse(MIOFILE& fin) { else if (parse_str(buf, "", project_master_url, sizeof(project_master_url))) continue; else if (parse_int(buf, "", app_version_num)) continue; else if (parse_int(buf, "", slot)) continue; - else if (parse_int(buf, "", scheduler_state)) continue; else if (parse_double(buf, "", checkpoint_cpu_time)) continue; else if (parse_double(buf, "", fraction_done)) continue; else if (parse_double(buf, "", current_cpu_time)) continue; @@ -627,4 +624,12 @@ void ACTIVE_TASK_SET::upload_notify_app(FILE_INFO* fip) { } } +void ACTIVE_TASK_SET::init() { + for (unsigned int i=0; iinit(atp->result); + atp->scheduler_state = CPU_SCHED_PREEMPTED; + } +} + const char *BOINC_RCSID_778b61195e = "$Id$"; diff --git a/client/app.h b/client/app.h index bfaaa3bdae..e4fecd530a 100644 --- a/client/app.h +++ b/client/app.h @@ -216,11 +216,11 @@ public: int remove(ACTIVE_TASK*); ACTIVE_TASK* lookup_pid(int); ACTIVE_TASK* lookup_result(RESULT*); + void init(); bool poll(); void suspend_all(bool leave_apps_in_memory=true); void unsuspend_all(); bool is_task_executing(); - int restart_tasks(int max_tasks); void request_tasks_exit(PROJECT* p=0); int wait_for_exit(double, PROJECT* p=0); int exit_tasks(PROJECT* p=0); diff --git a/client/app_start.C b/client/app_start.C index ff873769a1..61d94c1ec2 100644 --- a/client/app_start.C +++ b/client/app_start.C @@ -672,77 +672,6 @@ int ACTIVE_TASK::resume_or_start() { return 0; } -// Restart active tasks without wiping and reinitializing slot directories -// Called at init, with max_tasks = ncpus -// -int ACTIVE_TASK_SET::restart_tasks(int max_tasks) { - vector::iterator iter; - ACTIVE_TASK* atp; - RESULT* result; - int retval, num_tasks_started; - - SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK); - - num_tasks_started = 0; - iter = active_tasks.begin(); - while (iter != active_tasks.end()) { - atp = *iter; - result = atp->result; - atp->init(atp->result); - get_slot_dir(atp->slot, atp->slot_dir); - if (!gstate.input_files_available(result)) { - msg_printf(atp->wup->project, MSG_ERROR, - "Can't restart %s: missing files", atp->result->name - ); - gstate.report_result_error( - *(atp->result), - "One or more missing files" - ); - iter = active_tasks.erase(iter); - delete atp; - continue; - } - - if (atp->scheduler_state != CPU_SCHED_SCHEDULED - || num_tasks_started >= max_tasks - ) { - msg_printf(atp->wup->project, MSG_INFO, - "Deferring task %s", - atp->result->name - ); - - atp->scheduler_state = CPU_SCHED_PREEMPTED; - iter++; - continue; - } - - msg_printf(atp->wup->project, MSG_INFO, - "Resuming task %s using %s version %d", - atp->result->name, - atp->app_version->app->name, - atp->app_version->version_num - ); - retval = atp->start(false); - - if (retval) { - msg_printf(atp->wup->project, MSG_ERROR, - "Restart of task %s failed: %s\n", - atp->result->name, boincerror(retval) - ); - gstate.report_result_error( - *(atp->result), - "Couldn't restart app: %d", retval - ); - iter = active_tasks.erase(iter); - delete atp; - } else { - ++num_tasks_started; - iter++; - } - } - return 0; -} - #if (defined(__APPLE__) && defined(__i386__)) union headeru { diff --git a/client/client_state.C b/client/client_state.C index ba80e2cfa7..0d3be417be 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -260,8 +260,7 @@ int CLIENT_STATE::init() { // set period start time and reschedule // - must_schedule_cpus = true; - must_enforce_cpu_schedule = true; + request_schedule_cpus("Startup"); cpu_sched_last_time = now; cpu_sched_last_check = now; @@ -271,6 +270,7 @@ int CLIENT_STATE::init() { retval = make_project_dirs(); if (retval) return retval; + active_tasks.init(); active_tasks.report_overdue(); active_tasks.handle_upload_files(); @@ -410,22 +410,6 @@ bool CLIENT_STATE::poll_slow_events() { check_suspend_activities(suspend_reason); -#ifdef NEW_CPU_SCHED - cpu_scheduler.make_schedule(); -#else - // Restart tasks on startup. - // Do this here (rather than CLIENT_STATE::init()) - // so that if we do benchmark on startup, - // we don't immediately suspend apps - // (this fixes a CPDN problem where quitting the app - // right after start kills it) - // - if (!suspend_reason && !tasks_restarted) { - restart_tasks(); - tasks_restarted = true; - } -#endif - // suspend or resume activities (but only if already did startup) // if (tasks_restarted) { @@ -507,6 +491,7 @@ bool CLIENT_STATE::poll_slow_events() { if (!tasks_suspended) { POLL_ACTION(possibly_schedule_cpus, possibly_schedule_cpus ); POLL_ACTION(enforce_schedule , enforce_schedule ); + tasks_restarted = true; } #endif if (!network_suspended) { diff --git a/client/client_state.h b/client/client_state.h index a9309d74ec..29602271cd 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -314,7 +314,6 @@ public: // - an app fails to start (CS::schedule_cpus()) // - any project op is done via RPC (suspend/resume) // - any result op is done via RPC (suspend/resume) - int restart_tasks(); int quit_activities(); void set_ncpus(); double estimate_cpu_time(WORKUNIT&); diff --git a/client/cpu_sched.C b/client/cpu_sched.C index 0c0aeeb449..f20a37606a 100644 --- a/client/cpu_sched.C +++ b/client/cpu_sched.C @@ -360,33 +360,6 @@ bool CLIENT_STATE::possibly_schedule_cpus() { cpu_sched_last_check = now; } -#if 0 // THE FOLLOWING SHOULD NOT BE NECESSARY - - // if the count of running tasks is not either ncpus - // or the count of runnable results a re-schedule is mandatory. - // - if (!must_schedule_cpus) { - int count_running_tasks = 0; - for (i=0; iresult) continue; - if (CPU_SCHED_SCHEDULED != active_tasks.active_tasks[i]->scheduler_state) continue; - if (active_tasks.active_tasks[i]->result->project->non_cpu_intensive) continue; - count_running_tasks++; - } - if (count_running_tasks != ncpus){ - int count_runnable_results = 0; - for (i=0; irunnable()) continue; - if (results[i]->project->non_cpu_intensive) continue; - count_runnable_results++; - } - if (count_running_tasks != count_runnable_results) { - must_schedule_cpus = true; - } - } - } -#endif - if (!must_schedule_cpus) return false; must_schedule_cpus = false; schedule_cpus(); diff --git a/client/cs_apps.C b/client/cs_apps.C index 8f97cd1ac7..8f6303ab36 100644 --- a/client/cs_apps.C +++ b/client/cs_apps.C @@ -234,9 +234,8 @@ int CLIENT_STATE::schedule_result(RESULT* rp) { ACTIVE_TASK *atp = lookup_active_task_by_result(rp); if (!atp) { atp = new ACTIVE_TASK; - atp->init(rp); atp->slot = active_tasks.get_free_slot(); - get_slot_dir(atp->slot, atp->slot_dir); + atp->init(rp); active_tasks.active_tasks.push_back(atp); } atp->next_scheduler_state = CPU_SCHED_SCHEDULED; @@ -295,14 +294,6 @@ double CLIENT_STATE::nearly_runnable_resource_share() { return x; } - -// This is called when the client is initialized. -// Try to restart any tasks that were running when we last shut down. -// -int CLIENT_STATE::restart_tasks() { - return active_tasks.restart_tasks(ncpus); -} - void CLIENT_STATE::set_ncpus() { if (ncpus == 0) ncpus = host_info.p_ncpus; diff --git a/client/pers_file_xfer.C b/client/pers_file_xfer.C index 49a97d6528..65ce7eb3de 100644 --- a/client/pers_file_xfer.C +++ b/client/pers_file_xfer.C @@ -227,7 +227,15 @@ bool PERS_FILE_XFER::poll() { is_upload?"upload":"download", fip->name ); } - check_giveup("server rejected file"); + try_next_url("server rejected file"); + } else if (fxp->file_xfer_retval == ERR_NOT_FOUND) { + if (log_flags.file_xfer) { + msg_printf( + fip->project, MSG_INFO, "Permanently failed %s of %s", + is_upload?"upload":"download", fip->name + ); + } + try_next_url("File not found on client"); } else { if (log_flags.file_xfer) { msg_printf( @@ -239,7 +247,7 @@ bool PERS_FILE_XFER::poll() { handle_xfer_failure(); } - // fxp could have already been freed and zeroed by check_giveup + // fxp could have already been freed and zeroed by try_next_url // so check before trying to remove // if (fxp) { @@ -252,6 +260,23 @@ bool PERS_FILE_XFER::poll() { return false; } +void PERS_FILE_XFER::xfer_failed(const char* why) { + gstate.file_xfers->remove(fxp); + delete fxp; + fxp = NULL; + if (is_upload) { + fip->status = ERR_GIVEUP_UPLOAD; + } else { + fip->status = ERR_GIVEUP_DOWNLOAD; + } + pers_xfer_done = true; + msg_printf( + fip->project, MSG_ERROR, "Giving up on %s of %s: %s", + is_upload?"upload":"download", fip->name, why + ); + fip->error_msg = why; +} + // A file transfer (to a particular server) // has had a failure // TODO ?? transient ? permanent? terminology?? @@ -266,22 +291,9 @@ bool PERS_FILE_XFER::poll() { // If there are more URLs to try, the file_xfer is restarted with these new // urls until a good transfer is made or it completely gives up. // -void PERS_FILE_XFER::check_giveup(const char* why) { +void PERS_FILE_XFER::try_next_url(const char* why) { if (fip->get_next_url(fip->upload_when_present) == NULL) { - gstate.file_xfers->remove(fxp); - delete fxp; - fxp = NULL; - if (is_upload) { - fip->status = ERR_GIVEUP_UPLOAD; - } else { - fip->status = ERR_GIVEUP_DOWNLOAD; - } - pers_xfer_done = true; - msg_printf( - fip->project, MSG_ERROR, "Giving up on %s of %s: %s", - is_upload?"upload":"download", fip->name, why - ); - fip->error_msg = why; + xfer_failed(why); fip->delete_file(); } else { if (is_upload) { @@ -315,7 +327,7 @@ void PERS_FILE_XFER::handle_xfer_failure() { retry_or_backoff(); return; } else { - check_giveup("file was not found on server"); + try_next_url("file was not found on server"); return; } } @@ -323,7 +335,7 @@ void PERS_FILE_XFER::handle_xfer_failure() { // See if it's time to give up on the persistent file xfer // if ((gstate.now - first_request_time) > gstate.file_xfer_giveup_period) { - check_giveup("too much elapsed time"); + try_next_url("too much elapsed time"); } else { retry_or_backoff(); } diff --git a/client/pers_file_xfer.h b/client/pers_file_xfer.h index 468996bc04..e07af31caf 100644 --- a/client/pers_file_xfer.h +++ b/client/pers_file_xfer.h @@ -111,7 +111,8 @@ public: bool poll(); void handle_xfer_failure(); void retry_or_backoff(); - void check_giveup(const char*); + void xfer_failed(const char*); + void try_next_url(const char*); void abort(); int write(MIOFILE& fout); int parse(MIOFILE& fin);