// Berkeley Open Infrastructure for Network Computing // http://boinc.berkeley.edu // Copyright (C) 2005 University of California // // This is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; // either version 2.1 of the License, or (at your option) any later version. // // This software is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU Lesser General Public License for more details. // // To view the GNU Lesser General Public License visit // http://www.gnu.org/copyleft/lesser.html // or write to the Free Software Foundation, Inc., // 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // The "policy" part of task execution is here. // The "mechanism" part is in app.C // #include "cpp.h" #ifdef _WIN32 #include "boinc_win.h" #endif #ifndef _WIN32 #include "config.h" #include #include #endif #include "md5_file.h" #include "util.h" #include "error_numbers.h" #include "file_names.h" #include "filesys.h" #include "shmem.h" #include "log_flags.h" #include "client_msgs.h" #include "client_state.h" using std::vector; #define MAX_DEBT (86400) // maximum project debt // Quit running applications, quit benchmarks, // write the client_state.xml file // (in principle we could also terminate net_xfers here, // e.g. flush buffers, but why bother) // int CLIENT_STATE::quit_activities() { int retval; // calculate long-term debts (for state file) // adjust_debts(); retval = active_tasks.exit_tasks(); if (retval) { msg_printf(NULL, MSG_ERROR, "CLIENT_STATE.quit_activities: exit_tasks failed\n"); } retval = write_state_file(); if (retval) { msg_printf(NULL, MSG_ERROR, "CLIENT_STATE.quit_activities: write_state_file failed\n"); } abort_cpu_benchmarks(); return 0; } // Handle a task that has finished. // Mark its output files as present, and delete scratch files. // Don't delete input files because they might be shared with other WUs. // Update state of result record. // int CLIENT_STATE::app_finished(ACTIVE_TASK& at) { RESULT* rp = at.result; FILE_INFO* fip; unsigned int i; char path[256]; int retval; double size; bool had_error = false; // scan the output files, check if missing or too big // Don't bother doing this if result was aborted via GUI if (rp->exit_status != ERR_ABORTED_VIA_GUI) { for (i=0; ioutput_files.size(); i++) { fip = rp->output_files[i].file_info; if (fip->uploaded) continue; get_pathname(fip, path); retval = file_size(path, size); if (retval) { // an output file is unexpectedly absent. // fip->status = retval; had_error = true; } else if (size > fip->max_nbytes) { // Note: this is only checked when the application finishes. // The total disk space is checked while the application is running. // msg_printf( rp->project, MSG_INFO, "Output file %s for result %s exceeds size limit.", fip->name, rp->name ); msg_printf( rp->project, MSG_INFO, "File size: %f bytes. Limit: %f bytes", size, fip->max_nbytes ); fip->delete_file(); fip->status = ERR_FILE_TOO_BIG; had_error = true; } else { if (!fip->upload_when_present && !fip->sticky) { fip->delete_file(); // sets status to NOT_PRESENT } else { retval = md5_file(path, fip->md5_cksum, fip->nbytes); if (retval) { fip->status = retval; had_error = true; } else { fip->status = FILE_PRESENT; } } } } } if (rp->exit_status != 0) { had_error = true; } if (had_error) { rp->state = RESULT_COMPUTE_ERROR; } else { rp->state = RESULT_FILES_UPLOADING; rp->project->update_duration_correction_factor(rp); } double wall_cpu_time = now - cpu_sched_last_time; at.result->project->wall_cpu_time_this_period += wall_cpu_time; total_wall_cpu_time_this_period += wall_cpu_time; total_cpu_time_this_period += at.current_cpu_time - at.cpu_time_at_last_sched; return 0; } // clean up after finished apps // bool CLIENT_STATE::handle_finished_apps() { unsigned int i; ACTIVE_TASK* atp; bool action = false; static double last_time = 0; if (gstate.now - last_time < 1.0) return false; last_time = gstate.now; SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK); for (i=0; itask_state) { case PROCESS_EXITED: case PROCESS_WAS_SIGNALED: case PROCESS_EXIT_UNKNOWN: case PROCESS_COULDNT_START: case PROCESS_ABORTED: msg_printf(atp->wup->project, MSG_INFO, "Computation for result %s finished", atp->result->name ); scope_messages.printf( "CLIENT_STATE::handle_finished_apps(): task finished; pid %d, status %d\n", atp->pid, atp->result->exit_status ); app_finished(*atp); active_tasks.remove(atp); delete atp; set_client_state_dirty("handle_finished_apps"); action = true; } } return action; } // Returns true if all the input files for a result are present // (both WU and app version) // false otherwise // bool CLIENT_STATE::input_files_available(RESULT* rp) { WORKUNIT* wup = rp->wup; FILE_INFO* fip; unsigned int i; APP_VERSION* avp; FILE_REF fr; PROJECT* project = rp->project; avp = wup->avp; for (i=0; iapp_files.size(); i++) { fr = avp->app_files[i]; fip = fr.file_info; if (fip->status != FILE_PRESENT) return false; // don't check file size for anonymous platform // if (!project->anonymous_platform) { if (fip->verify_file(false)) return false; } } for (i=0; iinput_files.size(); i++) { fip = wup->input_files[i].file_info; if (fip->status != FILE_PRESENT) return false; if (fip->verify_file(false)) return false; } return true; } // Choose a "best" runnable result for each project // // Values are returned in project->next_runnable_result // (skip projects for which this is already non-NULL) // // Don't choose results with already_selected == true; // mark chosen results as already_selected. // // The preference order: // 1. results with active tasks that are running // 2. results with active tasks that are preempted (but have a process) // 3. results with active tasks that have no process // 4. results with no active task // void CLIENT_STATE::assign_results_to_projects() { unsigned int i; RESULT* rp; PROJECT* project; // scan results with an ACTIVE_TASK // for (i=0; iresult; if (rp->already_selected) continue; if (!rp->runnable()) continue; project = rp->project; if (!project->next_runnable_result) { project->next_runnable_result = rp; continue; } // see if this task is "better" than the one currently // selected for this project // ACTIVE_TASK *next_atp = lookup_active_task_by_result( project->next_runnable_result ); assert(next_atp != NULL); if ((next_atp->task_state == PROCESS_UNINITIALIZED && atp->process_exists()) || (next_atp->scheduler_state == CPU_SCHED_PREEMPTED && atp->scheduler_state == CPU_SCHED_SCHEDULED) ) { project->next_runnable_result = atp->result; } } // Now consider results that don't have an active task // for (i=0; ialready_selected) continue; if (lookup_active_task_by_result(rp)) continue; if (!rp->runnable()) continue; project = rp->project; if (project->next_runnable_result) continue; project->next_runnable_result = rp; } // mark selected results, so CPU scheduler won't try to consider // a result more than once // for (i=0; inext_runnable_result) { project->next_runnable_result->already_selected = true; } } } // if there's not an active task for the result, make one // int CLIENT_STATE::schedule_result(RESULT* rp) { ACTIVE_TASK *atp = lookup_active_task_by_result(rp); if (!atp) { atp = new ACTIVE_TASK; atp->init(rp); atp->slot = active_tasks.get_free_slot(); get_slot_dir(atp->slot, atp->slot_dir); active_tasks.active_tasks.push_back(atp); } atp->next_scheduler_state = CPU_SCHED_SCHEDULED; return 0; } // Schedule an active task for the project with the largest anticipated debt // among those that have a runnable result. // Return true iff a task was scheduled. // bool CLIENT_STATE::schedule_largest_debt_project(double expected_pay_off) { PROJECT *best_project = NULL; double best_debt = -MAX_DEBT; bool first = true; unsigned int i; for (i=0; inext_runnable_result) continue; if (p->non_cpu_intensive) continue; if (first || p->anticipated_debt > best_debt) { first = false; best_project = p; best_debt = p->anticipated_debt; } } if (!best_project) return false; schedule_result(best_project->next_runnable_result); best_project->anticipated_debt -= expected_pay_off; best_project->next_runnable_result = 0; return true; } // Schedule the active task with the earliest deadline // Return true iff a task was scheduled. // bool CLIENT_STATE::schedule_earliest_deadline_result() { PROJECT *best_project = NULL; RESULT *best_result = NULL; double earliest_deadline=0; bool first = true; unsigned int i; for (i=0; i < results.size(); ++i) { RESULT *rp = results[i]; if (!rp->runnable()) continue; if (rp->project->non_cpu_intensive) continue; if (rp->already_selected) continue; if (first || rp->report_deadline < earliest_deadline) { first = false; best_project = rp->project; best_result = rp; earliest_deadline = rp->report_deadline; } } if (!best_result) return false; // msg_printf(0, MSG_INFO, "earliest deadline: %f %s", earliest_deadline, best_result->name); schedule_result(best_result); best_result->already_selected = true; return true; } // find total resource shares of all projects // double CLIENT_STATE::total_resource_share() { double x = 0; for (unsigned int i=0; inon_cpu_intensive ) { x += projects[i]->resource_share; } } return x; } // same, but only runnable projects (can use CPU right now) // double CLIENT_STATE::runnable_resource_share() { double x = 0; for (unsigned int i=0; irunnable()) { x += p->resource_share; } } return x; } // same, but potentially runnable (could ask for work right now) // double CLIENT_STATE::potentially_runnable_resource_share() { double x = 0; for (unsigned int i=0; ipotentially_runnable()) { x += p->resource_share; } } return x; } // adjust project debts (short, long-term) // void CLIENT_STATE::adjust_debts() { unsigned int i; double total_long_term_debt = 0; double total_short_term_debt = 0; double prrs, rrs; int nprojects = 0; PROJECT *p; double share_frac; double wall_cpu_time = gstate.now - cpu_sched_last_time; SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK); // Total up total and per-project "wall CPU" since last CPU reschedule. // "Wall CPU" is the wall time during which a task was // runnable (at the OS level). // // We use wall CPU for debt calculation // (instead of reported actual CPU) for two reasons: // 1) the process might have paged a lot, so the actual CPU // may be a lot less than wall CPU // 2) BOINC relies on apps to report their CPU time. // Sometimes there are bugs and apps report zero CPU. // It's safer not to trust them. // for (i=0; ischeduler_state != CPU_SCHED_SCHEDULED) continue; if (atp->non_cpu_intensive) continue; atp->result->project->wall_cpu_time_this_period += wall_cpu_time; total_wall_cpu_time_this_period += wall_cpu_time; total_cpu_time_this_period += atp->current_cpu_time - atp->cpu_time_at_last_sched; } time_stats.update_cpu_efficiency( total_wall_cpu_time_this_period, total_cpu_time_this_period ); rrs = runnable_resource_share(); prrs = potentially_runnable_resource_share(); for (i=0; inon_cpu_intensive) continue; nprojects++; // adjust long-term debts // if (p->potentially_runnable()) { share_frac = p->resource_share/prrs; p->long_term_debt += share_frac*total_wall_cpu_time_this_period - p->wall_cpu_time_this_period ; } total_long_term_debt += p->long_term_debt; // adjust short term debts // if (p->runnable()) { share_frac = p->resource_share/rrs; p->short_term_debt += share_frac*total_wall_cpu_time_this_period - p->wall_cpu_time_this_period ; total_short_term_debt += p->short_term_debt; } else { p->short_term_debt = 0; p->anticipated_debt = 0; } scope_messages.printf( "CLIENT_STATE::adjust_debts(): project %s: short-term debt %f\n", p->project_name, p->short_term_debt ); } // long-term debt: // normalize so mean is zero, // short-term debt: // normalize so mean is zero, and limit abs value at MAX_DEBT // double avg_long_term_debt = total_long_term_debt / nprojects; double avg_short_term_debt = total_short_term_debt / nprojects; for (i=0; inon_cpu_intensive) continue; if (p->runnable()) { p->short_term_debt -= avg_short_term_debt; if (p->short_term_debt > MAX_DEBT) { p->short_term_debt = MAX_DEBT; } if (p->short_term_debt < -MAX_DEBT) { p->short_term_debt = -MAX_DEBT; } p->anticipated_debt = p->short_term_debt; //msg_printf(p, MSG_INFO, "debt %f", p->short_term_debt); } p->long_term_debt -= avg_long_term_debt; } } // Schedule active tasks to be run and preempted. // This is called in the do_something() loop // bool CLIENT_STATE::schedule_cpus() { double expected_pay_off; ACTIVE_TASK *atp; PROJECT *p; int retval, j; double vm_limit, elapsed_time; unsigned int i; SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_TASK); if (projects.size() == 0) return false; if (results.size() == 0) return false; // Reschedule every cpu_sched_period seconds, // or if must_schedule_cpus is set // (meaning a new result is available, or a CPU has been freed). // elapsed_time = gstate.now - cpu_sched_last_time; if (must_schedule_cpus) { must_schedule_cpus = false; scope_messages.printf("CLIENT_STATE::schedule_cpus(): must schedule\n"); } else { if (elapsed_time < (global_prefs.cpu_scheduling_period_minutes*60)) { return false; } scope_messages.printf("CLIENT_STATE::schedule_cpus(): time %f\n", elapsed_time); } // mark file xfer results as completed; // TODO: why do this here?? // handle_file_xfer_apps(); // clear temporary variables // for (i=0; inext_runnable_result = NULL; } for (i=0; ialready_selected = false; } set_scheduler_modes(); adjust_debts(); // mark active tasks as preempted // MUST DO THIS AFTER accumulate_work() // for (i=0; inon_cpu_intensive) { atp->next_scheduler_state = CPU_SCHED_SCHEDULED; } else { atp->next_scheduler_state = CPU_SCHED_PREEMPTED; } } expected_pay_off = total_wall_cpu_time_this_period / ncpus; for (j=0; jproject->non_cpu_intensive && rp->runnable()) { schedule_result(rp); } } // preempt, start, and resume tasks // vm_limit = (global_prefs.vm_max_used_pct/100.)*host_info.m_swap; for (i=0; iresult->project->project_name, atp->result->name, atp->scheduler_state); if (atp->scheduler_state == CPU_SCHED_SCHEDULED && atp->next_scheduler_state == CPU_SCHED_PREEMPTED ) { bool preempt_by_quit = !global_prefs.leave_apps_in_memory; preempt_by_quit |= active_tasks.vm_limit_exceeded(vm_limit); atp->preempt(preempt_by_quit); } else if (atp->scheduler_state != CPU_SCHED_SCHEDULED && atp->next_scheduler_state == CPU_SCHED_SCHEDULED ) { retval = atp->resume_or_start(); if (retval) { report_result_error( *(atp->result), "Couldn't start or resume: %d", retval ); request_schedule_cpus("start failed"); continue; } atp->scheduler_state = CPU_SCHED_SCHEDULED; app_started = gstate.now; } atp->cpu_time_at_last_sched = atp->current_cpu_time; } // reset work accounting // doing this at the end of schedule_cpus() because // wall_cpu_time_this_period's can change as apps finish // for (i=0; iwall_cpu_time_this_period = 0; } total_wall_cpu_time_this_period = 0; total_cpu_time_this_period = 0; cpu_sched_last_time = gstate.now; set_client_state_dirty("schedule_cpus"); return true; } // This is called when the client is initialized. // Try to restart any tasks that were running when we last shut down. // int CLIENT_STATE::restart_tasks() { return active_tasks.restart_tasks(ncpus); } void CLIENT_STATE::set_ncpus() { if (host_info.p_ncpus > 0) { ncpus = host_info.p_ncpus; } else { ncpus = 1; } if (ncpus > global_prefs.max_cpus) ncpus = global_prefs.max_cpus; } inline double force_fraction(double f) { if (f < 0) return 0; if (f > 1) return 1; return f; } double CLIENT_STATE::get_fraction_done(RESULT* result) { ACTIVE_TASK* atp = active_tasks.lookup_result(result); return atp ? force_fraction(atp->fraction_done) : 0.0; } // Decide which app version to use for a WU. // Return -1 if can't find one // int CLIENT_STATE::choose_version_num(char* app_name, SCHEDULER_REPLY& sr) { unsigned int i; int best = -1; APP_VERSION* avp; // First look in the scheduler reply // for (i=0; iapp_name)) { return avp->version_num; } } // If not there, use the latest one in our state // for (i=0; iapp_name, app_name)) continue; if (avp->version_num < best) continue; best = avp->version_num; } if (best < 0) { msg_printf(0, MSG_ERROR, "CLIENT_STATE::latest_version_num: no version\n"); } return best; } // handle file-transfer applications // void CLIENT_STATE::handle_file_xfer_apps() { unsigned int i; for (i=0; iwup->avp->app_files.size() == 0 && rp->state == RESULT_FILES_DOWNLOADED) { rp->state = RESULT_FILES_UPLOADING; rp->reset_files(); } } } void CLIENT_STATE::request_schedule_cpus(const char* where) { must_schedule_cpus = true; msg_printf(0, MSG_INFO, "request_reschedule_cpus: %s", where); } const char *BOINC_RCSID_7bf63ad771 = "$Id$";