// Berkeley Open Infrastructure for Network Computing // http://boinc.berkeley.edu // Copyright (C) 2005 University of California // // This is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; // either version 2.1 of the License, or (at your option) any later version. // // This software is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU Lesser General Public License for more details. // // To view the GNU Lesser General Public License visit // http://www.gnu.org/copyleft/lesser.html // or write to the Free Software Foundation, Inc., // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // scheduler code related to sending "lost" work // (i.e. results we sent to the host, but which they're not reporting) // // TODO: // - make sure result is still needed (no canonical result yet) // - don't send if project has been reset since first send; // this result may have been the cause of reset // (need to pass last reset time from client) #include "config.h" #include "error_numbers.h" #include "server_types.h" #include "sched_shmem.h" #include "sched_config.h" #include "sched_util.h" #include "sched_msgs.h" #include "sched_send.h" #include "sched_locality.h" #include "sched_resend.h" #ifdef _USING_FCGI_ #include "fcgi_stdio.h" #else #define FCGI_ToFILE(x) (x) #endif // Assign a new deadline for the result; // if it's not likely to complete by this time, return nonzero. // TODO: EXPLAIN THE FORMULA FOR NEW DEADLINE // static int possibly_give_result_new_deadline( DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REPLY& reply ) { const double resend_frac = 0.5; // range [0, 1) int now = time(0); int result_report_deadline = now + (int)(resend_frac*(result.report_deadline - result.sent_time)); if (result_report_deadline < result.report_deadline) { result_report_deadline = result.report_deadline; } if (result_report_deadline > now + wu.delay_bound) { result_report_deadline = now + wu.delay_bound; } // If infeasible, return without modifying result // if (estimate_cpu_duration(wu, reply) > result_report_deadline-now) { log_messages.printf( SCHED_MSG_LOG::MSG_DEBUG, "[RESULT#%d] [HOST#%d] not resending lost result: can't complete in time\n", result.id, reply.host.id ); return 1; } // update result with new report time and sent time // log_messages.printf( SCHED_MSG_LOG::MSG_DEBUG, "[RESULT#%d] [HOST#%d] %s report_deadline (resend lost work)\n", result.id, reply.host.id, result_report_deadline==result.report_deadline?"NO update to":"Updated" ); result.sent_time = now; result.report_deadline = result_report_deadline; return 0; } // resend any jobs that: // 1) we already sent to this host; // 2) are still in progress (i.e. haven't timed out) and // 3) aren't present on the host // Return true if there were any such jobs // bool resend_lost_work( SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM_LIST& platforms, SCHED_SHMEM& ss ) { DB_RESULT result; std::vectorresults; unsigned int i; char buf[256]; char warning_msg[256]; bool did_any = false; int num_eligible_to_resend=0; int num_resent=0; APP* app; APP_VERSION* avp; int retval; sprintf(buf, " where hostid=%d and server_state=%d ", reply.host.id, RESULT_SERVER_STATE_IN_PROGRESS ); while (!result.enumerate(buf)) { bool found = false; for (i=0; i