diff --git a/checkin_notes b/checkin_notes
index 8a5dca6c4a..645b1f92da 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -6699,3 +6699,18 @@ Bruce 17 May 2005
     inc/
         forum.inc
 
+David 17 May 2005
+    - (from Bruce Allen) fix bug in scheduler in case where
+        host has been merged,
+        and results are reported with old (zombie) host ID.
+        Results in the DB have already been updated to new hostid;
+        compare with that ID, not old ID
+    - move mylockf() to sched_util.C
+        (eliminate duplicate definition in scheduler/file upload handler)
+    - move delete_file_from_host() to sched_locality.C
+
+    sched/
+        file_upload_handler.C
+        handle_request.C
+        sched_locality.C,h
+        sched_util.C,h
diff --git a/sched/file_upload_handler.C b/sched/file_upload_handler.C
index 51768607f6..cda79ecc67 100644
--- a/sched/file_upload_handler.C
+++ b/sched/file_upload_handler.C
@@ -131,25 +131,6 @@ int return_success(const char* text) {
     return 0;
 }
 
-// Lock entire file for writing.
-// returns zero on success, else PID of process currently holding lock, or -1
-// if something else is wrong, for example a bad file descriptor
-//
-int mylockf(int fd) {
-    struct flock fl;
-    fl.l_type=F_WRLCK;
-    fl.l_whence=SEEK_SET;
-    fl.l_start=0;
-    fl.l_len=0;
-    if (-1 != fcntl(fd, F_SETLK, &fl)) return 0;
-
-    // if lock failed, find out why
-    errno=0;
-    fcntl(fd, F_GETLK, &fl);
-    if (fl.l_pid>0) return fl.l_pid;
-    return -1;
-}
-
 #define BLOCK_SIZE 16382
 
 double bytes_left=-1;
diff --git a/sched/handle_request.C b/sched/handle_request.C
index 7bb10850ca..bf58967d00 100644
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@@ -38,38 +38,21 @@ using namespace std;
 #include "error_numbers.h"
 #include "parse.h"
 #include "util.h"
+#include "filesys.h"
+
+#include "main.h"
 #include "server_types.h"
 #include "sched_util.h"
-#include "main.h"
 #include "handle_request.h"
 #include "sched_msgs.h"
 #include "sched_send.h"
 #include "sched_config.h"
-#include "../lib/filesys.h"
+#include "sched_locality.h"
 
 #ifdef _USING_FCGI_
#include "fcgi_stdio.h" #endif -// returns zero if we get lock on file with file descriptor fd. -// returns < 0 if error -// returns PID > 0 if another process has lock -// -int mylockf(int fd) { - struct flock fl; - fl.l_type=F_WRLCK; - fl.l_whence=SEEK_SET; - fl.l_start=0; - fl.l_len=0; - if (-1 != fcntl(fd, F_SETLK, &fl)) return 0; - - // if lock failed, find out why - errno=0; - fcntl(fd, F_GETLK, &fl); - if (fl.l_pid>0) return fl.l_pid; - return -1; -} - // use advisory locking to establish a lock to run a scheduler // instance for this host. Return values same as mylockf(). // @@ -119,9 +102,14 @@ int unmunge_email_addr(DB_USER& user) { return 0; } -// Look up the host and its user, and make sure the authenticator matches. -// If no host ID is supplied, or if RPC seqno mismatch, -// create a new host record and return its ID +// Based on the info in the request message, +// look up the host and its user, and make sure the authenticator matches. +// Some special cases: +// 1) If no host ID is supplied, or if RPC seqno mismatch, +// create a new host record +// 2) If the host record specified by sreq.hostid is a "zombie" +// (i.e. 
it was merged with another host via the web site) +// then follow links to find the proper host // // POSTCONDITION: // If this returns zero, then: @@ -456,9 +444,7 @@ int handle_global_prefs(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { // New handle completed results // -int handle_results( - SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply -) { +int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { DB_SCHED_RESULT_ITEM_SET result_handler; SCHED_RESULT_ITEM* srip; unsigned int i; @@ -548,7 +534,7 @@ int handle_results( continue; } - if (srip->hostid != sreq.hostid) { + if (srip->hostid != reply.host.id) { log_messages.printf( SCHED_MSG_LOG::CRITICAL, "[HOST#%d] [RESULT#%d %s] got result from wrong host; expected [HOST#%d]\n", @@ -627,7 +613,7 @@ int handle_results( srip->validate_state = VALIDATE_STATE_INVALID; reply.got_bad_result(); } - } // end of loop over all incoming results + } // loop over all incoming results // update all the results we have kept in memory, by storing to database. @@ -1056,10 +1042,6 @@ void process_request( goto leave; } -#if 0 - notify_if_newer_core_version(sreq, reply, *platform, ss); -#endif - handle_global_prefs(sreq, reply); #if 0 @@ -1108,67 +1090,9 @@ leave: } } - -// returns zero if there is a file we can delete. -// -int delete_file_from_host(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply) { - int nfiles = (int)sreq.file_infos.size(); - char buf[256]; - - if (!nfiles) { - - log_messages.printf( - SCHED_MSG_LOG::CRITICAL, - "[HOST#%d]: no disk space but no files we can delete!\n", sreply.host.id - ); - - sprintf(buf, - "No disk space (you must free %.1f MB before BOINC gets space). 
", - fabs(max_allowable_disk(sreq, sreply))/1.e6 - ); - - if (sreply.disk_limits.max_used != 0.0) { - strcat(buf, "Review preferences for maximum disk space used."); - } else if (sreply.disk_limits.max_frac != 0.0) { - strcat(buf, "Review preferences for maximum disk percentage used."); - } else if (sreply.disk_limits.min_free != 0.0) { - strcat(buf, "Review preferences for minimum disk free space allowed."); - } - USER_MESSAGE um(buf, "high"); - sreply.insert_message(um); - sreply.set_delay(24*3600); - return 1; - } - - // pick a data file to delete. - // Do this deterministically so that we always tell host to delete the same file. - // But to prevent all hosts from removing 'the same' file, - // choose a file which depends upon the hostid. - // - // Assumption is that if nothing has changed on the host, - // the order in which it reports files is fixed. - // If this is false, we need to sort files into order by name! - // - int j = sreply.host.id % nfiles; - FILE_INFO& fi = sreq.file_infos[j]; - sreply.file_deletes.push_back(fi); - log_messages.printf( - SCHED_MSG_LOG::DEBUG, - "[HOST#%d]: delete file %s (make space)\n", sreply.host.id, fi.name - ); - - // give host 4 hours to nuke the file and come back. - // This might in general be too soon, since host needs to complete any work - // that depends upon this file, before it will be removed by core client. 
-    //
-    sprintf(buf, "Removing file %s to free up disk space", fi.name);
-    USER_MESSAGE um(buf, "low");
-    sreply.insert_message(um);
-    sreply.set_delay(4*3600);
-    return 0;
-}
-
-void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger) {
+void debug_sched(
+    SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger
+) {
     char tmpfilename[256];
     FILE *fp;
 
@@ -1272,6 +1196,7 @@ void handle_request(
 #endif
 
     // if we got no work, and we have no file space, delete some files
+    //
     if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
         // try to delete a file to make more space.
         // Also give some hints to the user about what's going wrong
diff --git a/sched/sched_locality.C b/sched/sched_locality.C
index f2f14090b0..4ddfc43457 100644
--- a/sched/sched_locality.C
+++ b/sched/sched_locality.C
@@ -40,6 +40,79 @@
 
 #define VERBOSE_DEBUG
 
+// Tell the host to delete one of its files to make disk space.
+// If the host reported no files: send a "high" priority message,
+// set a 24 hour delay and return 1.
+// Otherwise: queue one reported file in sreply.file_deletes,
+// send a "low" priority message, set a 4 hour delay and return 0.
+//
+int delete_file_from_host(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply) {
+    int nfiles = (int)sreq.file_infos.size();
+    char buf[256];
+
+    if (!nfiles) {
+        log_messages.printf(
+            SCHED_MSG_LOG::CRITICAL,
+            "[HOST#%d]: no disk space but no files we can delete!\n", sreply.host.id
+        );
+
+        // bounded writes below: never write past the end of buf
+        //
+        snprintf(buf, sizeof(buf),
+            "No disk space (you must free %.1f MB before BOINC gets space). ",
+            fabs(max_allowable_disk(sreq, sreply))/1.e6
+        );
+
+        // append a hint for the first nonzero disk preference, if any
+        //
+        const char* hint = "";
+        if (sreply.disk_limits.max_used != 0.0) {
+            hint = "Review preferences for maximum disk space used.";
+        } else if (sreply.disk_limits.max_frac != 0.0) {
+            hint = "Review preferences for maximum disk percentage used.";
+        } else if (sreply.disk_limits.min_free != 0.0) {
+            hint = "Review preferences for minimum disk free space allowed.";
+        }
+        strncat(buf, hint, sizeof(buf)-strlen(buf)-1);
+
+        USER_MESSAGE um(buf, "high");
+        sreply.insert_message(um);
+        sreply.set_delay(24*3600);
+        return 1;
+    }
+
+    // pick a data file to delete.
+    // Do this deterministically so that we always tell host
+    // to delete the same file.
+    // But to prevent all hosts from removing 'the same' file,
+    // choose a file which depends upon the hostid.
+    //
+    // Assumption is that if nothing has changed on the host,
+    // the order in which it reports files is fixed.
+    // If this is false, we need to sort files into order by name!
+    //
+    int j = sreply.host.id % nfiles;
+    FILE_INFO& fi = sreq.file_infos[j];
+    sreply.file_deletes.push_back(fi);
+    log_messages.printf(
+        SCHED_MSG_LOG::DEBUG,
+        "[HOST#%d]: delete file %s (make space)\n", sreply.host.id, fi.name
+    );
+
+    // give host 4 hours to nuke the file and come back.
+    // This might in general be too soon, since host needs to complete any work
+    // that depends upon this file, before it will be removed by core client.
+    //
+    // fi.name comes from the request; snprintf truncates instead of
+    // overflowing buf if the name is long.
+    //
+    snprintf(buf, sizeof(buf), "Removing file %s to free up disk space", fi.name);
+    USER_MESSAGE um(buf, "low");
+    sreply.insert_message(um);
+    sreply.set_delay(4*3600);
+    return 0;
+}
+
 // returns true if the host already has the file, or if the file is
 // included with a previous result being sent to this host.
 //
diff --git a/sched/sched_locality.h b/sched/sched_locality.h
index eeef5478ca..d6ab6448d6 100644
--- a/sched/sched_locality.h
+++ b/sched/sched_locality.h
@@ -21,7 +21,10 @@ extern void send_work_locality(
     SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, PLATFORM& platform,
     SCHED_SHMEM& ss
 );
+
 extern int decrement_disk_space_locality(
     WORKUNIT& wu, SCHEDULER_REQUEST& request,
     SCHEDULER_REPLY& reply
 );
+
+extern int delete_file_from_host(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
diff --git a/sched/sched_util.C b/sched/sched_util.C
index 4aa1d9cbab..28d685610b 100644
--- a/sched/sched_util.C
+++ b/sched/sched_util.C
@@ -25,6 +25,7 @@ using namespace std;
 #include
 #include
 #include
+#include
 
 #include "filesys.h"
 #include "md5_file.h"
@@ -233,4 +234,23 @@ int elapsed_time() {
     return (int)(time(0)-execution_time);
 }
 
+// returns zero if we get lock on file with file descriptor fd.
+// returns < 0 if error (including: holder of the lock could not be found)
+// returns PID > 0 if another process has lock
+//
+int mylockf(int fd) {
+    struct flock fl;
+    fl.l_type=F_WRLCK;
+    fl.l_whence=SEEK_SET;
+    fl.l_start=0;
+    fl.l_len=0;
+    if (-1 != fcntl(fd, F_SETLK, &fl)) return 0;
+
+    // lock failed: find holder; l_pid is valid only if F_GETLK saw a lock
+    errno=0;
+    if (fcntl(fd, F_GETLK, &fl) == -1) return -1;
+    if (fl.l_type != F_UNLCK && fl.l_pid>0) return fl.l_pid;
+    return -1;
+}
+
 const char *BOINC_RCSID_affa6ef1e4 = "$Id$";
diff --git a/sched/sched_util.h b/sched/sched_util.h
index ef17d5714c..e1cf2f0286 100644
--- a/sched/sched_util.h
+++ b/sched/sched_util.h
@@ -61,5 +61,11 @@ extern void compute_avg_turnaround(HOST& host, double turnaround);
 
 // used to track execution time of cgi scripts
 extern int elapsed_time();
 
+// returns zero if we get lock on file with file descriptor fd.
+// returns < 0 if error
+// returns PID > 0 if another process has lock
+//
+extern int mylockf(int fd);
+
 #endif