diff --git a/checkin_notes b/checkin_notes index 9b6a12ac3e..1170d907e4 100755 --- a/checkin_notes +++ b/checkin_notes @@ -16935,3 +16935,24 @@ Rom 2 Sep 2004 client/ cs_apps.C + +David 2 Sept 2004 + - PERS_FILE_XFER::poll(): fixed bug that caused hang + + NOTE: check_giveup(), handle_xfer_failure(), and retry_or_backoff() + are a total mess. I'll get to them next. + + - If an app exits with zero status but no finish file, + instead of putting it in limbo (which causes the core client to do no work) + just restart the process again. + NOTE: eventually we should detect when this is happening repeatedly, + and error out the result. + For now, just print a message telling user they may need to reset project. + - rename PERS_FILE_XFER::xfer_done to pers_xfer_done + - move process cleanup code into separate functions + (ACTIVE_TASK::close_process_handles() + and ACTIVE_TASK::detach_and_destroy_shmem()) + client/ + app.C,h + app_control.C + pers_file_xfer.C diff --git a/client/app.C b/client/app.C index 7ddae4a462..4c01ea74db 100644 --- a/client/app.C +++ b/client/app.C @@ -109,8 +109,8 @@ ACTIVE_TASK::ACTIVE_TASK() { #endif } -ACTIVE_TASK::~ACTIVE_TASK() { #ifdef _WIN32 +void ACTIVE_TASK::close_process_handles() { if (pid_handle) { CloseHandle(pid_handle); pid_handle = NULL; @@ -119,6 +119,10 @@ ACTIVE_TASK::~ACTIVE_TASK() { CloseHandle(thread_handle); thread_handle = NULL; } +} + +ACTIVE_TASK::~ACTIVE_TASK() { + close_process_handles(); if (quitRequestEvent) { CloseHandle(quitRequestEvent); quitRequestEvent = NULL; @@ -130,7 +134,9 @@ ACTIVE_TASK::~ACTIVE_TASK() { detach_shmem(shm_handle, app_client_shm.shm); app_client_shm.shm = NULL; } +} #else +void ACTIVE_TASK::detach_and_destroy_shmem() { // detach from and destroy share mem // if (app_client_shm.shm) { @@ -138,9 +144,13 @@ ACTIVE_TASK::~ACTIVE_TASK() { destroy_shmem(shmem_seg_name); app_client_shm.shm = NULL; } -#endif } +ACTIVE_TASK::~ACTIVE_TASK() { + detach_and_destroy_shmem(); +} +#endif + int ACTIVE_TASK::init(RESULT* rp) { result = rp; wup = rp->wup; diff --git a/client/app.h b/client/app.h index 197f9222e5..7a9ca11a3f 100644 --- a/client/app.h +++ b/client/app.h @@ -145,6 +145,8 @@ public: ACTIVE_TASK(); ~ACTIVE_TASK(); int init(RESULT*); + void close_process_handles(); + void detach_and_destroy_shmem(); int start(bool first_time); // start the task running int request_exit(); // Send a SIGQUIT signal or equivalent diff --git a/client/app_control.C b/client/app_control.C index 0e7a8e6a18..f1a3addfc6 100644 --- a/client/app_control.C +++ b/client/app_control.C @@ -182,7 +182,7 @@ static void limbo_message(ACTIVE_TASK& at) { at.result->name ); msg_printf(at.result->project, MSG_INFO, - "You may need to restart BOINC to finish this result" + "If this happens repeatedly you may need to reset the project." ); } @@ -213,18 +213,16 @@ bool ACTIVE_TASK::handle_exited_app(unsigned long exit_code) { if (pending_suspend_via_quit) { pending_suspend_via_quit = false; state = PROCESS_UNINITIALIZED; - if (pid_handle) { - CloseHandle(pid_handle); - pid_handle = NULL; - } - if (thread_handle) { - CloseHandle(thread_handle); - thread_handle = NULL; - } + close_process_handles(); return true; } if (!finish_file_present()) { +#if 0 state = PROCESS_IN_LIMBO; +#else + state = PROCESS_UNINITIALIZED; + close_process_handles(); +#endif limbo_message(*this); return true; } @@ -274,11 +272,7 @@ bool ACTIVE_TASK::handle_exited_app(int stat, struct rusage rs) { // destroy shm, since restarting app will re-create it // - if (app_client_shm.shm) { - detach_shmem(app_client_shm.shm); - destroy_shmem(shmem_seg_name); - app_client_shm.shm = NULL; - } + detach_and_destroy_shmem(); return true; } if (!finish_file_present()) { @@ -288,7 +282,12 @@ bool ACTIVE_TASK::handle_exited_app(int stat, struct rusage rs) { // and just leave it there // (assume user is about to exit core client) // +#if 0 state = PROCESS_IN_LIMBO; +#else + state = PROCESS_UNINITIALIZED; + detach_and_destroy_shmem(); +#endif limbo_message(*this); return true; } diff --git a/client/cs_files.C b/client/cs_files.C index 21572ef6b0..71a20db7cd 100644 --- a/client/cs_files.C +++ b/client/cs_files.C @@ -204,7 +204,7 @@ bool CLIENT_STATE::handle_pers_file_xfers() { // If the transfer finished, remove the PERS_FILE_XFER object // from the set and delete it // - if (pfx->xfer_done) { + if (pfx->pers_xfer_done) { fip = pfx->fip; if (fip->generated_locally || fip->upload_when_present) { // file has been uploaded - delete if not sticky diff --git a/client/pers_file_xfer.C b/client/pers_file_xfer.C index 5ebc5df4eb..895dafcdb6 100644 --- a/client/pers_file_xfer.C +++ b/client/pers_file_xfer.C @@ -72,7 +72,7 @@ int PERS_FILE_XFER::init(FILE_INFO* f, bool is_file_upload) { fxp = NULL; fip = f; is_upload = is_file_upload; - xfer_done = false; + pers_xfer_done = false; char* p = f->get_init_url(is_file_upload); if (!p) { msg_printf(NULL, MSG_ERROR, "No URL for file transfer of %s", f->name); @@ -109,11 +109,11 @@ int PERS_FILE_XFER::start_xfer() { // signature and checksum match retval = fip->set_permissions(); fip->status = FILE_PRESENT; - xfer_done = true; + pers_xfer_done = true; msg_printf( fip->project, MSG_INFO, - "File %s exists already, skipping download", pathname + "File %s exists already, skipping download", fip->name ); return retval; @@ -144,7 +144,6 @@ int PERS_FILE_XFER::start_xfer() { delete fxp; fxp = NULL; return retval; - // TODO: do we need to do anything here? } retval = gstate.file_xfers->insert(file_xfer); if (retval) { @@ -179,7 +178,7 @@ bool PERS_FILE_XFER::poll(time_t now) { SCOPE_MSG_LOG scope_messages(log_messages, CLIENT_MSG_LOG::DEBUG_FILE_XFER); - if (xfer_done) { + if (pers_xfer_done) { return false; } if (!fxp) { @@ -191,7 +190,7 @@ bool PERS_FILE_XFER::poll(time_t now) { last_time = dtime(); fip->upload_offset = -1; retval = start_xfer(); - return (retval == 0); + return true; } else { return false; } @@ -207,14 +206,15 @@ bool PERS_FILE_XFER::poll(time_t now) { if (fxp->file_xfer_done) { if (fip->nbytes) { - // check if the file was actually downloaded, if not check if there are more urls to try - // if there are no bytes downloaded, than the was probably a wrong url downloaded + // If we know the file size, make sure that what we downloaded + // has the right size + // get_pathname(fip, pathname); file_size(pathname, existing_size); if (existing_size != fip->nbytes) { check_giveup("File downloaded had wrong size"); + return true; } - return false; } scope_messages.printf( "PERS_FILE_XFER::poll(): file transfer status %d", @@ -236,7 +236,7 @@ bool PERS_FILE_XFER::poll(time_t now) { ); } } - xfer_done = true; + pers_xfer_done = true; } else if (fxp->file_xfer_retval == ERR_UPLOAD_PERMANENT) { if (log_flags.file_xfer) { msg_printf( @@ -270,6 +270,9 @@ bool PERS_FILE_XFER::poll(time_t now) { return false; } +// A file transfer (to a particular server) +// has had a +// // Takes a reason why a transfer has failed. // // Checks to see if there are no more valid URLs listed in the file_info. @@ -292,7 +295,7 @@ void PERS_FILE_XFER::check_giveup(char* why) { } else { fip->status = ERR_GIVEUP_DOWNLOAD; } - xfer_done = true; + pers_xfer_done = true; msg_printf( fip->project, MSG_ERROR, "Giving up on %s of %s: %s", is_upload?"upload":"download", fip->name, why @@ -343,7 +346,6 @@ void PERS_FILE_XFER::handle_xfer_failure() { retry_or_backoff(); } } - // Cycle to the next URL, or if we've hit all URLs in this cycle, // backoff and try again later // diff --git a/client/pers_file_xfer.h b/client/pers_file_xfer.h index d860b4f42a..fe9bb25a38 100644 --- a/client/pers_file_xfer.h +++ b/client/pers_file_xfer.h @@ -59,7 +59,7 @@ public: double last_time; // when the above was last updated. // Defined only while a transfer is active - bool xfer_done; + bool pers_xfer_done; FILE_XFER* fxp; // nonzero if file xfer in progress FILE_INFO* fip;