From d2784ffc4b7ab9840b8f8c56e80a8b5f22fbd496 Mon Sep 17 00:00:00 2001 From: Rom Walton Date: Tue, 3 Jan 2012 19:06:39 +0000 Subject: [PATCH] - VBOX: Per tester feedback, at an hourly status report to track network usage. - VBOX: For errors where an error code is specified in the output, use that value when returning from vbm_open(). Make it easier to track failure rates by problem type in the ops pages. (Windows Only) - VBOX: Try making the process of extracting the hypervisor system log more robust, make a temp copy of it in the slot directory. Avoid exclusive file lock issues since VboxSvc will still be logging to it. - VBOX: Change the session lock text to make it clearer that the use of other VirtualBox management tools can cause a problem. samples/vboxwrapper/ vbox.cpp, .h vboxwrapper.cpp svn path=/trunk/boinc/; revision=24990 --- checkin_notes | 21 +++- samples/vboxwrapper/vbox.cpp | 105 ++++++++++++++---- samples/vboxwrapper/vbox.h | 2 - samples/vboxwrapper/vboxwrapper.cpp | 161 ++++++++++++++++++---------- 4 files changed, 209 insertions(+), 80 deletions(-) diff --git a/checkin_notes b/checkin_notes index 070fbf675e..b15fdb0113 100644 --- a/checkin_notes +++ b/checkin_notes @@ -124,4 +124,23 @@ Rom 3 Jan 2012 They didn't wait for the next OpenCL spec change. client/ - coproc_detect.cpp \ No newline at end of file + coproc_detect.cpp + +Rom 3 Jan 2012 + - VBOX: Per tester feedback, at an hourly status report to track + network usage. + - VBOX: For errors where an error code is specified in the output, + use that value when returning from vbm_open(). Make it easier + to track failure rates by problem type in the ops pages. + (Windows Only) + - VBOX: Try making the process of extracting the hypervisor system + log more robust, make a temp copy of it in the slot directory. + Avoid exclusive file lock issues since VboxSvc will still be + logging to it. + - VBOX: Change the session lock text to make it clearer that the + use of other VirtualBox management tools can cause a problem. + + + samples/vboxwrapper/ + vbox.cpp, .h + vboxwrapper.cpp diff --git a/samples/vboxwrapper/vbox.cpp b/samples/vboxwrapper/vbox.cpp index a1e17dd4d9..9985bfea93 100644 --- a/samples/vboxwrapper/vbox.cpp +++ b/samples/vboxwrapper/vbox.cpp @@ -63,7 +63,6 @@ VBOX_VM::VBOX_VM() { job_duration = 0.0; suspended = false; network_suspended = false; - startup_completed = false; online = false; crashed = false; enable_cern_dataformat = false; @@ -119,10 +118,7 @@ int VBOX_VM::run() { boinc_sleep(1.0); } while (timeout <= dtime()); - // From here on out we don't need to report the system log when a vboxmanage - // command fails. - startup_completed = true; - + if (!online) return ERR_EXEC; return 0; } @@ -163,17 +159,19 @@ int VBOX_VM::vbm_popen(string& arguments, string& output, const char* item, bool if ((output.find("VBOX_E_INVALID_OBJECT_STATE") != string::npos) && (output.find("already locked") != string::npos)) { - if (retry_notes.find("already locked") == string::npos) { - retry_notes += "Virtual Machine session already locked.\n"; + if (retry_notes.find("Another VirtualBox management") == string::npos) { + retry_notes += "Another VirtualBox management application has locked the session for\n"; + retry_notes += "this virtual machine. BOINC cannot properly monitor this virtual machine\n"; + retry_notes += "and so this job will be aborted.\n\n"; } } - // Timeout? - if (retry_count >= 6) break; - // Retry? if (!retry_failures) break; + // Timeout? + if (retry_count >= 6) break; + retry_count++; boinc_sleep(5.0); } @@ -196,12 +194,6 @@ int VBOX_VM::vbm_popen(string& arguments, string& output, const char* item, bool output += "\nNotes:\n\n" + retry_notes; } - if (!startup_completed) { - string system_log; - get_system_log(system_log); - output += "\nHypervisor System Log:\n\n" + system_log; - } - fprintf( stderr, "%s Error in %s for VM: %d\nArguments:\n%s\nOutput:\n%s\n", @@ -240,6 +232,9 @@ int VBOX_VM::vbm_popen_raw(string& arguments, string& output) { DWORD dwCount = 0; unsigned long ulExitCode = 0; unsigned long ulExitTimeout = 0; + size_t errcode_start; + size_t errcode_end; + string errcode; memset(&si, 0, sizeof(si)); memset(&pi, 0, sizeof(pi)); @@ -335,7 +330,17 @@ CLEANUP: if (hWritePipe) CloseHandle(hWritePipe); if ((ulExitCode != 0) || (!pi.hProcess)) { - retval = ERR_FOPEN; + + // Determine the real error code by parsing the output + errcode_start = output.find("(0x"); + errcode_start += 1; + errcode_end = output.find(")", errcode_start); + errcode = output.substr(errcode_start, errcode_end - errcode_start); + + retval = atol(errcode.c_str()); + + // If something couldn't be found, just return ERR_FOPEN + if (!retval) retval = ERR_FOPEN; } #else @@ -1292,7 +1297,10 @@ int VBOX_VM::get_network_bytes_sent(double& sent) { int VBOX_VM::get_system_log(string& log) { string virtualbox_user_home; - string virtualbox_system_log; + string slot_directory; + string virtualbox_system_log_src; + string virtualbox_system_log_dst; + char buf[256]; int retval = 0; // Where is VirtualBox storing its configuration files? @@ -1309,12 +1317,47 @@ int VBOX_VM::get_system_log(string& log) { virtualbox_user_home += "/.VirtualBox"; } + // Where should we copy temp files to? + get_slot_directory(slot_directory); + // Locate and read log file - virtualbox_system_log = virtualbox_user_home + "/VBoxSVC.log"; - if (boinc_file_exists(virtualbox_system_log.c_str())) { + virtualbox_system_log_src = virtualbox_user_home + "/VBoxSVC.log"; + virtualbox_system_log_dst = slot_directory + "/VBoxSVC.log"; + + if (boinc_file_exists(virtualbox_system_log_src.c_str())) { + // Skip having to deal with various forms of file locks by just making a temp + // copy of the log file. + boinc_copy(virtualbox_system_log_src.c_str(), virtualbox_system_log_dst.c_str()); + // Keep only the last 16k if it is larger than that. - retval = read_file_string(virtualbox_system_log.c_str(), log, 16384, true); + read_file_string(virtualbox_system_log_dst.c_str(), log, 16384, true); + +#ifdef _WIN32 + // Remove \r from the log spew + for (string::iterator iter = log.begin(); iter != log.end(); ++iter) { + if (*iter == '\r') { + log.erase(iter); + } + } +#endif + + if (log.size() >= 16384) { + // Look for the next whole line of text. + for (string::iterator iter = log.begin(); iter != log.end(); ++iter) { + if (*iter == '\n') { + log.erase(iter); + break; + } + log.erase(iter); + } + } } else { + fprintf( + stderr, + "%s Could not find the system log at '%s'.\n", + boinc_msg_prefix(buf, sizeof(buf)), + virtualbox_system_log_src.c_str() + ); retval = ERR_NOT_FOUND; } @@ -1336,6 +1379,26 @@ int VBOX_VM::get_vm_log(string& log) { size_t size = output.size(); if (size > 16384) { log = output.substr(size - 16384, size); + +#ifdef _WIN32 + // Remove \r from the log spew + for (string::iterator iter = log.begin(); iter != log.end(); ++iter) { + if (*iter == '\r') { + log.erase(iter); + } + } +#endif + + if (log.size() >= 16384) { + // Look for the next whole line of text. + for (string::iterator iter = log.begin(); iter != log.end(); ++iter) { + if (*iter == '\n') { + log.erase(iter); + break; + } + log.erase(iter); + } + } } else { log = output; } diff --git a/samples/vboxwrapper/vbox.h b/samples/vboxwrapper/vbox.h index 26df081a8d..6078ab57f8 100644 --- a/samples/vboxwrapper/vbox.h +++ b/samples/vboxwrapper/vbox.h @@ -51,8 +51,6 @@ struct VBOX_VM { bool suspended; // is network access temporarily suspended? bool network_suspended; - // has the VM successfully completed the startup code - bool startup_completed; // is VM even online? bool online; // Has the VM crashed? diff --git a/samples/vboxwrapper/vboxwrapper.cpp b/samples/vboxwrapper/vboxwrapper.cpp index ec90d0c745..5d417853d7 100644 --- a/samples/vboxwrapper/vboxwrapper.cpp +++ b/samples/vboxwrapper/vboxwrapper.cpp @@ -119,17 +119,15 @@ int parse_job_file(VBOX_VM& vm) { } void write_checkpoint(double cpu) { - boinc_begin_critical_section(); FILE* f = fopen(CHECKPOINT_FILENAME, "w"); if (!f) return; fprintf(f, "%f\n", cpu); fclose(f); - boinc_checkpoint_completed(); } void read_checkpoint(double& cpu) { double c; - cpu = 0; + cpu = 0.0; FILE* f = fopen(CHECKPOINT_FILENAME, "r"); if (!f) return; int n = fscanf(f, "%lf", &c); @@ -204,18 +202,55 @@ void set_floppy_image(APP_INIT_DATA& aid, VBOX_VM& vm) { } } +// set port forwarding information if needed +// +void set_port_forwarding_info(APP_INIT_DATA& /* aid */, VBOX_VM& vm) { + char buf[256]; + + if (vm.pf_guest_port && vm.pf_host_port) { + fprintf( + stderr, + "%s port forwarding enabled on port '%d'.\n", + boinc_msg_prefix(buf, sizeof(buf)), vm.pf_host_port + ); + + // Write info to disk + // + MIOFILE mf; + FILE* f = boinc_fopen(PORTFORWARD_FILENAME, "w"); + mf.init_file(f); + + mf.printf( + "\n" + " \n" + " %d\n" + " %d\n" + " \n" + "\n", + vm.pf_host_port, + vm.pf_guest_port + ); + + fclose(f); + sprintf(buf, "http://localhost:%d", vm.pf_host_port); + boinc_web_graphics_url(buf); + } +} + int main(int argc, char** argv) { int retval; BOINC_OPTIONS boinc_options; VBOX_VM vm; APP_INIT_DATA aid; double elapsed_time = 0.0; - double checkpoint_cpu_time = 0.0; double trickle_period = 0.0; double trickle_cpu_time = 0.0; double fraction_done = 0.0; + double bytes_sent = 0.0; + double bytes_received = 0.0; + double checkpoint_cpu_time = 0.0; + double last_status_report_time = 0.0; bool report_vm_pid = false; - double bytes_sent=0, bytes_received=0; bool report_net_usage = false; int vm_pid=0; int vm_max_cpus=0; @@ -328,47 +363,32 @@ int main(int argc, char** argv) { elapsed_time = aid.starting_elapsed_time; read_checkpoint(checkpoint_cpu_time); + // Should we even try to start things up? + if (vm.job_duration && (elapsed_time > vm.job_duration)) { + return ERR_RSC_LIMIT_EXCEEDED; + } + retval = vm.run(); if (retval) { vm.cleanup(); - write_checkpoint(checkpoint_cpu_time); + write_checkpoint(elapsed_time); + + vm.get_system_log(system_log); + fprintf( + stderr, + "%s Hypervisor System Log:\n\n" + "%s\n", + boinc_msg_prefix(buf, sizeof(buf)), + system_log.c_str() + ); + boinc_finish(retval); } set_floppy_image(aid, vm); + set_port_forwarding_info(aid, vm); set_throttles(aid, vm); - // write port-forwarding info if relevant - // - if (vm.pf_guest_port && vm.pf_host_port) { - fprintf( - stderr, - "%s port forwarding enabled on port '%d'.\n", - boinc_msg_prefix(buf, sizeof(buf)), vm.pf_host_port - ); - - // Write info to disk - // - MIOFILE mf; - FILE* f = boinc_fopen(PORTFORWARD_FILENAME, "w"); - mf.init_file(f); - - mf.printf( - "\n" - " \n" - " %d\n" - " %d\n" - " \n" - "\n", - vm.pf_host_port, - vm.pf_guest_port - ); - - fclose(f); - sprintf(buf, "http://localhost:%d", vm.pf_host_port); - boinc_web_graphics_url(buf); - } - while (1) { // Discover the VM's current state vm.poll(); @@ -380,7 +400,7 @@ int main(int argc, char** argv) { } if (boinc_status.abort_request) { vm.cleanup(); - write_checkpoint(checkpoint_cpu_time); + write_checkpoint(elapsed_time); boinc_finish(EXIT_ABORTED_BY_CLIENT); } if (!vm.online) { @@ -389,7 +409,7 @@ int main(int argc, char** argv) { vm.get_vm_log(vm_log); } vm.cleanup(); - write_checkpoint(checkpoint_cpu_time); + write_checkpoint(elapsed_time); if (vm.crashed || (elapsed_time < vm.job_duration)) { fprintf( @@ -478,11 +498,14 @@ int main(int argc, char** argv) { if (vm.suspended) { vm.resume(); } + elapsed_time += POLL_PERIOD; + if (!vm_pid) { vm.get_process_id(vm_pid); report_vm_pid = true; } + if (boinc_time_to_checkpoint()) { checkpoint_cpu_time = elapsed_time; write_checkpoint(checkpoint_cpu_time); @@ -492,19 +515,40 @@ int main(int argc, char** argv) { fraction_done = 1.0; } } + if (report_vm_pid || report_net_usage) { + retval = boinc_report_app_status_aux( + elapsed_time, + checkpoint_cpu_time, + fraction_done, + vm_pid, + bytes_sent, + bytes_received + ); + if (!retval) { + report_vm_pid = false; + report_net_usage = false; + } + } else { + boinc_report_app_status( + elapsed_time, + checkpoint_cpu_time, + fraction_done + ); + } + if ((elapsed_time - last_status_report_time) >= 6000.0) { + last_status_report_time = elapsed_time; + fprintf( + stderr, + "%s Status Report: Elapsed Time: '%f', Network Bytes Sent (Total): '%f', Network Bytes Received (Total): '%f'\n", + boinc_msg_prefix(buf, sizeof(buf)), + elapsed_time, + bytes_sent, + bytes_received + ); + } boinc_checkpoint_completed(); } - if (report_vm_pid || report_net_usage) { - retval = boinc_report_app_status_aux( - elapsed_time, checkpoint_cpu_time, fraction_done, vm_pid, bytes_sent, bytes_received - ); - if (!retval) { - report_vm_pid = false; - report_net_usage = false; - } - } else { - boinc_report_app_status(elapsed_time, checkpoint_cpu_time, fraction_done); - } + if (trickle_period) { trickle_cpu_time += POLL_PERIOD; if (trickle_cpu_time >= trickle_period) { @@ -517,12 +561,10 @@ int main(int argc, char** argv) { // if the VM has a maximum amount of time it is allowed to run, // shut it down gacefully and exit. // - if (vm.job_duration > 0.0) { - if (elapsed_time > vm.job_duration) { - vm.cleanup(); - write_checkpoint(checkpoint_cpu_time); - boinc_finish(0); - } + if (vm.job_duration && (elapsed_time > vm.job_duration)) { + vm.cleanup(); + write_checkpoint(elapsed_time); + boinc_finish(0); } } if (vm.enable_network) { @@ -538,6 +580,13 @@ int main(int argc, char** argv) { } if (boinc_status.reread_init_data_file) { boinc_status.reread_init_data_file = false; + + fprintf( + stderr, + "%s Preference change detected\n", + boinc_msg_prefix(buf, sizeof(buf)) + ); + boinc_parse_init_data_file(); boinc_get_init_data_p(&aid); set_throttles(aid, vm);