mirror of https://github.com/BOINC/boinc.git
Merge pull request #3181 from BOINC/dpa_vboxwrapper
vboxwrapper: if a VM control operation (suspend, resume, snapshot) fails, exit and restart after 5 minutes instead of 1 day
This commit is contained in:
commit
ea00ec75e3
|
@ -267,44 +267,47 @@ void VBOX_BASE::dump_hypervisor_logs(bool include_error_logs) {
|
|||
if (include_error_logs) {
|
||||
dump_screenshot();
|
||||
fprintf(
|
||||
stderr,
|
||||
"\n"
|
||||
" Hypervisor System Log:\n\n"
|
||||
"%s\n"
|
||||
" VM Execution Log:\n\n"
|
||||
"%s\n"
|
||||
" VM Startup Log:\n\n"
|
||||
"%s\n"
|
||||
" VM Trace Log:\n\n"
|
||||
"%s",
|
||||
local_system_log.c_str(),
|
||||
local_vm_log.c_str(),
|
||||
local_startup_log.c_str(),
|
||||
local_trace_log.c_str()
|
||||
);
|
||||
stderr,
|
||||
"\n"
|
||||
" Hypervisor System Log:\n\n"
|
||||
"%s\n"
|
||||
" VM Execution Log:\n\n"
|
||||
"%s\n"
|
||||
" VM Startup Log:\n\n"
|
||||
"%s\n"
|
||||
" VM Trace Log:\n\n"
|
||||
"%s",
|
||||
local_system_log.c_str(),
|
||||
local_vm_log.c_str(),
|
||||
local_startup_log.c_str(),
|
||||
local_trace_log.c_str()
|
||||
);
|
||||
}
|
||||
|
||||
if (vm_exit_code) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"\n"
|
||||
" VM Exit Code: %d (0x%x)\n\n",
|
||||
(unsigned int)vm_exit_code,
|
||||
(unsigned int)vm_exit_code
|
||||
);
|
||||
fprintf(stderr,
|
||||
"\n"
|
||||
" VM Exit Code: %d (0x%x)\n\n",
|
||||
(unsigned int)vm_exit_code,
|
||||
(unsigned int)vm_exit_code
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void VBOX_BASE::report_clean(bool unrecoverable_error, bool skip_cleanup, bool do_dump_hypervisor_logs,
|
||||
int retval, std::string error_reason,
|
||||
int vm_pid, int temp_delay, std::string temp_reason,
|
||||
double current_cpu_time,
|
||||
double last_checkpoint_cpu_time,
|
||||
double fraction_done,
|
||||
double bytes_sent,
|
||||
double bytes_received) {
|
||||
|
||||
|
||||
void VBOX_BASE::report_clean(
|
||||
bool unrecoverable_error,
|
||||
bool skip_cleanup,
|
||||
bool do_dump_hypervisor_logs,
|
||||
int retval,
|
||||
string error_reason,
|
||||
int temp_delay,
|
||||
string temp_reason,
|
||||
double total_cpu_time,
|
||||
double last_checkpoint_cpu_time,
|
||||
double fraction_done,
|
||||
double bytes_sent,
|
||||
double bytes_received
|
||||
) {
|
||||
if (unrecoverable_error) {
|
||||
|
||||
// Attempt to cleanup the VM and exit.
|
||||
|
@ -321,21 +324,20 @@ void VBOX_BASE::report_clean(bool unrecoverable_error, bool skip_cleanup, bool d
|
|||
}
|
||||
|
||||
boinc_finish(retval);
|
||||
}
|
||||
else {
|
||||
|
||||
// if the VM is already running notify BOINC about the process ID so it can
|
||||
// clean up the environment. We should be safe to run after that.
|
||||
} else {
|
||||
// if the VM is already running notify BOINC about the process ID
|
||||
// so it can clean up the environment.
|
||||
// We should be safe to run after that.
|
||||
//
|
||||
if (vm_pid) {
|
||||
retval = boinc_report_app_status_aux(
|
||||
current_cpu_time,
|
||||
last_checkpoint_cpu_time,
|
||||
fraction_done,
|
||||
vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
);
|
||||
total_cpu_time,
|
||||
last_checkpoint_cpu_time,
|
||||
fraction_done,
|
||||
vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
);
|
||||
}
|
||||
|
||||
// Give the BOINC API time to report the pid to BOINC.
|
||||
|
@ -404,7 +406,7 @@ string VBOX_BASE::get_error(int num){
|
|||
"VM Hypervisor failed to enter an online state in a timely fashion."
|
||||
};
|
||||
|
||||
std::vector<std::string> v(args, args + 13);
|
||||
std::vector<string> v(args, args + 13);
|
||||
return v[num];
|
||||
}
|
||||
|
||||
|
@ -645,7 +647,7 @@ int VBOX_BASE::get_scratch_directory(string& dir) {
|
|||
APP_INIT_DATA aid;
|
||||
boinc_get_init_data_p(&aid);
|
||||
|
||||
dir = aid.project_dir + std::string("/scratch");
|
||||
dir = aid.project_dir + string("/scratch");
|
||||
|
||||
if (!dir.empty()) {
|
||||
return 1;
|
||||
|
@ -816,7 +818,7 @@ int VBOX_BASE::get_startup_log(string& log, bool tail_only, unsigned int buffer_
|
|||
return retval;
|
||||
}
|
||||
|
||||
int VBOX_BASE::read_floppy(std::string& data) {
|
||||
int VBOX_BASE::read_floppy(string& data) {
|
||||
if (enable_floppyio && pFloppy) {
|
||||
data = pFloppy->receive();
|
||||
return 0;
|
||||
|
@ -824,7 +826,7 @@ int VBOX_BASE::read_floppy(std::string& data) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
int VBOX_BASE::write_floppy(std::string& data) {
|
||||
int VBOX_BASE::write_floppy(string& data) {
|
||||
if (enable_floppyio && pFloppy) {
|
||||
pFloppy->send(data);
|
||||
return 0;
|
||||
|
@ -832,7 +834,7 @@ int VBOX_BASE::write_floppy(std::string& data) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
void VBOX_BASE::sanitize_format(std::string& output) {
|
||||
void VBOX_BASE::sanitize_format(string& output) {
|
||||
// Check for special characters used by printf and render them harmless
|
||||
string::iterator iter = output.begin();
|
||||
while (iter != output.end()) {
|
||||
|
@ -850,7 +852,7 @@ void VBOX_BASE::sanitize_format(std::string& output) {
|
|||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
void VBOX_BASE::sanitize_output(std::string& output) {
|
||||
void VBOX_BASE::sanitize_output(string& output) {
|
||||
// Remove \r from the log spew
|
||||
string::iterator iter = output.begin();
|
||||
while (iter != output.end()) {
|
||||
|
@ -862,7 +864,7 @@ void VBOX_BASE::sanitize_output(std::string& output) {
|
|||
}
|
||||
}
|
||||
#else
|
||||
void VBOX_BASE::sanitize_output(std::string& ) {}
|
||||
void VBOX_BASE::sanitize_output(string& ) {}
|
||||
#endif
|
||||
|
||||
// Launch VboxSVC.exe before going any further. If we don't, it'll be launched by
|
||||
|
@ -969,7 +971,7 @@ int VBOX_BASE::launch_vboxvm() {
|
|||
char cmdline[1024];
|
||||
char* argv[5];
|
||||
int argc;
|
||||
std::string output;
|
||||
string output;
|
||||
int retval = ERR_EXEC;
|
||||
|
||||
// Construct the command line parameters
|
||||
|
@ -1429,7 +1431,7 @@ CLEANUP:
|
|||
return retval;
|
||||
}
|
||||
|
||||
void VBOX_BASE::vbm_replay(std::string& command) {
|
||||
void VBOX_BASE::vbm_replay(string& command) {
|
||||
FILE* f = fopen(REPLAYLOG_FILENAME, "a");
|
||||
if (f) {
|
||||
fprintf(f, "%s\n", command.c_str());
|
||||
|
@ -1437,7 +1439,7 @@ void VBOX_BASE::vbm_replay(std::string& command) {
|
|||
}
|
||||
}
|
||||
|
||||
void VBOX_BASE::vbm_trace(std::string& command, std::string& output, int retval) {
|
||||
void VBOX_BASE::vbm_trace(string& command, string& output, int retval) {
|
||||
char buf[256];
|
||||
int pid;
|
||||
struct tm tm;
|
||||
|
|
|
@ -270,14 +270,20 @@ class VBOX_BASE : public VBOX_JOB {
|
|||
|
||||
virtual std::string get_error(int choice);
|
||||
|
||||
virtual void report_clean(bool unrecoverable_error, bool skip_cleanup, bool do_dump_hypervisor_logs,
|
||||
int retval, std::string error_reason,
|
||||
int vm_pid, int temp_delay, std::string temp_reason,
|
||||
double current_cpu_time,
|
||||
double last_checkpoint_cpu_time,
|
||||
double fraction_done,
|
||||
double bytes_sent,
|
||||
double bytes_received);
|
||||
virtual void report_clean(
|
||||
bool unrecoverable_error,
|
||||
bool skip_cleanup,
|
||||
bool do_dump_hypervisor_logs,
|
||||
int retval,
|
||||
std::string error_reason,
|
||||
int temp_delay,
|
||||
std::string temp_reason,
|
||||
double current_cpu_time,
|
||||
double last_checkpoint_cpu_time,
|
||||
double fraction_done,
|
||||
double bytes_sent,
|
||||
double bytes_received
|
||||
);
|
||||
};
|
||||
|
||||
class VBOX_VM : public VBOX_BASE {
|
||||
|
|
|
@ -48,6 +48,10 @@
|
|||
// Daniel Lombraña González <teleyinex AT gmail DOT com>
|
||||
// Marius Millea <mariusmillea AT gmail DOT com>
|
||||
|
||||
#define RESTART_DELAY 300
|
||||
// if a VM operation (suspend, resume, snapshot) fails,
|
||||
// exit and restart after this delay (in seconds).
|
||||
|
||||
#ifdef _WIN32
|
||||
#include "boinc_win.h"
|
||||
#include "win_util.h"
|
||||
|
@ -392,7 +396,8 @@ int main(int argc, char** argv) {
|
|||
double elapsed_time = 0;
|
||||
double fraction_done = 0;
|
||||
double trickle_period = 0;
|
||||
double current_cpu_time = 0;
|
||||
double total_cpu_time = 0;
|
||||
// total CPU time of the job, including CPU time from previous episodes
|
||||
double starting_cpu_time = 0;
|
||||
double last_heartbeat_elapsed_time = 0;
|
||||
double last_checkpoint_cpu_time = 0;
|
||||
|
@ -768,7 +773,7 @@ int main(int argc, char** argv) {
|
|||
pVM->rd_host_port = checkpoint.remote_desktop_port;
|
||||
elapsed_time = checkpoint.elapsed_time;
|
||||
starting_cpu_time = checkpoint.cpu_time;
|
||||
current_cpu_time = starting_cpu_time;
|
||||
total_cpu_time = starting_cpu_time;
|
||||
last_checkpoint_elapsed_time = elapsed_time;
|
||||
last_heartbeat_elapsed_time = elapsed_time;
|
||||
last_checkpoint_cpu_time = starting_cpu_time;
|
||||
|
@ -798,7 +803,7 @@ int main(int argc, char** argv) {
|
|||
return EXIT_TIME_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
retval = pVM->run(current_cpu_time > 0);
|
||||
retval = pVM->run(total_cpu_time > 0);
|
||||
if (retval) {
|
||||
// All 'failure to start' errors are unrecoverable by default
|
||||
vboxlog_msg("ERROR: VM failed to start");
|
||||
|
@ -840,28 +845,29 @@ int main(int argc, char** argv) {
|
|||
if (unrecoverable_error) {
|
||||
if (pVM->online) pVM->capture_screenshot();
|
||||
|
||||
checkpoint.update(elapsed_time, current_cpu_time);
|
||||
checkpoint.update(elapsed_time, total_cpu_time);
|
||||
|
||||
}
|
||||
|
||||
pVM->report_clean(unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
|
||||
retval, error_reason, pVM->vm_pid, temp_delay, temp_reason,
|
||||
current_cpu_time, last_checkpoint_cpu_time, fraction_done,
|
||||
bytes_sent, bytes_received);
|
||||
pVM->report_clean(
|
||||
unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
|
||||
retval, error_reason, temp_delay, temp_reason,
|
||||
total_cpu_time, last_checkpoint_cpu_time, fraction_done,
|
||||
bytes_sent, bytes_received
|
||||
);
|
||||
}
|
||||
|
||||
// Report the VM pid to BOINC so BOINC can deal with it when needed.
|
||||
//
|
||||
vboxlog_msg("Reporting VM Process ID to BOINC.");
|
||||
retval = boinc_report_app_status_aux(
|
||||
|
||||
current_cpu_time,
|
||||
last_checkpoint_cpu_time,
|
||||
fraction_done,
|
||||
pVM->vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
);
|
||||
total_cpu_time,
|
||||
last_checkpoint_cpu_time,
|
||||
fraction_done,
|
||||
pVM->vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
);
|
||||
|
||||
// Wait for up to 5 minutes for the VM to switch states.
|
||||
// A system under load can take a while.
|
||||
|
@ -912,14 +918,16 @@ int main(int argc, char** argv) {
|
|||
temp_delay = 86400;
|
||||
}
|
||||
|
||||
if (unrecoverable_error) checkpoint.update(elapsed_time, current_cpu_time);
|
||||
|
||||
|
||||
pVM->report_clean(unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
|
||||
retval, error_reason, pVM->vm_pid, temp_delay, temp_reason,
|
||||
current_cpu_time, last_checkpoint_cpu_time, fraction_done,
|
||||
bytes_sent, bytes_received);
|
||||
if (unrecoverable_error) {
|
||||
checkpoint.update(elapsed_time, total_cpu_time);
|
||||
}
|
||||
|
||||
pVM->report_clean(
|
||||
unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
|
||||
retval, error_reason, temp_delay, temp_reason,
|
||||
total_cpu_time, last_checkpoint_cpu_time, fraction_done,
|
||||
bytes_sent, bytes_received
|
||||
);
|
||||
}
|
||||
|
||||
set_floppy_image(aid, *pVM);
|
||||
|
@ -927,7 +935,7 @@ int main(int argc, char** argv) {
|
|||
report_remote_desktop_info(*pVM);
|
||||
checkpoint.webapi_port = pVM->pf_host_port;
|
||||
checkpoint.remote_desktop_port = pVM->rd_host_port;
|
||||
checkpoint.update(elapsed_time, current_cpu_time);
|
||||
checkpoint.update(elapsed_time, total_cpu_time);
|
||||
|
||||
// Force throttling on our first pass through the loop
|
||||
boinc_status.reread_init_data_file = true;
|
||||
|
@ -992,7 +1000,9 @@ int main(int argc, char** argv) {
|
|||
|
||||
// Write updates for the graphics application's use
|
||||
if (pVM->enable_graphics_support) {
|
||||
boinc_write_graphics_status(current_cpu_time, elapsed_time, fraction_done);
|
||||
boinc_write_graphics_status(
|
||||
total_cpu_time, elapsed_time, fraction_done
|
||||
);
|
||||
}
|
||||
|
||||
if (boinc_status.no_heartbeat || boinc_status.quit_request) {
|
||||
|
@ -1000,7 +1010,7 @@ int main(int argc, char** argv) {
|
|||
if (pVM->enable_vm_savestate_usage) {
|
||||
retval = pVM->create_snapshot(elapsed_time);
|
||||
if (!retval) {
|
||||
checkpoint.update(elapsed_time, current_cpu_time);
|
||||
checkpoint.update(elapsed_time, total_cpu_time);
|
||||
boinc_checkpoint_completed();
|
||||
}
|
||||
pVM->stop();
|
||||
|
@ -1084,7 +1094,7 @@ int main(int argc, char** argv) {
|
|||
pVM->reset_vm_process_priority();
|
||||
retval = pVM->create_snapshot(elapsed_time);
|
||||
if (!retval) {
|
||||
checkpoint.update(elapsed_time, current_cpu_time);
|
||||
checkpoint.update(elapsed_time, total_cpu_time);
|
||||
boinc_checkpoint_completed();
|
||||
}
|
||||
pVM->poweroff();
|
||||
|
@ -1136,7 +1146,11 @@ int main(int argc, char** argv) {
|
|||
if ((unsigned)retval == VBOX_E_INVALID_OBJECT_STATE) {
|
||||
vboxlog_msg("ERROR: VM task failed to pause, rescheduling task for a later time.");
|
||||
pVM->poweroff();
|
||||
boinc_temporary_exit(86400, "VM job unmanageable, restarting later.");
|
||||
sprintf(buf,
|
||||
"VM suspend failed. Will exit and restart in %d sec.",
|
||||
RESTART_DELAY
|
||||
);
|
||||
boinc_temporary_exit(RESTART_DELAY, buf);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -1145,14 +1159,18 @@ int main(int argc, char** argv) {
|
|||
if ((unsigned)retval == VBOX_E_INVALID_OBJECT_STATE) {
|
||||
vboxlog_msg("ERROR: VM task failed to resume, rescheduling task for a later time.");
|
||||
pVM->poweroff();
|
||||
boinc_temporary_exit(86400, "VM job unmanageable, restarting later.");
|
||||
sprintf(buf,
|
||||
"VM resume failed. Will exit and restart in %d sec.",
|
||||
RESTART_DELAY
|
||||
);
|
||||
boinc_temporary_exit(RESTART_DELAY, buf);
|
||||
}
|
||||
}
|
||||
|
||||
// stuff to do every 10 secs (everything else is 1/sec)
|
||||
//
|
||||
if ((loop_iteration % 10) == 0) {
|
||||
current_cpu_time = starting_cpu_time + pVM->get_vm_cpu_time();
|
||||
total_cpu_time = starting_cpu_time + pVM->get_vm_cpu_time();
|
||||
check_trickle_triggers(*pVM);
|
||||
check_intermediate_uploads(*pVM);
|
||||
}
|
||||
|
@ -1172,10 +1190,10 @@ int main(int argc, char** argv) {
|
|||
fraction_done = 1.0;
|
||||
}
|
||||
boinc_report_app_status(
|
||||
current_cpu_time,
|
||||
last_checkpoint_cpu_time,
|
||||
fraction_done
|
||||
);
|
||||
total_cpu_time,
|
||||
last_checkpoint_cpu_time,
|
||||
fraction_done
|
||||
);
|
||||
|
||||
// write status report to stderr at regular intervals
|
||||
//
|
||||
|
@ -1187,7 +1205,7 @@ int main(int argc, char** argv) {
|
|||
if (elapsed_time) {
|
||||
vboxlog_msg("Status Report: Elapsed Time: '%f'", elapsed_time);
|
||||
}
|
||||
vboxlog_msg("Status Report: CPU Time: '%f'", current_cpu_time);
|
||||
vboxlog_msg("Status Report: CPU Time: '%f'", total_cpu_time);
|
||||
if (aid.global_prefs.daily_xfer_limit_mb) {
|
||||
vboxlog_msg("Status Report: Network Bytes Sent (Total): '%f'", bytes_sent);
|
||||
vboxlog_msg("Status Report: Network Bytes Received (Total): '%f'", bytes_received);
|
||||
|
@ -1220,13 +1238,17 @@ int main(int argc, char** argv) {
|
|||
//
|
||||
vboxlog_msg("ERROR: Checkpoint maintenance failed, rescheduling task for a later time. (%d)", retval);
|
||||
pVM->poweroff();
|
||||
boinc_temporary_exit(86400, "VM job unmanageable, restarting later.");
|
||||
sprintf(buf,
|
||||
"VM snapshot failed. Will exit and restart in %d sec.",
|
||||
RESTART_DELAY
|
||||
);
|
||||
boinc_temporary_exit(RESTART_DELAY, buf);
|
||||
} else {
|
||||
// tell BOINC we've successfully created a checkpoint.
|
||||
//
|
||||
checkpoint.update(elapsed_time, current_cpu_time);
|
||||
checkpoint.update(elapsed_time, total_cpu_time);
|
||||
last_checkpoint_elapsed_time = elapsed_time;
|
||||
last_checkpoint_cpu_time = current_cpu_time;
|
||||
last_checkpoint_cpu_time = total_cpu_time;
|
||||
boinc_checkpoint_completed();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue