- client: better behavior if a GPU goes away:

    1) if an APP_VERSION is missing a coprocessor,
        don't delete it and its files.
        (If the coprocessor returns, we won't need to re-download them.)
    2) if a RESULT uses an app version that is missing a coprocessor,
        abort it (rather than deleting it).
        The client will report the result on the next scheduler RPC,
        and the server will make a new instance.

svn path=/trunk/boinc/; revision=19235
This commit is contained in:
David Anderson 2009-10-02 23:39:38 +00:00
parent fca2cb8016
commit 833f417ae5
5 changed files with 59 additions and 29 deletions

View File

@ -8350,3 +8350,20 @@ David 2 Oct 2009
lib/
Makefile.am
coproc.cpp,h
David 2 Oct 2009
- client: better behavior if a GPU goes away:
1) if an APP_VERSION is missing a coprocessor,
don't delete it and its files.
(If the coprocessor returns, we won't need to re-download)
2) if a RESULT uses an app version that is missing a coprocessor,
abort it (rather than deleting it).
The client will report the result on the next scheduler RPC,
and the server will make a new instance.
client/
client_types.cpp
cs_scheduler.cpp
cs_statefile.cpp
lib/
error_numbers.h

View File

@ -1249,15 +1249,9 @@ void APP_VERSION::get_file_errors(string& str) {
bool APP_VERSION::missing_coproc() {
if (ncudas && !coproc_cuda) {
msg_printf(project, MSG_INTERNAL_ERROR,
"Application uses missing NVIDIA GPU"
);
return true;
}
if (natis && !coproc_ati) {
msg_printf(project, MSG_INTERNAL_ERROR,
"Application uses missing ATI GPU"
);
return true;
}
return false;

View File

@ -731,11 +731,17 @@ int CLIENT_STATE::handle_scheduler_reply(PROJECT* project, char* scheduler_url)
}
if (avpp.missing_coproc()) {
msg_printf(project, MSG_INTERNAL_ERROR,
"App version uses non-existent coprocessor; ignoring"
"App version uses non-existent %s GPU",
avpp.ncudas?"NVIDIA":"ATI"
);
}
APP* app = lookup_app(project, avpp.app_name);
if (!app) {
msg_printf(project, MSG_INTERNAL_ERROR,
"Missing app %s", avpp.app_name
);
continue;
}
APP* app = lookup_app(project, avpp.app_name);
APP_VERSION* avp = lookup_app_version(
app, avpp.platform, avpp.version_num, avpp.plan_class
);
@ -817,18 +823,26 @@ int CLIENT_STATE::handle_scheduler_reply(PROJECT* project, char* scheduler_url)
delete rp;
continue;
}
if (rp->avp->missing_coproc()) {
msg_printf(project, MSG_INTERNAL_ERROR,
"Missing coprocessor for task %s; aborting", rp->name
);
rp->abort_inactive(ERR_MISSING_COPROC);
continue;
} else {
rp->set_state(RESULT_NEW, "handle_scheduler_reply");
if (rp->avp->ncudas) {
est_cuda_duration += rp->estimated_duration(false);
} else if (rp->avp->natis) {
est_ati_duration += rp->estimated_duration(false);
} else {
est_cpu_duration += rp->estimated_duration(false);
}
}
rp->wup->version_num = rp->version_num;
rp->received_time = now;
results.push_back(rp);
new_results.push_back(rp);
rp->set_state(RESULT_NEW, "handle_scheduler_reply");
if (rp->avp->ncudas) {
est_cuda_duration += rp->estimated_duration(false);
} else if (rp->avp->natis) {
est_ati_duration += rp->estimated_duration(false);
} else {
est_cpu_duration += rp->estimated_duration(false);
}
results.push_back(rp);
}
if (log_flags.sched_op_debug) {
if (sr.results.size()) {

View File

@ -269,8 +269,10 @@ int CLIENT_STATE::parse_state_file() {
}
}
if (avp->missing_coproc()) {
delete avp;
continue;
msg_printf(project, MSG_INTERNAL_ERROR,
"Application uses missing %s GPU",
avp->ncudas?"NVIDIA":"ATI"
);
}
retval = link_app_version(project, avp);
if (retval) {
@ -350,6 +352,12 @@ int CLIENT_STATE::parse_state_file() {
delete rp;
continue;
}
if (rp->avp->missing_coproc()) {
msg_printf(project, MSG_INTERNAL_ERROR,
"Missing coprocessor for task %s; aborting", rp->name
);
rp->abort_inactive(ERR_MISSING_COPROC);
}
rp->wup->version_num = rp->version_num;
results.push_back(rp);
continue;
@ -800,10 +808,6 @@ int CLIENT_STATE::parse_app_info(PROJECT* p, FILE* in) {
if (strlen(avp->platform) == 0) {
strcpy(avp->platform, get_primary_platform());
}
if (avp->missing_coproc()) {
delete avp;
continue;
}
if (link_app_version(p, avp)) {
delete avp;
continue;

View File

@ -112,12 +112,12 @@
#define ERR_ASYNCSELECT -157
#define ERR_BAD_RESULT_STATE -158
#define ERR_DB_CANT_INIT -159
#define ERR_NOT_UNIQUE -160
// state files had redundant entries
#define ERR_NOT_FOUND -161
// inconsistent client state
#define ERR_NO_EXIT_STATUS -162
// exit_status not found in scheduler request
#define ERR_NOT_UNIQUE -160
// state files had redundant entries
#define ERR_NOT_FOUND -161
// inconsistent client state
#define ERR_NO_EXIT_STATUS -162
// exit_status not found in scheduler request
#define ERR_FILE_MISSING -163
#define ERR_SEMGET -165
#define ERR_SEMCTL -166
@ -188,6 +188,7 @@
#define ERR_CRYPTO -231
#define ERR_ABORTED_ON_EXIT -232
#define ERR_UNSTARTED_LATE -233
#define ERR_MISSING_COPROC -234
// PLEASE: add a text description of your error to
// the text description function boincerror() in str_util.cpp.