- client: tweak parameters of file xfer backoff to

reduce backoff intervals somewhat
- vboxwrapper: fix buffer size typo (from Attila)
- scheduler: fix crash if using homogeneous app version,
    and a WU is committed to an old or deprecated app version.
    From Kevin Reed.


svn path=/trunk/boinc/; revision=24775
This commit is contained in:
David Anderson 2011-12-12 22:07:37 +00:00
parent fa3f5a945e
commit 4e95e690f8
5 changed files with 180 additions and 45 deletions

View File

@ -8960,3 +8960,20 @@ David 10 Dec 2011
get_output.php
ssim/
ssim.cpp
David 12 Dec 2011
- client: tweak parameters of file xfer backoff to
reduce backoff intervals somewhat
- vboxwrapper: fix buffer size typo (from Attila)
- scheduler: fix crash if using homogeneous app version,
and a WU is committed to an old or deprecated app version.
From Kevin Reed.
sched/
sched_version.cpp
samples/vboxwrapper/
vboxwrapper.cpp
ssim/
ssim.cpp
client/
pers_file_xfer.h

View File

@ -24,8 +24,8 @@
// min and max for exponential backoff
// these are multiplied by a random factor .5 .. 1
//
#define PERS_RETRY_DELAY_MIN 600
#define PERS_RETRY_DELAY_MAX (3600*12)
#define PERS_RETRY_DELAY_MIN 120
#define PERS_RETRY_DELAY_MAX (3600*6)
#define PERS_GIVEUP (SECONDS_PER_DAY*90)
// give up on xfer if this time elapses since last byte xferred

View File

@ -82,9 +82,9 @@ using std::vector;
int parse_job_file(VBOX_VM& vm) {
MIOFILE mf;
char buf[256], buf2[256];
char buf[1024], buf2[256];
boinc_resolve_filename(JOB_FILENAME, buf, 1024);
boinc_resolve_filename(JOB_FILENAME, buf, sizeof(buf));
FILE* f = boinc_fopen(buf, "r");
if (!f) {
fprintf(stderr,

View File

@ -426,6 +426,7 @@ static BEST_APP_VERSION* check_homogeneous_app_version(
bool found=false;
APP_VERSION *avp = ssp->lookup_app_version(wu.app_version_id);
if (!avp) return NULL; // in case it was deprecated
for (unsigned int i=0; i<g_request->platforms.list.size(); i++) {
PLATFORM* p = g_request->platforms.list[i];
if (p->id == avp->platformid) {

View File

@ -192,6 +192,7 @@ void die(const char* msg) {
// base class for chunks and meta-chunks
//
struct DATA_UNIT {
#if 0
virtual bool recoverable(){die("recoverable undef"); return false;};
// can be reconstructed w/o reconstructing parent,
// assuming that current downloads succeed
@ -201,6 +202,14 @@ struct DATA_UNIT {
virtual void delete_chunks_from_server(){die("dcfs undef");};
virtual void now_present(){die("now_present undef");};
bool is_uploading;
#endif
virtual void recovery_plan(){};
virtual void recovery_action(){};
enum {PRESENT, RECOVERABLE, UNRECOVERABLE} status;
bool in_recovery_set;
bool data_now_present;
bool data_needed;
double cost;
char name[64];
};
@ -208,20 +217,21 @@ struct CHUNK : DATA_UNIT {
set<CHUNK_ON_HOST*> hosts;
META_CHUNK* parent;
double size;
bool _is_present_on_server;
bool present_on_server;
CHUNK(META_CHUNK* mc, double s, int index);
virtual void assign();
void host_failed(CHUNK_ON_HOST* p);
void upload_complete();
#if 0
virtual bool recoverable() {
return (!hosts.empty());
}
virtual bool is_present_on_server() {
return _is_present_on_server;
}
virtual void start_upload() {
virtual void now_present();
virtual void delete_chunks_from_server();
#endif
void start_upload() {
// if no upload of this chunk is in progress, start one.
// NOTE: all instances are inherently present_on_host,
// since this is only called if chunk is not present on server
@ -238,15 +248,19 @@ struct CHUNK : DATA_UNIT {
printf("%s: starting upload of %s\n", now_str(), c->name);
sim.insert(c);
}
void host_failed(CHUNK_ON_HOST* p);
void upload_complete();
void download_complete();
virtual void now_present();
virtual void delete_chunks_from_server();
void assign();
virtual void recovery_plan();
virtual void recovery_action();
};
struct META_CHUNK : DATA_UNIT {
vector<DATA_UNIT*> children;
META_CHUNK* parent;
int n_children_present;
bool have_unrecoverable_children;
DFILE* dfile;
bool uploading;
@ -254,6 +268,7 @@ struct META_CHUNK : DATA_UNIT {
DFILE* d, META_CHUNK* par, double size, int encoding_level, int index
);
#if 0
virtual bool is_present_on_server() {
int n=0;
for (int i=0; i<ENCODING_M; i++) {
@ -303,6 +318,9 @@ struct META_CHUNK : DATA_UNIT {
// a child is now present on the server.
//
void child_upload_complete();
#endif
virtual void recovery_plan();
virtual void recovery_action();
};
static int next_file_id=0;
@ -389,9 +407,14 @@ struct DFILE : EVENT {
printf("file %d: size %f encoded size %f\n",
id, size, disk_usage.value
);
meta_chunk->assign();
meta_chunk->recovery_plan();
meta_chunk->recovery_action();
}
void recover() {
meta_chunk->recovery_plan();
meta_chunk->recovery_action();
}
inline bool collecting_stats() {
return (pending_init_downloads == 0);
}
@ -446,24 +469,26 @@ void HOST::handle() {
CHUNK::CHUNK(META_CHUNK* mc, double s, int index) {
parent = mc;
_is_present_on_server = true;
present_on_server = true;
size = s;
sprintf(name, "%s.%d", parent->name, index);
parent->dfile->pending_init_downloads += REPLICATION_LEVEL;
parent->dfile->disk_usage.sample_inc(size, false);
}
#if 0
void CHUNK::now_present() {
if (_is_present_on_server) return;
_is_present_on_server = true;
parent->dfile->disk_usage.sample_inc(size, false);
}
#endif
// if there aren't enough replicas of this chunk,
// pick new hosts and start downloads
//
void CHUNK::assign() {
if (!_is_present_on_server) return;
if (!present_on_server) return;
while (hosts.size() < REPLICATION_LEVEL) {
if (parent->dfile->unused_hosts.size() == 0) {
die("no more hosts!\n");
@ -484,6 +509,7 @@ void CHUNK::assign() {
}
}
#if 0
void CHUNK::delete_chunks_from_server() {
set<CHUNK_ON_HOST*>::iterator i;
for (i=hosts.begin(); i!=hosts.end(); i++) {
@ -495,6 +521,7 @@ void CHUNK::delete_chunks_from_server() {
parent->dfile->disk_usage.sample_inc(-size, parent->dfile->collecting_stats());
_is_present_on_server = false;
}
#endif
META_CHUNK::META_CHUNK(
DFILE* d, META_CHUNK* par, double size, int encoding_level, int index
@ -529,17 +556,10 @@ void CHUNK::host_failed(CHUNK_ON_HOST* p) {
set<CHUNK_ON_HOST*>::iterator i = hosts.find(p);
hosts.erase(i);
printf("%s: handling loss of %s\n", now_str(), p->name);
if (_is_present_on_server) {
// if data is on server, make a new replica
//
assign();
} else if (!hosts.empty()) {
start_upload();
} else {
parent->child_unrecoverable();
}
parent->dfile->recover();
}
#if 0
void META_CHUNK::child_unrecoverable() {
int n = n_recoverable_children();
printf("%s: a child of %s has become unrecoverable\n", now_str(), name);
@ -555,23 +575,22 @@ void META_CHUNK::child_unrecoverable() {
printf("%s: only %d recoverable children\n", now_str(), n);
}
}
#endif
//////////////////// UPLOAD COMPLETION //////////////////
void CHUNK::upload_complete() {
if (!_is_present_on_server) {
_is_present_on_server = true;
if (!present_on_server) {
present_on_server = true;
parent->dfile->disk_usage.sample_inc(
size,
parent->dfile->collecting_stats()
);
}
assign();
if (parent->uploading) {
parent->child_upload_complete();
}
parent->dfile->recover();
}
#if 0
void META_CHUNK::child_upload_complete() {
printf("%s: child upload complete for %s\n", now_str(), name);
int n = 0;
@ -594,6 +613,7 @@ void META_CHUNK::child_upload_complete() {
}
}
}
#endif
//////////////////// DOWNLOAD COMPLETION ////////////////
@ -604,25 +624,122 @@ void META_CHUNK::child_upload_complete() {
// - this chunk isn't needed for recovery going on elsewhere
//
void CHUNK::download_complete() {
int n=0;
set<CHUNK_ON_HOST*>::iterator i;
for (i=hosts.begin(); i!=hosts.end(); i++) {
CHUNK_ON_HOST* c = *i;
if (c->present_on_host) {
n++;
}
}
if (n >= REPLICATION_LEVEL) {
printf("%s: %s replicated, removing from server\n", now_str(), name);
_is_present_on_server = false;
parent->dfile->disk_usage.sample_inc(
-size,
parent->dfile->collecting_stats()
);
}
if (parent->dfile->pending_init_downloads) {
parent->dfile->pending_init_downloads--;
}
parent->dfile->recover();
}
void META_CHUNK::recovery_plan() {
vector<DATA_UNIT*> recoverable;
vector<DATA_UNIT*> present;
unsigned int i;
have_unrecoverable_children = false;
for (i=0; i<children.size(); i++) {
DATA_UNIT* c = children[i];
c->in_recovery_set = false;
c->recovery_plan();
switch (c->status) {
case PRESENT:
present.push_back(c);
break;
case RECOVERABLE:
recoverable.push_back(c);
break;
case UNRECOVERABLE:
have_unrecoverable_children = true;
break;
}
}
if (present.size() >= ENCODING_N) {
status = PRESENT;
sort(present.begin(), present.end());
present.resize(ENCODING_N);
cost = 0;
for (i=0; i<present.size(); i++) {
DATA_UNIT* c= present[i];
cost += c->cost;
c->in_recovery_set = true;
}
} else if (present.size() + recoverable.size() >= ENCODING_N) {
status = RECOVERABLE;
int j = ENCODING_N - present.size();
sort(recoverable.begin(), recoverable.end());
recoverable.resize(j);
cost = 0;
for (i=0; i<present.size(); i++) {
DATA_UNIT* c= present[i];
c->in_recovery_set = true;
}
for (i=0; i<recoverable.size(); i++) {
DATA_UNIT* c= recoverable[i];
c->in_recovery_set = true;
cost += c->cost;
}
} else {
status = UNRECOVERABLE;
}
}
void CHUNK::recovery_plan() {
if (present_on_server) {
status = PRESENT;
cost = 0;
} else if (hosts.size() > 0) {
status = RECOVERABLE;
cost = size;
if (hosts.size() < REPLICATION_LEVEL) {
data_needed = true;
}
} else {
status = UNRECOVERABLE;
}
}
void META_CHUNK::recovery_action() {
if (data_now_present) {
status = PRESENT;
}
for (unsigned i=0; i<children.size(); i++) {
DATA_UNIT* c = children[i];
if (status == PRESENT) {
if (c->in_recovery_set && have_unrecoverable_children) {
c->data_needed = true;
}
}
c->recovery_action();
}
}
void CHUNK::recovery_action() {
if (data_now_present) {
present_on_server = true;
parent->dfile->disk_usage.sample_inc(
size,
parent->dfile->collecting_stats()
);
status = PRESENT;
}
if (status == PRESENT && hosts.size() < REPLICATION_LEVEL) {
assign();
}
if (data_needed) {
if (!present_on_server) {
start_upload();
}
} else {
if (present_on_server) {
present_on_server = false;
printf("%s: %s replicated, removing from server\n", now_str(), name);
parent->dfile->disk_usage.sample_inc(
-size,
parent->dfile->collecting_stats()
);
}
}
}
////////////////////