- file upload handler: add an #ifdef for disabling locking of files
    while writing to them.
    It's not clear to me that this locking is beneficial,
    and it may be causing filesystem problems at WCG.
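- volunteer storage: add a "retrieved" flag to vda_file, move the min_failures
    calculation into compute_min_failures(), and add reconstruct() and
    upload_all() for retrieving files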


svn path=/trunk/boinc/; revision=26021
This commit is contained in:
David Anderson 2012-08-15 21:27:38 +00:00
parent 5cc5c8b5b8
commit 0d42a4aa5c
11 changed files with 197 additions and 37 deletions

View File

@ -5549,3 +5549,23 @@ Charlie 14 Aug 2012
PostInstall.cpp
release_boinc.sh
Uninstall.cpp
David 15 Aug 2012
- file upload handler: add an #ifdef for disabling locking of files
while writing to them.
It's not clear to me that this locking is beneficial,
and it may be causing filesystem problems at WCG
- volunteer storage stuff
vda/
vda_lib.cpp,h
vda.cpp
ssim.cpp
vdad.cpp
vda_lib2.cpp
db/
boinc_db_types.h
boinc_db.cpp
schema_vda.sql
sched/
file_upload_handler.cpp

View File

@ -2350,7 +2350,8 @@ void DB_VDA_FILE::db_print(char* buf){
"chunk_size=%f, "
"need_update=%d, "
"initialized=%d, "
"retrieving=%d",
// adjacent literals are concatenated, so every field except the last
// must end with ", " to keep the fields separated
"retrieving=%d, "
"retrieved=%d",
create_time,
dir,
file_name,
@ -2358,7 +2359,8 @@ void DB_VDA_FILE::db_print(char* buf){
chunk_size,
need_update?1:0,
initialized?1:0,
retrieving?1:0
retrieving?1:0,
retrieved?1:0
);
}
@ -2374,6 +2376,7 @@ void DB_VDA_FILE::db_parse(MYSQL_ROW &r) {
need_update = (atoi(r[i++]) != 0);
initialized = (atoi(r[i++]) != 0);
retrieving = (atoi(r[i++]) != 0);
retrieved = (atoi(r[i++]) != 0);
}
void DB_VDA_CHUNK_HOST::db_print(char* buf) {

View File

@ -685,6 +685,7 @@ struct VDA_FILE {
bool need_update;
bool initialized;
bool retrieving;
bool retrieved;
void clear();
};

View File

@ -8,6 +8,7 @@ create table vda_file (
need_update tinyint not null default 0,
initialized tinyint not null default 0,
retrieving tinyint not null default 0,
retrieved tinyint not null default 0,
primary key(id)
) engine = InnoDB;

View File

@ -50,6 +50,10 @@
#include "sched_msgs.h"
#define LOCK_FILES
// comment out the #define above to disable file locking;
// this may avoid filesystem hangs
#define ERR_TRANSIENT true
#define ERR_PERMANENT false
@ -127,6 +131,7 @@ int copy_socket_to_file(FILE* in, char* path, double offset, double nbytes) {
);
}
#ifdef LOCK_FILES
// Put an advisory lock on the file.
// This will prevent OTHER instances of file_upload_handler
// from being able to write to the file.
@ -142,6 +147,7 @@ int copy_socket_to_file(FILE* in, char* path, double offset, double nbytes) {
close(fd);
return return_error(ERR_TRANSIENT, "can't lock file %s\n", path);
}
#endif
// check that file length corresponds to offset
// TODO: use a 64-bit variant

View File

@ -238,6 +238,7 @@ struct SIM_FILE : VDA_FILE_AUX, EVENT {
meta_chunk->reconstruct_and_cleanup();
printf("recovery_action():\n");
meta_chunk->recovery_action(sim.now);
meta_chunk->compute_min_failures();
fault_tolerance.sample(
meta_chunk->min_failures-1, collecting_stats(), sim.now
);
@ -467,6 +468,14 @@ void CHUNK::download_complete() {
sfp->recover();
}
// No-op: starts nothing and always reports success (0).
// NOTE(review): presumably a placeholder for the simulator build - confirm.
int CHUNK::upload_all() { return 0; }
// No-op: starts nothing and always reports success (0).
// NOTE(review): presumably a placeholder for the simulator build - confirm.
int META_CHUNK::upload_all() { return 0; }
int META_CHUNK::encode(bool) {
printf("%s: encoding metachunk %s\n", now_str(), name);
return 0;

View File

@ -179,7 +179,7 @@ int handle_retrieve(const char* name) {
sprintf(buf, "where file_name='%s'", name);
int retval = vf.lookup(buf);
if (retval) return retval;
retval = vf.update_field("retrieving=1");
retval = vf.update_field("retrieving=1, need_update=1");
return retval;
}
@ -203,7 +203,10 @@ int handle_status(const char* name) {
return retval;
}
printf("status for file %s:", vf.file_name);
vf.meta_chunk->recovery_plan();
vf.meta_chunk->compute_min_failures();
vf.meta_chunk->print_status(0);
printf("fault tolerance level: %d\n", vf.meta_chunk->min_failures-1);
return 0;
}

View File

@ -128,7 +128,7 @@ META_CHUNK::META_CHUNK(
// and whether to delete chunk data currently on server.
// Also compute min_failures
//
int META_CHUNK::recovery_plan() {
void META_CHUNK::recovery_plan() {
vector<DATA_UNIT*> recoverable;
vector<DATA_UNIT*> present;
@ -146,8 +146,7 @@ int META_CHUNK::recovery_plan() {
c->in_recovery_set = false;
c->data_needed = false;
c->data_now_present = false;
int retval = c->recovery_plan();
if (retval) return retval;
c->recovery_plan();
switch (c->status) {
case PRESENT:
present.push_back(c);
@ -191,7 +190,6 @@ int META_CHUNK::recovery_plan() {
} else {
status = UNRECOVERABLE;
}
return 0;
}
int META_CHUNK::recovery_action(double now) {
@ -230,10 +228,19 @@ int META_CHUNK::recovery_action(double now) {
retval = c->recovery_action(now);
if (retval) return retval;
}
return 0;
}
// Compute min_failures: the smallest # of host failures
// that would make this unit unrecoverable.
//
int META_CHUNK::compute_min_failures() {
unsigned int i;
for (i=0; i<children.size(); i++) {
DATA_UNIT* c = children[i];
c->compute_min_failures();
}
// Compute min_failures: the smallest # of host failures
// that would make this unit unrecoverable.
//
// Because of recovery action,
// some of our children may have changed status and fault tolerance,
// so ours may have changed too.
@ -440,12 +447,11 @@ int META_CHUNK::expand() {
///////////////// CHUNK ///////////////////////
int CHUNK::recovery_plan() {
void CHUNK::recovery_plan() {
keep_present = false;
if (present_on_server) {
status = PRESENT;
cost = 0;
min_failures = INT_MAX;
} else if (hosts.size() > 0) {
// if file is not present on server, assume that it's present
// on all hosts (otherwise we wouldn't have downloaded it).
@ -455,14 +461,28 @@ int CHUNK::recovery_plan() {
if ((int)(hosts.size()) < parent->dfile->policy.replication) {
data_needed = true;
}
min_failures = hosts.size();
} else {
status = UNRECOVERABLE;
min_failures = 0;
}
#ifdef DEBUG_RECOVERY
printf(" chunk %s: status %s\n", name, status_str(status));
#endif
}
int CHUNK::compute_min_failures() {
if (present_on_server) {
min_failures = INT_MAX;
return 0;
}
int nreplicas = 0;
set<VDA_CHUNK_HOST*>::iterator i;
for (i=hosts.begin(); i!=hosts.end(); i++) {
VDA_CHUNK_HOST* ch = *i;
if (ch->present_on_host) {
nreplicas++;
}
}
min_failures = nreplicas;
return 0;
}

View File

@ -85,8 +85,10 @@ struct VDA_FILE_AUX : VDA_FILE {
// base class for chunks and meta-chunks
//
struct DATA_UNIT {
virtual int recovery_plan(){return 0;};
virtual void recovery_plan(){};
virtual int recovery_action(double){return 0;};
virtual int compute_min_failures(){return 0;};
virtual int upload_all(){return 0;};
char name[64];
char dir[1024];
@ -139,8 +141,10 @@ struct META_CHUNK : DATA_UNIT {
int init(const char* dir, POLICY&, int level);
int get_state(const char* dir, POLICY&, int level);
virtual int recovery_plan();
virtual void recovery_plan();
virtual int recovery_action(double);
virtual int compute_min_failures();
virtual int upload_all();
int decide_reconstruct();
int reconstruct_and_cleanup();
@ -154,6 +158,7 @@ struct META_CHUNK : DATA_UNIT {
}
int decode();
int encode(bool first);
int reconstruct();
// used by vda
void print_status(int indent_level);
@ -173,14 +178,19 @@ struct CHUNK : DATA_UNIT {
void upload_complete();
void download_complete();
int assign();
virtual int recovery_plan();
virtual void recovery_plan();
virtual int recovery_action(double);
virtual int compute_min_failures();
virtual int upload_all();
bool need_more_replicas() {
return ((int)hosts.size() < parent->dfile->policy.replication);
}
// used by vda
void print_status(int indent_level);
// used by vdad
int start_upload_from_host(VDA_CHUNK_HOST&);
};
// names

View File

@ -272,6 +272,45 @@ int META_CHUNK::decode() {
return 0;
}
// The meta-chunk is present, and we're retrieving the file:
// rebuild this meta-chunk's data.
//
// Above the bottom level, first reconstruct children whose status is PRESENT,
// stopping once coding.n of them have been done; then decode this unit
// and delete the children's files.
//
// Returns 0 on success.
// If reconstructing a child or decoding fails, returns that error code
// and leaves the children's files in place,
// so the data needed for a later attempt is not destroyed.
//
int META_CHUNK::reconstruct() {
    unsigned int i;
    int retval;

    // reconstruct enough children that we can reconstruct ourself
    //
    if (!bottom_level) {
        int n = 0;
        for (i=0; i<children.size(); i++) {
            // the cast assumes children of a non-bottom-level unit
            // are META_CHUNKs
            META_CHUNK* cp = (META_CHUNK*)children[i];
            if (cp->status != PRESENT) continue;
            retval = cp->reconstruct();
            if (retval) return retval;
            n++;
            if (n == coding.n) break;
        }
    }
    retval = decode();
    if (retval) return retval;

    // decode succeeded, so delete childrens' files
    //
    for (i=0; i<children.size(); i++) {
        children[i]->delete_file();
    }
    return 0;
}
// We're retrieving the file.
// Start all possible uploads by calling upload_all() on every child.
//
// Every child is attempted even if an earlier one fails.
// Returns 0 if all children returned 0,
// otherwise the first nonzero value returned by a child.
//
int META_CHUNK::upload_all() {
    int first_error = 0;
    for (unsigned int i=0; i<children.size(); i++) {
        int retval = children[i]->upload_all();
        // remember the first failure, but keep going with the other children
        if (retval && !first_error) {
            first_error = retval;
        }
    }
    return first_error;
}
///////////////// CHUNK ///////////////////////
CHUNK::CHUNK(META_CHUNK* mc, double s, int index) {
@ -324,18 +363,7 @@ int CHUNK::assign() {
return 0;
}
int CHUNK::start_upload() {
// if no upload of this chunk is in progress, start one.
// NOTE: all instances are inherently present_on_host,
// since this is only called if chunk is not present on server
//
VDA_CHUNK_HOST* chp;
set<VDA_CHUNK_HOST*>::iterator i;
for (i=hosts.begin(); i!=hosts.end(); i++) {
chp = *i;
if (chp->transfer_in_progress) return 0;
}
chp = *(hosts.begin());
int CHUNK::start_upload_from_host(VDA_CHUNK_HOST& ch) {
DB_VDA_CHUNK_HOST dch;
char set_clause[256], where_clause[256];
sprintf(set_clause,
@ -344,12 +372,46 @@ int CHUNK::start_upload() {
);
sprintf(where_clause,
"where vda_file_id=%d and host_id=%d and name='%s'",
chp->vda_file_id,
chp->host_id,
ch.vda_file_id,
ch.host_id,
name
);
int retval = dch.update_fields_noid(set_clause, where_clause);
return retval;
return dch.update_fields_noid(set_clause, where_clause);
}
// If no upload of this chunk is in progress, start one.
// NOTES:
// - all instances are inherently present_on_host,
//   since this is only called if chunk is not present on server
// - we arbitrarily pick the first host in the list.
//   Could randomize this or use other criteria.
//
// Returns 0 if a transfer is already in progress
// or if there is no host to upload from;
// otherwise returns the result of start_upload_from_host().
//
int CHUNK::start_upload() {
    set<VDA_CHUNK_HOST*>::iterator i;
    for (i=hosts.begin(); i!=hosts.end(); i++) {
        if ((*i)->transfer_in_progress) return 0;
    }

    // dereferencing begin() of an empty set is undefined,
    // so treat "no hosts" as nothing to do
    //
    if (hosts.empty()) return 0;

    VDA_CHUNK_HOST* chp = *(hosts.begin());
    return start_upload_from_host(*chp);
}
// Start uploads of all instances.
// Used when retrieving the file.
//
int CHUNK::upload_all() {
if (present_on_server) return 0;
VDA_CHUNK_HOST* chp;
set<VDA_CHUNK_HOST*>::iterator i;
for (i=hosts.begin(); i!=hosts.end(); i++) {
chp = *i;
if (chp->transfer_in_progress) continue;
int retval = start_upload_from_host(*chp);
if (retval) return retval;
}
return 0;
}
///////////////// VDA_FILE_AUX ///////////////////////

View File

@ -78,11 +78,35 @@ int handle_file(VDA_FILE_AUX& vf, DB_VDA_FILE& dvf) {
dvf.update_field(buf);
}
log_messages.printf(MSG_NORMAL, "Recovery plan:\n");
retval = vf.meta_chunk->recovery_plan();
if (retval) {
log_messages.printf(MSG_CRITICAL, "vf.recovery_plan failed %d\n", retval);
return retval;
vf.meta_chunk->recovery_plan();
// see if we're retrieving this file
//
if (vf.retrieving) {
if (vf.retrieved) return 0;
switch (vf.meta_chunk->status) {
case PRESENT:
// we have enough chunks to reconstruct it - do so
//
vf.meta_chunk->reconstruct();
break;
case RECOVERABLE:
// otherwise start all possible uploads
//
vf.meta_chunk->upload_all();
break;
case UNRECOVERABLE:
// if it looks like we can't recover the file, print a msg
//
log_messages.printf(MSG_CRITICAL,
"Can't retrieve %s: unrecoverable\n",
vf.file_name
);
break;
}
return 0;
}
retval = vf.meta_chunk->decide_reconstruct();
if (retval) {
log_messages.printf(MSG_CRITICAL, "vf.decide_reconstruct failed %d\n", retval);
@ -99,6 +123,7 @@ int handle_file(VDA_FILE_AUX& vf, DB_VDA_FILE& dvf) {
log_messages.printf(MSG_CRITICAL, "vf.recovery_action failed %d\n", retval);
return retval;
}
vf.meta_chunk->compute_min_failures();
return 0;
}