storage simulator: compute fault tolerance correctly

-This line, and those below, will be ignored--

M    locale/templates/BOINC-Project-Generic.pot
M    ssim/ssim.cpp
M    checkin_notes

svn path=/trunk/boinc/; revision=25071
This commit is contained in:
David Anderson 2012-01-16 19:43:03 +00:00
parent e2a4f3b535
commit 72ddd4fa3e
3 changed files with 975 additions and 448 deletions

View File

@ -587,3 +587,9 @@ David 15 Jan 2012
str_util.cpp str_util.cpp
ssim/ ssim/
ssim.cpp ssim.cpp
David 15 Jan 2012
- storage simulator: compute fault tolerance correctly
ssim/
ssim.cpp

File diff suppressed because it is too large Load Diff

View File

@ -209,7 +209,7 @@ struct DATA_UNIT {
bool data_now_present; bool data_now_present;
bool data_needed; bool data_needed;
double cost; double cost;
int fault_tolerance; int min_failures;
// min # of host failures that would make this unrecoverable // min # of host failures that would make this unrecoverable
char name[64]; char name[64];
}; };
@ -257,6 +257,8 @@ struct STATS_ITEM {
double integral; double integral;
double max_val; double max_val;
double max_val_time; double max_val_time;
double min_val;
double min_val_time;
double prev_t; double prev_t;
double start_time; double start_time;
bool first; bool first;
@ -267,6 +269,8 @@ struct STATS_ITEM {
integral = 0; integral = 0;
max_val = 0; max_val = 0;
max_val_time = 0; max_val_time = 0;
min_val = INT_MAX;
min_val_time = 0;
first = true; first = true;
} }
@ -289,17 +293,25 @@ struct STATS_ITEM {
max_val = v; max_val = v;
max_val_time = sim.now; max_val_time = sim.now;
} }
if (v < min_val) {
min_val = v;
min_val_time = sim.now;
}
} }
void sample_inc(double inc, bool collecting_stats) { void sample_inc(double inc, bool collecting_stats) {
sample(value+inc, collecting_stats); sample(value+inc, collecting_stats);
} }
void print() { void print(bool show_min) {
sample_inc(0, true); sample_inc(0, true);
double dt = sim.now - start_time; double dt = sim.now - start_time;
printf(" mean: %f\n", integral/dt); printf(" mean: %f\n", integral/dt);
if (show_min) {
printf(" min: %f\n", min_val);
} else {
printf(" max: %f\n", max_val); printf(" max: %f\n", max_val);
}
printf(" time of max: %s\n", time_str(max_val_time)); printf(" time of max: %s\n", time_str(max_val_time));
} }
}; };
@ -356,19 +368,19 @@ struct DFILE : EVENT {
void recover() { void recover() {
meta_chunk->recovery_plan(); meta_chunk->recovery_plan();
meta_chunk->recovery_action(); meta_chunk->recovery_action();
fault_tolerance.sample(meta_chunk->fault_tolerance, collecting_stats()); fault_tolerance.sample(meta_chunk->min_failures-1, collecting_stats());
} }
void print_stats() { void print_stats() {
printf("Statistics for file %d\n", id); printf("Statistics for file %d\n", id);
printf(" Server disk usage:\n"); printf(" Server disk usage:\n");
disk_usage.print(); disk_usage.print(false);
printf(" Upload rate:\n"); printf(" Upload rate:\n");
upload_rate.print(); upload_rate.print(false);
printf(" Download rate:\n"); printf(" Download rate:\n");
download_rate.print(); download_rate.print(false);
printf(" Fault tolerance level:\n"); printf(" Fault tolerance level:\n");
fault_tolerance.print(); fault_tolerance.print(true);
} }
}; };
@ -591,10 +603,10 @@ bool compare_cost(const DATA_UNIT* d1, const DATA_UNIT* d2) {
return d1->cost < d2->cost; return d1->cost < d2->cost;
} }
// sort by decreasing fault tolerance // sort by increase min_failures
// //
bool compare_fault_tolerance(const DATA_UNIT* d1, const DATA_UNIT* d2) { bool compare_min_failures(const DATA_UNIT* d1, const DATA_UNIT* d2) {
return d1->fault_tolerance > d2->fault_tolerance; return d1->min_failures < d2->min_failures;
} }
// Recovery logic: decide what to do in response to // Recovery logic: decide what to do in response to
@ -655,7 +667,6 @@ void META_CHUNK::recovery_plan() {
sort(present.begin(), present.end(), compare_cost); sort(present.begin(), present.end(), compare_cost);
present.resize(coding.n); present.resize(coding.n);
cost = 0; cost = 0;
fault_tolerance = INT_MAX;
for (i=0; i<present.size(); i++) { for (i=0; i<present.size(); i++) {
DATA_UNIT* c= present[i]; DATA_UNIT* c= present[i];
cost += c->cost; cost += c->cost;
@ -676,14 +687,6 @@ void META_CHUNK::recovery_plan() {
cost += c->cost; cost += c->cost;
} }
// compute our fault tolerance
//
sort(recoverable.begin(), recoverable.end(), compare_fault_tolerance);
fault_tolerance = 0;
for (i=0; i<j; i++) {
DATA_UNIT* c= recoverable[i];
fault_tolerance += c->fault_tolerance;
}
} else { } else {
status = UNRECOVERABLE; status = UNRECOVERABLE;
} }
@ -702,17 +705,17 @@ void CHUNK::recovery_plan() {
if (present_on_server) { if (present_on_server) {
status = PRESENT; status = PRESENT;
cost = 0; cost = 0;
fault_tolerance = INT_MAX; min_failures = INT_MAX;
} else if (hosts.size() > 0) { } else if (hosts.size() > 0) {
status = RECOVERABLE; status = RECOVERABLE;
cost = size; cost = size;
if ((int)(hosts.size()) < p.replication) { if ((int)(hosts.size()) < p.replication) {
data_needed = true; data_needed = true;
} }
fault_tolerance = hosts.size(); min_failures = hosts.size();
} else { } else {
status = UNRECOVERABLE; status = UNRECOVERABLE;
fault_tolerance = 0; min_failures = 0;
} }
#ifdef DEBUG_RECOVERY #ifdef DEBUG_RECOVERY
printf("chunk plan %s: status %s\n", name, status_str(status)); printf("chunk plan %s: status %s\n", name, status_str(status));
@ -720,6 +723,7 @@ void CHUNK::recovery_plan() {
} }
void META_CHUNK::recovery_action() { void META_CHUNK::recovery_action() {
unsigned int i;
if (data_now_present) { if (data_now_present) {
status = PRESENT; status = PRESENT;
} }
@ -728,7 +732,7 @@ void META_CHUNK::recovery_action() {
name, status_str(status), have_unrecoverable_children name, status_str(status), have_unrecoverable_children
); );
#endif #endif
for (unsigned i=0; i<children.size(); i++) { for (i=0; i<children.size(); i++) {
DATA_UNIT* c = children[i]; DATA_UNIT* c = children[i];
#ifdef DEBUG_RECOVERY #ifdef DEBUG_RECOVERY
printf(" child %s status %s in rec set %d\n", printf(" child %s status %s in rec set %d\n",
@ -751,6 +755,50 @@ void META_CHUNK::recovery_action() {
} }
c->recovery_action(); c->recovery_action();
} }
// because of recovery action, some of our children may have changed
// status and fault tolerance, source may have changed too.
// Recompute them.
//
vector<DATA_UNIT*> recoverable;
vector<DATA_UNIT*> present;
for (i=0; i<children.size(); i++) {
DATA_UNIT* c = children[i];
switch (c->status) {
case PRESENT:
present.push_back(c);
break;
case RECOVERABLE:
recoverable.push_back(c);
break;
}
}
if ((int)(present.size()) >= coding.n) {
status = PRESENT;
min_failures = INT_MAX;
} else if ((int)(present.size() + recoverable.size()) >= coding.n) {
status = RECOVERABLE;
// our min_failures is the least X such that some X host failures
// would make this node unrecoverable
//
sort(recoverable.begin(), recoverable.end(), compare_min_failures);
min_failures = 0;
unsigned int k = coding.n - present.size();
// we'd need to recover K recoverable children
unsigned int j = recoverable.size() - k + 1;
// a loss of J recoverable children would make this impossible
// the loss of J recoverable children would make us unrecoverable
// Sum the min_failures of the J children with smallest min_failures
//
for (i=0; i<j; i++) {
DATA_UNIT* c = recoverable[i];
printf(" Min failures of %s: %d\n", c->name, c->min_failures);
min_failures += c->min_failures;
}
printf(" our min failures: %d\n", min_failures);
}
} }
void CHUNK::recovery_action() { void CHUNK::recovery_action() {
@ -780,6 +828,8 @@ void CHUNK::recovery_action() {
} else { } else {
if (present_on_server) { if (present_on_server) {
present_on_server = false; present_on_server = false;
status = RECOVERABLE;
min_failures = p.replication;
#ifdef EVENT_DEBUG #ifdef EVENT_DEBUG
printf("%s: %s replicated, removing from server\n", now_str(), name); printf("%s: %s replicated, removing from server\n", now_str(), name);
#endif #endif