mirror of https://github.com/BOINC/boinc.git
storage simulator: compute fault tolerance correctly
-This line, and those below, will be ignored-- M locale/templates/BOINC-Project-Generic.pot M ssim/ssim.cpp M checkin_notes svn path=/trunk/boinc/; revision=25071
This commit is contained in:
parent
e2a4f3b535
commit
72ddd4fa3e
|
@ -587,3 +587,9 @@ David 15 Jan 2012
|
||||||
str_util.cpp
|
str_util.cpp
|
||||||
ssim/
|
ssim/
|
||||||
ssim.cpp
|
ssim.cpp
|
||||||
|
|
||||||
|
David 15 Jan 2012
|
||||||
|
- storage simulator: compute fault tolerance correctly
|
||||||
|
|
||||||
|
ssim/
|
||||||
|
ssim.cpp
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -209,7 +209,7 @@ struct DATA_UNIT {
|
||||||
bool data_now_present;
|
bool data_now_present;
|
||||||
bool data_needed;
|
bool data_needed;
|
||||||
double cost;
|
double cost;
|
||||||
int fault_tolerance;
|
int min_failures;
|
||||||
// min # of host failures that would make this unrecoverable
|
// min # of host failures that would make this unrecoverable
|
||||||
char name[64];
|
char name[64];
|
||||||
};
|
};
|
||||||
|
@ -257,6 +257,8 @@ struct STATS_ITEM {
|
||||||
double integral;
|
double integral;
|
||||||
double max_val;
|
double max_val;
|
||||||
double max_val_time;
|
double max_val_time;
|
||||||
|
double min_val;
|
||||||
|
double min_val_time;
|
||||||
double prev_t;
|
double prev_t;
|
||||||
double start_time;
|
double start_time;
|
||||||
bool first;
|
bool first;
|
||||||
|
@ -267,6 +269,8 @@ struct STATS_ITEM {
|
||||||
integral = 0;
|
integral = 0;
|
||||||
max_val = 0;
|
max_val = 0;
|
||||||
max_val_time = 0;
|
max_val_time = 0;
|
||||||
|
min_val = INT_MAX;
|
||||||
|
min_val_time = 0;
|
||||||
first = true;
|
first = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -289,17 +293,25 @@ struct STATS_ITEM {
|
||||||
max_val = v;
|
max_val = v;
|
||||||
max_val_time = sim.now;
|
max_val_time = sim.now;
|
||||||
}
|
}
|
||||||
|
if (v < min_val) {
|
||||||
|
min_val = v;
|
||||||
|
min_val_time = sim.now;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void sample_inc(double inc, bool collecting_stats) {
|
void sample_inc(double inc, bool collecting_stats) {
|
||||||
sample(value+inc, collecting_stats);
|
sample(value+inc, collecting_stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
void print() {
|
void print(bool show_min) {
|
||||||
sample_inc(0, true);
|
sample_inc(0, true);
|
||||||
double dt = sim.now - start_time;
|
double dt = sim.now - start_time;
|
||||||
printf(" mean: %f\n", integral/dt);
|
printf(" mean: %f\n", integral/dt);
|
||||||
|
if (show_min) {
|
||||||
|
printf(" min: %f\n", min_val);
|
||||||
|
} else {
|
||||||
printf(" max: %f\n", max_val);
|
printf(" max: %f\n", max_val);
|
||||||
|
}
|
||||||
printf(" time of max: %s\n", time_str(max_val_time));
|
printf(" time of max: %s\n", time_str(max_val_time));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -356,19 +368,19 @@ struct DFILE : EVENT {
|
||||||
void recover() {
|
void recover() {
|
||||||
meta_chunk->recovery_plan();
|
meta_chunk->recovery_plan();
|
||||||
meta_chunk->recovery_action();
|
meta_chunk->recovery_action();
|
||||||
fault_tolerance.sample(meta_chunk->fault_tolerance, collecting_stats());
|
fault_tolerance.sample(meta_chunk->min_failures-1, collecting_stats());
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_stats() {
|
void print_stats() {
|
||||||
printf("Statistics for file %d\n", id);
|
printf("Statistics for file %d\n", id);
|
||||||
printf(" Server disk usage:\n");
|
printf(" Server disk usage:\n");
|
||||||
disk_usage.print();
|
disk_usage.print(false);
|
||||||
printf(" Upload rate:\n");
|
printf(" Upload rate:\n");
|
||||||
upload_rate.print();
|
upload_rate.print(false);
|
||||||
printf(" Download rate:\n");
|
printf(" Download rate:\n");
|
||||||
download_rate.print();
|
download_rate.print(false);
|
||||||
printf(" Fault tolerance level:\n");
|
printf(" Fault tolerance level:\n");
|
||||||
fault_tolerance.print();
|
fault_tolerance.print(true);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -591,10 +603,10 @@ bool compare_cost(const DATA_UNIT* d1, const DATA_UNIT* d2) {
|
||||||
return d1->cost < d2->cost;
|
return d1->cost < d2->cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
// sort by decreasing fault tolerance
|
// sort by increasing min_failures
|
||||||
//
|
//
|
||||||
bool compare_fault_tolerance(const DATA_UNIT* d1, const DATA_UNIT* d2) {
|
bool compare_min_failures(const DATA_UNIT* d1, const DATA_UNIT* d2) {
|
||||||
return d1->fault_tolerance > d2->fault_tolerance;
|
return d1->min_failures < d2->min_failures;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Recovery logic: decide what to do in response to
|
// Recovery logic: decide what to do in response to
|
||||||
|
@ -655,7 +667,6 @@ void META_CHUNK::recovery_plan() {
|
||||||
sort(present.begin(), present.end(), compare_cost);
|
sort(present.begin(), present.end(), compare_cost);
|
||||||
present.resize(coding.n);
|
present.resize(coding.n);
|
||||||
cost = 0;
|
cost = 0;
|
||||||
fault_tolerance = INT_MAX;
|
|
||||||
for (i=0; i<present.size(); i++) {
|
for (i=0; i<present.size(); i++) {
|
||||||
DATA_UNIT* c= present[i];
|
DATA_UNIT* c= present[i];
|
||||||
cost += c->cost;
|
cost += c->cost;
|
||||||
|
@ -676,14 +687,6 @@ void META_CHUNK::recovery_plan() {
|
||||||
cost += c->cost;
|
cost += c->cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute our fault tolerance
|
|
||||||
//
|
|
||||||
sort(recoverable.begin(), recoverable.end(), compare_fault_tolerance);
|
|
||||||
fault_tolerance = 0;
|
|
||||||
for (i=0; i<j; i++) {
|
|
||||||
DATA_UNIT* c= recoverable[i];
|
|
||||||
fault_tolerance += c->fault_tolerance;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
status = UNRECOVERABLE;
|
status = UNRECOVERABLE;
|
||||||
}
|
}
|
||||||
|
@ -702,17 +705,17 @@ void CHUNK::recovery_plan() {
|
||||||
if (present_on_server) {
|
if (present_on_server) {
|
||||||
status = PRESENT;
|
status = PRESENT;
|
||||||
cost = 0;
|
cost = 0;
|
||||||
fault_tolerance = INT_MAX;
|
min_failures = INT_MAX;
|
||||||
} else if (hosts.size() > 0) {
|
} else if (hosts.size() > 0) {
|
||||||
status = RECOVERABLE;
|
status = RECOVERABLE;
|
||||||
cost = size;
|
cost = size;
|
||||||
if ((int)(hosts.size()) < p.replication) {
|
if ((int)(hosts.size()) < p.replication) {
|
||||||
data_needed = true;
|
data_needed = true;
|
||||||
}
|
}
|
||||||
fault_tolerance = hosts.size();
|
min_failures = hosts.size();
|
||||||
} else {
|
} else {
|
||||||
status = UNRECOVERABLE;
|
status = UNRECOVERABLE;
|
||||||
fault_tolerance = 0;
|
min_failures = 0;
|
||||||
}
|
}
|
||||||
#ifdef DEBUG_RECOVERY
|
#ifdef DEBUG_RECOVERY
|
||||||
printf("chunk plan %s: status %s\n", name, status_str(status));
|
printf("chunk plan %s: status %s\n", name, status_str(status));
|
||||||
|
@ -720,6 +723,7 @@ void CHUNK::recovery_plan() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void META_CHUNK::recovery_action() {
|
void META_CHUNK::recovery_action() {
|
||||||
|
unsigned int i;
|
||||||
if (data_now_present) {
|
if (data_now_present) {
|
||||||
status = PRESENT;
|
status = PRESENT;
|
||||||
}
|
}
|
||||||
|
@ -728,7 +732,7 @@ void META_CHUNK::recovery_action() {
|
||||||
name, status_str(status), have_unrecoverable_children
|
name, status_str(status), have_unrecoverable_children
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
for (unsigned i=0; i<children.size(); i++) {
|
for (i=0; i<children.size(); i++) {
|
||||||
DATA_UNIT* c = children[i];
|
DATA_UNIT* c = children[i];
|
||||||
#ifdef DEBUG_RECOVERY
|
#ifdef DEBUG_RECOVERY
|
||||||
printf(" child %s status %s in rec set %d\n",
|
printf(" child %s status %s in rec set %d\n",
|
||||||
|
@ -751,6 +755,50 @@ void META_CHUNK::recovery_action() {
|
||||||
}
|
}
|
||||||
c->recovery_action();
|
c->recovery_action();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// because of recovery action, some of our children may have changed
|
||||||
|
// status and fault tolerance, so ours may have changed too.
|
||||||
|
// Recompute them.
|
||||||
|
//
|
||||||
|
vector<DATA_UNIT*> recoverable;
|
||||||
|
vector<DATA_UNIT*> present;
|
||||||
|
for (i=0; i<children.size(); i++) {
|
||||||
|
DATA_UNIT* c = children[i];
|
||||||
|
switch (c->status) {
|
||||||
|
case PRESENT:
|
||||||
|
present.push_back(c);
|
||||||
|
break;
|
||||||
|
case RECOVERABLE:
|
||||||
|
recoverable.push_back(c);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((int)(present.size()) >= coding.n) {
|
||||||
|
status = PRESENT;
|
||||||
|
min_failures = INT_MAX;
|
||||||
|
} else if ((int)(present.size() + recoverable.size()) >= coding.n) {
|
||||||
|
status = RECOVERABLE;
|
||||||
|
|
||||||
|
// our min_failures is the least X such that some X host failures
|
||||||
|
// would make this node unrecoverable
|
||||||
|
//
|
||||||
|
sort(recoverable.begin(), recoverable.end(), compare_min_failures);
|
||||||
|
min_failures = 0;
|
||||||
|
unsigned int k = coding.n - present.size();
|
||||||
|
// we'd need to recover K recoverable children
|
||||||
|
unsigned int j = recoverable.size() - k + 1;
|
||||||
|
// a loss of J recoverable children would make this impossible
|
||||||
|
|
||||||
|
// the loss of J recoverable children would make us unrecoverable
|
||||||
|
// Sum the min_failures of the J children with smallest min_failures
|
||||||
|
//
|
||||||
|
for (i=0; i<j; i++) {
|
||||||
|
DATA_UNIT* c = recoverable[i];
|
||||||
|
printf(" Min failures of %s: %d\n", c->name, c->min_failures);
|
||||||
|
min_failures += c->min_failures;
|
||||||
|
}
|
||||||
|
printf(" our min failures: %d\n", min_failures);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CHUNK::recovery_action() {
|
void CHUNK::recovery_action() {
|
||||||
|
@ -780,6 +828,8 @@ void CHUNK::recovery_action() {
|
||||||
} else {
|
} else {
|
||||||
if (present_on_server) {
|
if (present_on_server) {
|
||||||
present_on_server = false;
|
present_on_server = false;
|
||||||
|
status = RECOVERABLE;
|
||||||
|
min_failures = p.replication;
|
||||||
#ifdef EVENT_DEBUG
|
#ifdef EVENT_DEBUG
|
||||||
printf("%s: %s replicated, removing from server\n", now_str(), name);
|
printf("%s: %s replicated, removing from server\n", now_str(), name);
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue