mirror of https://github.com/BOINC/boinc.git
447 lines
13 KiB
C
447 lines
13 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <time.h>
|
|
|
|
#include "speed_stats.h"
|
|
|
|
// Iteration count handed to double_flop_test() for the short calibration run
// in run_double_prec_test(); it is also the minimum iteration count used there.
#define D_FLOP_ITERS 1
|
|
// Iteration count handed to int_op_test() for the short calibration run
// in run_int_test(); it is also the minimum iteration count used there.
#define I_OP_ITERS 1
|
|
// Iteration count handed to bandwidth_test() for the short calibration run
// in run_mem_bandwidth_test(); it is also the minimum iteration count used there.
#define BANDWIDTH_ITERS 1
|
|
|
|
// Uncomment to compile the stand-alone main() driver guarded by RUN_TEST.
//#define RUN_TEST
|
|
|
|
#ifdef RUN_TEST

// Stand-alone benchmark driver, compiled only when RUN_TEST is defined.
// Runs the cache-size probe over a CACHE_MAX-int buffer, then the three
// throughput tests with 4 seconds requested per test.
int main( void ) {
    // The probe reports its findings on stdout; its return value is not used.
    (void)check_cache_size( CACHE_MAX );

    run_test_suite( 4 );

    return 0;
}

#endif
|
|
|
|
int check_cache_size( int mem_size ) {
|
|
int i, n, index, stride, *memBlock, logStride, logCache;
|
|
double **results;
|
|
int steps, tsteps, csize, limit, temp, cind, sind;
|
|
clock_t total_sec, sec;
|
|
double secs, nanosecs, temp2;
|
|
int not_found;
|
|
|
|
logStride = (int)(log(STRIDE_MAX/STRIDE_MIN)/log(2))+1;
|
|
logCache = (int)(log(CACHE_MAX/CACHE_MIN)/log(2))+1;
|
|
|
|
printf( "Test will take about %.2f seconds.\n", SECS_PER_RUN*logStride*logCache );
|
|
results = (double **)malloc( sizeof( double * )*logStride );
|
|
|
|
for( i=0;i<logStride;i++ ) {
|
|
results[i] = (double *)malloc( sizeof( double )*logCache );
|
|
for( n=0;n<logCache;n++ ) {
|
|
results[i][n] = 1.0;
|
|
}
|
|
}
|
|
|
|
printf( "|" );
|
|
for( i=0;i<logCache;i++ ) {
|
|
printf( "-" );
|
|
}
|
|
|
|
printf( "|\n" );
|
|
memBlock = (int *)malloc( sizeof( int )*mem_size );
|
|
printf( " " );
|
|
|
|
for( csize=CACHE_MIN,cind=0;csize<=CACHE_MAX;csize*=2,cind++ ) {
|
|
for (stride = STRIDE_MIN,sind=0; stride<=STRIDE_MAX; stride*=2,sind++ ) {
|
|
limit = csize - stride + 1; // cache size this loop
|
|
|
|
steps = 0;
|
|
sec = clock();
|
|
do { // repeat until collect 1 second
|
|
for (i = SAMPLE * stride; i != 0; i-- ) { // larger sample
|
|
for (index = 0; index < limit; index += stride) {
|
|
memBlock[index]++; // cache access
|
|
}
|
|
}
|
|
steps++; // count while loop iterations
|
|
} while (clock() < sec+(CLOCKS_PER_SEC*SECS_PER_RUN)); // until collect 1 second
|
|
total_sec = clock()-sec;
|
|
|
|
// Repeat empty loop to loop subtract overhead
|
|
tsteps = 0; // used to match no. while iterations
|
|
temp = 0;
|
|
sec = clock();
|
|
do { // repeat until same no. iterations as above
|
|
for (i = SAMPLE * stride; i != 0; i-- ) { // larger sample
|
|
for (index = 0; index < limit; index += stride) {
|
|
temp += index; // dummy code
|
|
}
|
|
}
|
|
tsteps++; // count while iterations
|
|
} while (tsteps < steps); // until = no. iterations
|
|
total_sec -= clock()-sec;
|
|
|
|
secs = ((double)total_sec) / CLOCKS_PER_SEC;
|
|
|
|
if( temp == 3 ) {
|
|
printf( "Howdy\n" );
|
|
}
|
|
|
|
nanosecs = (double) secs * 1e9 / (steps * SAMPLE * stride * ((limit - 1) / stride + 1));
|
|
results[sind][cind] = nanosecs;
|
|
|
|
//if( stride==STRIDE_MIN ) printf("\n");
|
|
printf(
|
|
"Size (bytes): %7d Stride (bytes): %4d read+write: %4.0f ns, %d %d\n",
|
|
csize * sizeof (int), stride * sizeof( int ), nanosecs, sind, cind
|
|
);
|
|
}
|
|
printf( "." );
|
|
fflush( stdout );
|
|
}
|
|
printf( "\n" );
|
|
|
|
for( i=0;i<logStride;i++ ) {
|
|
for( n=0;n<logCache;n++ ) {
|
|
printf ("%4.0f ", results[i][n]);
|
|
}
|
|
printf( "\n" );
|
|
}
|
|
|
|
for( i=0;i<logStride;i++ ) {
|
|
for( n=logCache;n>0;n-- ) {
|
|
results[i][n] /= results[i][n-1];
|
|
}
|
|
}
|
|
|
|
for( i=0;i<logCache;i++ ) {
|
|
temp2 = 0;
|
|
for( n=0;n<logStride;n++ ) {
|
|
temp2 += results[n][i];
|
|
}
|
|
results[0][i] = temp2/logStride;
|
|
}
|
|
|
|
printf( "\n" );
|
|
for( i=0;i<logStride;i++ ) {
|
|
for( n=1;n<logCache;n++ ) {
|
|
printf ("%1.3f ", results[i][n]);
|
|
}
|
|
printf( "\n" );
|
|
}
|
|
|
|
csize=CACHE_MIN;
|
|
i = 1;
|
|
not_found = 2;
|
|
while( not_found && i < logCache ) {
|
|
if( not_found == 1 && results[0][i] > 1.5 ) {
|
|
printf( "Level 2 Data Cache is %d KB.\n", csize*sizeof(int)/CACHE_MIN );
|
|
not_found = 0;
|
|
}
|
|
if( not_found == 2 && results[0][i] > 1.5 ) {
|
|
printf( "Level 1 Data Cache is %d KB.\n", csize*sizeof(int)/CACHE_MIN );
|
|
not_found = 1;
|
|
}
|
|
i++;
|
|
csize *= 2;
|
|
}
|
|
|
|
free( memBlock );
|
|
for( i=0;i<logStride;i++ )
|
|
free( results[i] );
|
|
|
|
free( results );
|
|
|
|
return 0;
|
|
}
|
|
|
|
double run_double_prec_test( double num_secs ) {
|
|
int df_test_time, df_iters;
|
|
double df_secs;
|
|
|
|
// Start by doing some quick timing tests for rough calibration
|
|
df_test_time = (int)double_flop_test( D_FLOP_ITERS, 0 );
|
|
if( df_test_time <= 0 ) df_test_time = 1;
|
|
df_secs = (double)df_test_time/CLOCKS_PER_SEC;
|
|
|
|
// Calculate the # of iterations based on these tests
|
|
df_iters = (int)(D_FLOP_ITERS*num_secs/df_secs);
|
|
|
|
if( df_iters > D_FLOP_ITERS ) { // no need to redo test
|
|
df_test_time = (int)double_flop_test( df_iters, 0 );
|
|
} else {
|
|
df_iters = D_FLOP_ITERS;
|
|
}
|
|
|
|
df_secs = (double)df_test_time/CLOCKS_PER_SEC;
|
|
|
|
return 1000000*df_iters/df_secs;
|
|
}
|
|
|
|
double run_int_test( double num_secs ) {
|
|
int int_test_time, int_iters;
|
|
double int_secs;
|
|
|
|
// Start by doing some quick timing tests for rough calibration
|
|
int_test_time = (int)int_op_test( I_OP_ITERS, 0 );
|
|
if( int_test_time <= 0 ) int_test_time = 1;
|
|
int_secs = (double)int_test_time/CLOCKS_PER_SEC;
|
|
|
|
// Calculate the # of iterations based on these tests
|
|
int_iters = (int)(I_OP_ITERS*num_secs/int_secs);
|
|
|
|
if( int_iters > I_OP_ITERS ) { // no need to redo test
|
|
int_test_time = (int)int_op_test( int_iters, 0 );
|
|
} else {
|
|
int_iters = I_OP_ITERS;
|
|
}
|
|
|
|
int_secs = (double)int_test_time/CLOCKS_PER_SEC;
|
|
|
|
return 1000000*int_iters/int_secs;
|
|
}
|
|
|
|
double run_mem_bandwidth_test( double num_secs ) {
|
|
int bw_test_time;
|
|
double bw_secs;
|
|
int bw_iters;
|
|
|
|
// Start by doing some quick timing tests for rough calibration
|
|
bw_test_time = (int)bandwidth_test( BANDWIDTH_ITERS, 0 );
|
|
if( bw_test_time <= 0 ) bw_test_time = 1;
|
|
bw_secs = (double)bw_test_time/CLOCKS_PER_SEC;
|
|
|
|
// Calculate the # of iterations based on these tests
|
|
bw_iters = (int)(BANDWIDTH_ITERS*num_secs/bw_secs);
|
|
|
|
if( bw_iters > BANDWIDTH_ITERS ) { // no need to redo test
|
|
bw_test_time = (int)bandwidth_test( bw_iters, 0 );
|
|
} else {
|
|
bw_iters = BANDWIDTH_ITERS;
|
|
}
|
|
|
|
bw_secs = (double)bw_test_time/CLOCKS_PER_SEC;
|
|
return 1000000*bw_iters/bw_secs;
|
|
}
|
|
|
|
// Run the floating point, integer and memory bandwidth benchmarks in that
// order and print one result line for each on stdout.
//
// num_secs_per_test: duration requested from each of the three benchmarks,
//                    in seconds.
void run_test_suite( double num_secs_per_test ) {
    double mega_flops, mega_int_ops, mega_bytes;

    printf(
        "Running tests. This will take about %.1lf seconds.\n\n",
        num_secs_per_test*3
    );

    mega_flops = run_double_prec_test(num_secs_per_test)/1000000;
    printf( "Speed: %.5lf million flops/sec\n\n", mega_flops );

    mega_int_ops = run_int_test(num_secs_per_test)/1000000;
    printf( "Speed: %.5lf million integer ops/sec\n\n", mega_int_ops );

    // The bandwidth result is scaled by 12 doubles' worth of bytes
    mega_bytes = 12*sizeof(double)*run_mem_bandwidth_test(num_secs_per_test)/1000000;
    printf( "Speed: %.5lf MB/sec\n\n", mega_bytes );
}
|
|
|
|
// One iteration == D_LOOP_ITERS (1,000,000) floating point operations
|
|
|
|
clock_t double_flop_test( int iterations, int print_debug ) {
|
|
double a[NUM_DOUBLES],t1,t2;
|
|
double temp;
|
|
clock_t time_start, time_total;
|
|
int i,j,k,calc_error;
|
|
|
|
// Initialize the array
|
|
a[0] = 1;
|
|
for( i=1;i<NUM_DOUBLES;i++ )
|
|
a[i] = a[i-1]/2.0;
|
|
|
|
// Ideally, the array "a" will fit into cache, meaning this test doesn't
|
|
// really include memory accesses
|
|
time_start = clock();
|
|
for( i=0;i<iterations;i++ ) {
|
|
for( j=0;j<D_LOOP_ITERS;j+=((NUM_DOUBLES-1)*5) ) {
|
|
temp = 1;
|
|
t1 = a[0];
|
|
// These tests do a pretty good job of breaking the processor pipeline,
|
|
// since the result from all but one of the lines is required for the
|
|
// next line.
|
|
|
|
for( k=0;k<NUM_DOUBLES-1;k++ ) {
|
|
t2 = a[k+1];
|
|
t1 = t1 * t2; // 1st FLOP
|
|
temp = temp + temp; // 2nd FLOP
|
|
t1 = t1 * temp; // 3rd FLOP
|
|
t1 = t1 + t2; // 4th FLOP
|
|
t1 = t1 / 1.5; // 5th FLOP
|
|
a[k] = t1;
|
|
t1 = t2;
|
|
}
|
|
}
|
|
}
|
|
|
|
time_total = clock()-time_start;
|
|
|
|
calc_error = 0;
|
|
temp = 1;
|
|
// Check to make sure all the values are the same
|
|
for( i=0;i<NUM_DOUBLES;i++ ) {
|
|
if( (float)a[i] != (float)temp ) {
|
|
calc_error = 1;
|
|
}
|
|
|
|
temp /= 3;
|
|
}
|
|
|
|
if( calc_error ) printf( "Calc error\n" );
|
|
|
|
if( print_debug ) {
|
|
for( i=0;i<NUM_DOUBLES;i++ ) {
|
|
printf( "%3d: %.50lf\n", i, a[i] );
|
|
}
|
|
}
|
|
|
|
return time_total;
|
|
}
|
|
|
|
// One iteration == 1,000,000 integer operations
|
|
|
|
clock_t int_op_test( int iterations, int print_debug ) {
|
|
int a[NUM_INTS], temp;
|
|
clock_t time_start, time_total;
|
|
int i,j,k,calc_error;
|
|
|
|
a[0] = 1;
|
|
for( i=1;i<NUM_INTS;i++ ) {
|
|
a[i] = 2*a[i-1];
|
|
}
|
|
|
|
time_start = clock();
|
|
for( i=0;i<iterations;i++ ) {
|
|
for( j=0;j<I_LOOP_ITERS/(NUM_INTS*9);j++ ) {
|
|
for( k=0;k<NUM_INTS;k++ ) {
|
|
a[k] *= 3; // 1 int ops
|
|
}
|
|
for( k=NUM_INTS-1;k>=0;k-- ) {
|
|
a[k] += 6; // 2 int ops
|
|
}
|
|
for( k=0;k<NUM_INTS;k++ ) {
|
|
a[k] /= 3; // 3 int ops
|
|
}
|
|
for( k=NUM_INTS-1;k>=0;k-- ) {
|
|
a[k] -= 2; // 4 int ops
|
|
}
|
|
for( k=NUM_INTS-1;k>0;k-- ) {
|
|
a[k] -= a[k-1]; // 5 int ops
|
|
}
|
|
for( k=1;k<NUM_INTS;k++ ) {
|
|
a[k] = 2*a[k-1]; // 6 int ops
|
|
}
|
|
for( k=NUM_INTS-1;k>0;k-- ) {
|
|
if( a[k-1] != 0 ) // 7 int ops
|
|
a[k] /= a[k-1]; // 8 int ops
|
|
}
|
|
for( k=1;k<NUM_INTS;k++ ) {
|
|
a[k] = 2*a[k-1]; // 9 int ops
|
|
}
|
|
}
|
|
}
|
|
|
|
time_total = clock()-time_start;
|
|
|
|
calc_error = 0;
|
|
temp = 1;
|
|
// Check to make sure all the values are the same
|
|
for( i=0;i<NUM_INTS;i++ ) {
|
|
if( a[i] != temp ) {
|
|
calc_error = 1;
|
|
}
|
|
|
|
temp *= 2;
|
|
}
|
|
|
|
if( calc_error )
|
|
printf( "Calc error\n" );
|
|
|
|
if( print_debug ) {
|
|
for( i=0;i<NUM_INTS;i++ ) {
|
|
printf( "%3d: %d\n", i, a[i] );
|
|
}
|
|
}
|
|
|
|
return time_total;
|
|
}
|
|
|
|
// One iteration == Read of 6M*sizeof(double), Write of 6M*sizeof(double)
|
|
|
|
clock_t bandwidth_test( int iterations, int print_debug ) {
|
|
// a, b, and c are arrays of doubles we will copy around to test memory bandwidth
|
|
double *a, *b, *c;
|
|
// aVal and bVal are the values of all elements of a and b.
|
|
// These values use every other bit,
|
|
// so that if there is a HW problem it will easily manifest itself
|
|
|
|
double aVal, bVal;
|
|
// Start and stop times for the clock
|
|
clock_t time_start, time_total;
|
|
int i,j,copy_error;
|
|
|
|
// These are doubles in order to make full use of bus and instruction bandwidth
|
|
a = (double *)malloc( MEM_SIZE * sizeof( double ) );
|
|
b = (double *)malloc( MEM_SIZE * sizeof( double ) );
|
|
c = (double *)malloc( MEM_SIZE * sizeof( double ) );
|
|
|
|
// These values use all the bits in a floating point number (Investigate these values)
|
|
aVal = (-2.0/3.0)*pow(2.0,-341.0);
|
|
bVal = (1.0/3.0)*pow(2.0,342.0);
|
|
|
|
// We add i to each value to prevent compiler optimizations of the copy
|
|
for( i=0;i<MEM_SIZE;i++ ) {
|
|
a[i] = aVal+i; b[i] = bVal+i; c[i] = 1.0;
|
|
}
|
|
|
|
// Start the clock
|
|
time_start = clock();
|
|
|
|
// 6 read, 6 write operations per iteration which will preserve a and b
|
|
for( i=0;i<iterations*2;i++ ) {
|
|
for( j=0;j<MEM_SIZE;j++ ) {
|
|
c[j] = a[j];
|
|
a[j] = b[j];
|
|
b[j] = c[j];
|
|
}
|
|
}
|
|
|
|
// Stop the clock
|
|
time_total = clock();
|
|
|
|
// Accomodate for the possibility of clock wraparound
|
|
if( time_total > time_start ) {
|
|
time_total -= time_start;
|
|
} else {
|
|
time_total -= time_start;
|
|
}
|
|
|
|
copy_error = 0;
|
|
for( i=0;i<MEM_SIZE;i++ ) {
|
|
if( a[i] != aVal+i || b[i] != bVal+i ) {
|
|
copy_error = 1;
|
|
}
|
|
}
|
|
|
|
if( copy_error ) {
|
|
printf( "Copy error\n" );
|
|
}
|
|
|
|
free( a );
|
|
free( b );
|
|
free( c );
|
|
|
|
return time_total;
|
|
}
|