svn path=/trunk/boinc/; revision=21726

This commit is contained in:
David Anderson 2010-06-10 17:44:32 +00:00
parent ca6fa1cda3
commit 637793aeea
3 changed files with 592 additions and 56 deletions

View File

@ -1,37 +1,445 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* Template project which demonstrates the basics on how to setup a project
* example application.
* Host code.
*/
/*
* cuda.cu
* Copyright (C) 2010 Tuan Le
* tuanle86@berkeley.edu
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
#include "boinc_win.h"
#else
#include "config.h"
#include <cstdio>
#include <cctype>
#include <ctime>
#include <cstring>
#include <cstdlib>
#include <csignal>
#include <unistd.h>
#endif
// includes, kernels
#include <cuda_runtime.h>
#include <cublas.h>
#include "cuda_config.h"
#include "cuda_kernel.cu"
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
#include "str_util.h"
#include "util.h"
#include "filesys.h"
#include "boinc_api.h"
#include "mfile.h"
#include "graphics2.h"
#ifdef APP_GRAPHICS
#include "uc2.h"
UC_SHMEM* shmem;
#endif
using std::string;
#define CHECKPOINT_FILE "matrix_inversion_state"
#define INPUT_FILENAME "input"
#define OUTPUT_FILENAME "output"
#define MATRIX_SIZE 10
// execute the kernel NUM_ITERATIONS times
#define NUM_ITERATIONS 19
bool run_slow = false;
bool early_exit = false;
bool early_crash = false;
bool early_sleep = false;
double cpu_time = 20, comp_result;
// do a billion floating-point ops
// (note: I needed to add an arg to this;
// otherwise the MS C++ compiler optimizes away
// all but the first call to it!)
//
static double do_a_giga_flop(int foo) {
double x = 3.14159*foo;
int i;
for (i=0; i<500000000; i++) {
x += 5.12313123;
x *= 0.5398394834;
}
return x;
}
int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) {
int retval;
string resolved_name;
FILE* f = fopen("temp", "w");
if (!f) return 1;
fprintf(f, "%d", n); //write inversion number
fprintf(f, " ");
fprintf(f, "%d", dimension); //write dimension
fprintf(f, " ");
for (int i=0;i<dimension*dimension;++i) {
fprintf(f, " ");
fprintf(f, "%f", h_idata[i]);
}
fclose(f);
retval = mf.flush(); //not really necessary since we have not yet written anything to output file when doing checkpointing
if (retval) return retval;
boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name); //resolved_name is a string object
retval = boinc_rename("temp", resolved_name.c_str()); //c_str() will convert a string object to a char string with null terminator equivalent.
// because we do rename. Thus temp does not appear, but the CHECKPOINT_FILE appears instead on the disk.
if (retval) return retval;
return 0; //return 0 to indicate success.
}
#ifdef APP_GRAPHICS
void update_shmem() {
if (!shmem) return;
// always do this; otherwise a graphics app will immediately
// assume we're not alive
shmem->update_time = dtime();
// Check whether a graphics app is running,
// and don't bother updating shmem if so.
// This doesn't matter here,
// but may be worth doing if updating shmem is expensive.
//
if (shmem->countdown > 0) {
// the graphics app sets this to 5 every time it renders a frame
shmem->countdown--;
} else {
return;
}
shmem->fraction_done = boinc_get_fraction_done();
shmem->cpu_time = boinc_worker_thread_cpu_time();;
boinc_get_status(&shmem->status);
}
#endif
void print(REAL *A, int n); //on commandline
void generateRandomInputFile(int n);
int getMatrixDimension(FILE *infile);
int countElementsInMatrix(FILE *infile);
void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata);
void printToFile(MFILE *out, float *h_odata, int dimension);
int main(int argc, char** argv)
{
int i, retval, lastInversion=0, checkpointExists=0, dimension=0;
double fd;
char input_path[512], output_path[512], chkpt_path[512];
REAL* h_idata;
unsigned int mem_size;
MFILE out;
FILE* state, *infile;
//generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually
for (i=0; i<argc; i++) {
if (!strcmp(argv[i], "-early_exit")) early_exit = true;
if (!strcmp(argv[i], "-early_crash")) early_crash = true;
if (!strcmp(argv[i], "-early_sleep")) early_sleep = true;
if (!strcmp(argv[i], "-run_slow")) run_slow = true;
if (!strcmp(argv[i], "-cpu_time")) {
cpu_time = atof(argv[++i]);
}
}
retval = boinc_init();
if (retval) {
fprintf(stderr, "%s boinc_init returned %d\n",
boinc_msg_prefix(), retval
);
exit(retval);
}
// open the input file (resolve logical name first)
//
boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
infile = boinc_fopen(input_path, "r");
if (!infile) {
fprintf(stderr,
"%s Couldn't find input file, resolved name %s.\n",
boinc_msg_prefix(), input_path
);
getchar();
exit(-1);
}
boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
// See if there's a valid checkpoint file.
// If so retrieve the current matrix and inversion number
//
boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
state = boinc_fopen(chkpt_path, "r");
if (state) {
printf("Checkpoint file is detected. Read from checkpoint file ... \n");
checkpointExists=fscanf(state, "%d", &lastInversion);
if (checkpointExists == 1) {
printf("Last inversion # is : %d\n",lastInversion);
fscanf(state,"%d",&dimension);
cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
for (int i=0;i<dimension*dimension;++i) {
fscanf(state, "%f", &h_idata[i]);
//printf("--%f\n",h_idata[i]);
}
}
fclose(state);
} else {
printf("There's no valid checkpoint file!\n");
}
retval = out.open(output_path, "wb");
if (retval) {
fprintf(stderr, "%s APP: matrix_inversion output open failed:\n",
boinc_msg_prefix()
);
fprintf(stderr, "%s resolved name %s, retval %d\n",
boinc_msg_prefix(), output_path, retval
);
perror("open");
exit(1);
}
#ifdef APP_GRAPHICS
// create shared mem segment for graphics, and arrange to update it
//
shmem = (UC_SHMEM*)boinc_graphics_make_shmem("uppercase", sizeof(UC_SHMEM));
if (!shmem) {
fprintf(stderr, "%s failed to create shared mem segment\n",
boinc_msg_prefix()
);
}
update_shmem();
boinc_register_timer_callback(update_shmem);
#endif
if (checkpointExists != 1) {
dimension=getMatrixDimension(infile);
printf("Matrix dimension: %d\n",dimension);
cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
fetchElementsIntoHostMemory(infile,h_idata);
out.printf("\n----------------- Before being inversed ----------------\n\n");
printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",NUM_ITERATIONS);
} else {
out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",lastInversion);
printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",NUM_ITERATIONS-lastInversion,lastInversion+1);
}
printToFile(&out,h_idata,dimension);
for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
invert(h_idata,dimension);
printf("Finish inversion #%d\n",i);
if (run_slow) {
boinc_sleep(1.);
}
if (early_exit && i>30) {
exit(-10);
}
if (early_crash && i>30) {
boinc_crash();
}
if (early_sleep && i>30) {
g_sleep = true;
while (1) boinc_sleep(1);
}
if (boinc_time_to_checkpoint()) {
//if (i==7) {
printf("Perform checkpointing at inversion # %d\n",i);
//we'll need to write the current matrix to the state file.
retval = do_checkpoint(out, i, h_idata, dimension);
if (retval) {
fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n",
boinc_msg_prefix(), retval
);
exit(retval);
}
boinc_checkpoint_completed();
}
fd = i/NUM_ITERATIONS;
if (cpu_time) fd /= 2;
boinc_fraction_done(fd);
}
out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS);
printToFile(&out,h_idata,dimension);
cudaFreeHost( h_idata );
retval = out.flush(); //force the output file to be closed.
if (retval) {
fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n",
boinc_msg_prefix(), retval
);
exit(1);
}
// burn up some CPU time if needed
//
if (cpu_time) {
double start = dtime();
for (int i=0; ; i++) {
double e = dtime()-start;
if (e > cpu_time) break;
fd = .5 + .5*(e/cpu_time);
boinc_fraction_done(fd);
if (boinc_time_to_checkpoint()) {
retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension);
if (retval) {
fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n",
boinc_msg_prefix(), retval
);
exit(1);
}
boinc_checkpoint_completed();
}
comp_result = do_a_giga_flop(i);
}
}
boinc_fraction_done(1);
#ifdef APP_GRAPHICS
update_shmem();
#endif
printf("Done!");
getchar();
boinc_finish(0);
}
#ifdef _WIN32
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
LPSTR command_line;
char* argv[100];
int argc;
command_line = GetCommandLine();
argc = parse_command_line( command_line, argv );
return main(argc, argv);
}
#endif
void print(REAL *h_idata, int n) {
int j=0;
for (int i=0;i<n*n;++i) {
printf("%f ",h_idata[i]);
if (j+1==n) {
printf("\n");
j=0;
} else {
++j;
}
}
}
// nxn matrix
void generateRandomInputFile(int n) {
FILE *infile;
infile=fopen(INPUT_FILENAME,"w");
REAL *h_idata = new REAL[n*n];
srand(n);
for( int i = 0; i < n; i++ ) {
for (int j = 0; j < n; j++) {
h_idata[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
}
h_idata[i*n+i] += sqrt((float)n);
}
int j=0;
for (int i=0;i<n*n;++i) {
fprintf(infile,"%15f",h_idata[i]);
if (j+1==n) {
fprintf(infile,"\n");
j=0;
} else {
++j;
}
}
fclose(infile);
}
int getMatrixDimension(FILE *infile) {
int w=0;
char c;
fseek(infile,0,SEEK_SET);
while (true) {
do {
c=fgetc(infile);
if (c == EOF || c == '\n') {
goto exitLoop;
}
} while (isspace(c));
if (isdigit(c) || c=='.' || c=='-') {
++w;
}
do {
c=fgetc(infile);
if (c == EOF || c == '\n') {
goto exitLoop;
}
} while (isdigit(c) || c=='.' || c=='-');
if (c==EOF || c == '\n') {
break;
}
}
exitLoop:
return w;
}
void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata) {
float num=0;
int i=0;
fseek(infile,0,SEEK_SET);
while (fscanf(infile,"%f",&num)==1) {
h_idata[i]=num;
++i;
}
}
void printToFile(MFILE *out, float *h_odata, int dimension) {
int count=0;
int move=0;
int num_elements=dimension*dimension;
while (num_elements>0) {
out->printf("%15f ",h_odata[move]);
++count;
++move;
if (count==dimension) {
out->printf("\n");
count=0;
}
--num_elements;
}
}

View File

@ -0,0 +1,27 @@
#ifdef DOUBLE_PRECISION
#define REAL double
#define jREAL jdouble
#define jREALArray jdoubleArray
#else
#define REAL float
#define jREAL jfloat
#define jREALArray jfloatArray
#endif
inline void __cudaSafeCall( int err, const char *file, const int line )
{
do {
if( err != 0) {
fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n",
file, line, cudaGetErrorString((cudaError_t) err) );
exit(-1);
}
} while (0);
}
#define SAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CUDACHECK {cudaError_t error = cudaGetLastError(); if(error != 0){fprintf(stderr, "Error code %d: %s file %s line %i.\n",error,cudaGetErrorString(error),__FILE__,__LINE__);}}

View File

@ -1,36 +1,137 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
/* Template project which demonstrates the basics on how to setup a project
* example application.
* Device code.
*/
#ifndef _CUDA_KERNEL_H_
#define _CUDA_KERNEL_H_
// When VERIFY is defined, the sum of squared errors is calculated between the
// identity matrix and the product A * incerse(A). For debugging...
//#define VERIFY 1
#include <stdio.h>
#include <math.h>
#include <time.h>
#include "config.h"
////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
testKernel( float* g_idata, float* g_odata)
{
void mathdispAI(const REAL *mat, int lda, int MAT_SIZE_h) {
fprintf(stderr, "\n");
int i,j;
for (j=0;j<MAT_SIZE_h;j++) {
for (i=0;i<MAT_SIZE_h*2;i++) {
fprintf(stderr, "%6.3f",mat[j*lda*2+i]);
}
fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
} // mathdisp2
void mathdispAId(const REAL * AId, int lda, int n) {
REAL * AI = new REAL[n*lda*2];
cudaMemcpy(AI,AId,sizeof(REAL)*n*lda*2,cudaMemcpyDeviceToHost);
mathdispAI(AI, lda, n);
delete [] AI;
}
#endif // #ifndef _CUDA_KERNEL_H_
__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k>i && k < n2 && AI[i*lda2+k]!=0) {
REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];
int n = n2 / 2;
for (int j = i+1; j < n; j++) {
AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
}
}
}
__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k < n2) {
AI[i*lda2+k] /= diag;
}
}
__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k > i && k < n2) {
REAL multiplyer = -AI[i*lda2+k];
for (int j = 0; j < i; j++) {
AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
}
}
}
//extern void invert(REAL * A, int n);
void invertge(REAL * AI_d, int lda, int n) {
int lda2 = lda * 2;
// perform elementary row operations till A in AI becomes identity matrix
for (int i = 0; i < n; i++) {
GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2);
CUDACHECK;
cudaThreadSynchronize();
}
for (int i = n-1; i >= 0; i--) {
REAL diag = 1.0;
SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));
GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2);
CUDACHECK;
GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2);
CUDACHECK;
cudaThreadSynchronize();
CUDACHECK;
}
} // invertge
/* inverts nxn matrix A and stores result back in A */
void invert(REAL * A, int n) {
fprintf(stderr,"starting inversion n = %d ", n);
volatile clock_t gputime, gputime0;
gputime=clock();
gputime0 = gputime;
int lda = ((n+15)&~15|16);
//lda=n;
REAL * AI = new REAL[n*lda*2];
memset(AI,0,sizeof(REAL)*n*lda*2);
for (int i = 0; i < n; i++) {
memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n);
AI[lda*i*2+n+i] = 1;
}
REAL * AI_d;
SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2));
SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice));
invertge(AI_d, lda, n);
SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost));
cudaFree(AI_d);
gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
#ifdef VERIFY
// let's verify that
REAL error=0.0;
// multiply inverse*xcopy, should be Identity matrix
for (int k = 0; k < n; k++) {
for (int j = 0; j < n; j++) {
REAL sum = 0;
for (int i = 0; i < n; i++) {
sum += AI[j*lda*2+n+i]*A[i*n+k];
}
if (j!=k) {
error += sum * sum;
} else {
error += (1.0-sum) * (1.0-sum);
}
}
}
fprintf(stderr, " %6.2f SSE", error);
#endif
for (int i = 0; i < n; i++) {
memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n);
}
free(AI);
fprintf(stderr," done!\n");
} // invert