From 637793aeea354622a1a445237aa5265e0d66011c Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 10 Jun 2010 17:44:32 +0000 Subject: [PATCH] svn path=/trunk/boinc/; revision=21726 --- samples/nvcuda/cuda.cu | 460 ++++++++++++++++++++++++++++++++-- samples/nvcuda/cuda_config.h | 27 ++ samples/nvcuda/cuda_kernel.cu | 161 +++++++++--- 3 files changed, 592 insertions(+), 56 deletions(-) create mode 100644 samples/nvcuda/cuda_config.h diff --git a/samples/nvcuda/cuda.cu b/samples/nvcuda/cuda.cu index 9cda2419e4..2ed6683838 100644 --- a/samples/nvcuda/cuda.cu +++ b/samples/nvcuda/cuda.cu @@ -1,37 +1,445 @@ /* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* Template project which demonstrates the basics on how to setup a project -* example application. -* Host code. -*/ +/* + * cuda.cu + * Copyright (C) 2010 Tuan Le + * tuanle86@berkeley.edu + */ -// includes, system -#include -#include -#include -#include +#ifdef _WIN32 +#include "boinc_win.h" +#else +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#endif -// includes, kernels +#include +#include +#include "cuda_config.h" #include "cuda_kernel.cu" -//////////////////////////////////////////////////////////////////////////////// -// Program main -//////////////////////////////////////////////////////////////////////////////// -int -main( int argc, char** argv) -{ +#include "str_util.h" +#include "util.h" +#include "filesys.h" +#include "boinc_api.h" +#include "mfile.h" +#include "graphics2.h" + +#ifdef APP_GRAPHICS +#include "uc2.h" +UC_SHMEM* shmem; +#endif + +using std::string; + +#define CHECKPOINT_FILE "matrix_inversion_state" +#define INPUT_FILENAME "input" +#define OUTPUT_FILENAME "output" +#define MATRIX_SIZE 10 + + +// execute the kernel NUM_ITERATIONS times +#define NUM_ITERATIONS 19 + +bool run_slow = false; +bool early_exit = false; +bool early_crash = false; +bool early_sleep = false; +double cpu_time = 20, comp_result; + +// do a billion floating-point ops +// (note: I needed to add an arg to this; +// otherwise the MS C++ compiler optimizes away +// all but the first call to it!) +// +static double do_a_giga_flop(int foo) { + double x = 3.14159*foo; + int i; + for (i=0; i<500000000; i++) { + x += 5.12313123; + x *= 0.5398394834; + } + return x; } +int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) { + int retval; + string resolved_name; + + FILE* f = fopen("temp", "w"); + if (!f) return 1; + fprintf(f, "%d", n); //write inversion number + fprintf(f, " "); + fprintf(f, "%d", dimension); //write dimension + fprintf(f, " "); + for (int i=0;iupdate_time = dtime(); + + // Check whether a graphics app is running, + // and don't bother updating shmem if so. + // This doesn't matter here, + // but may be worth doing if updating shmem is expensive. + // + if (shmem->countdown > 0) { + // the graphics app sets this to 5 every time it renders a frame + shmem->countdown--; + } else { + return; + } + shmem->fraction_done = boinc_get_fraction_done(); + shmem->cpu_time = boinc_worker_thread_cpu_time();; + boinc_get_status(&shmem->status); +} +#endif + +void print(REAL *A, int n); //on commandline +void generateRandomInputFile(int n); +int getMatrixDimension(FILE *infile); +int countElementsInMatrix(FILE *infile); +void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata); +void printToFile(MFILE *out, float *h_odata, int dimension); + +int main(int argc, char** argv) +{ + int i, retval, lastInversion=0, checkpointExists=0, dimension=0; + double fd; + char input_path[512], output_path[512], chkpt_path[512]; + REAL* h_idata; + unsigned int mem_size; + MFILE out; + FILE* state, *infile; + + //generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually + + for (i=0; i30) { + exit(-10); + } + + if (early_crash && i>30) { + boinc_crash(); + } + if (early_sleep && i>30) { + g_sleep = true; + while (1) boinc_sleep(1); + } + + if (boinc_time_to_checkpoint()) { + //if (i==7) { + printf("Perform checkpointing at inversion # %d\n",i); + + //we'll need to write the current matrix to the state file. + retval = do_checkpoint(out, i, h_idata, dimension); + if (retval) { + fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n", + boinc_msg_prefix(), retval + ); + exit(retval); + } + boinc_checkpoint_completed(); + } + + fd = i/NUM_ITERATIONS; + if (cpu_time) fd /= 2; + boinc_fraction_done(fd); + + } + + out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS); + printToFile(&out,h_idata,dimension); + + cudaFreeHost( h_idata ); + retval = out.flush(); //force the output file to be closed. + if (retval) { + fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n", + boinc_msg_prefix(), retval + ); + exit(1); + } + + // burn up some CPU time if needed + // + if (cpu_time) { + double start = dtime(); + for (int i=0; ; i++) { + double e = dtime()-start; + if (e > cpu_time) break; + fd = .5 + .5*(e/cpu_time); + boinc_fraction_done(fd); + + if (boinc_time_to_checkpoint()) { + retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension); + if (retval) { + fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n", + boinc_msg_prefix(), retval + ); + exit(1); + } + boinc_checkpoint_completed(); + } + comp_result = do_a_giga_flop(i); + } + } + + boinc_fraction_done(1); +#ifdef APP_GRAPHICS + update_shmem(); +#endif + + printf("Done!"); + getchar(); + boinc_finish(0); +} + +#ifdef _WIN32 +int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) { + LPSTR command_line; + char* argv[100]; + int argc; + + command_line = GetCommandLine(); + argc = parse_command_line( command_line, argv ); + return main(argc, argv); +} +#endif + +void print(REAL *h_idata, int n) { + int j=0; + for (int i=0;i0) { + out->printf("%15f ",h_odata[move]); + ++count; + ++move; + if (count==dimension) { + out->printf("\n"); + count=0; + } + --num_elements; + } +} \ No newline at end of file diff --git a/samples/nvcuda/cuda_config.h b/samples/nvcuda/cuda_config.h new file mode 100644 index 0000000000..ce1603686b --- /dev/null +++ b/samples/nvcuda/cuda_config.h @@ -0,0 +1,27 @@ + +#ifdef DOUBLE_PRECISION +#define REAL double +#define jREAL jdouble +#define jREALArray jdoubleArray +#else +#define REAL float +#define jREAL jfloat +#define jREALArray jfloatArray +#endif + + + +inline void __cudaSafeCall( int err, const char *file, const int line ) +{ + do { + if( err != 0) { + fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n", + file, line, cudaGetErrorString((cudaError_t) err) ); + exit(-1); + } + } while (0); +} + +#define SAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__) +#define CUDACHECK {cudaError_t error = cudaGetLastError(); if(error != 0){fprintf(stderr, "Error code %d: %s file %s line %i.\n",error,cudaGetErrorString(error),__FILE__,__LINE__);}} + diff --git a/samples/nvcuda/cuda_kernel.cu b/samples/nvcuda/cuda_kernel.cu index cef8c6cafe..f9714ef5ba 100644 --- a/samples/nvcuda/cuda_kernel.cu +++ b/samples/nvcuda/cuda_kernel.cu @@ -1,36 +1,137 @@ -/* - * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. - * - * NVIDIA Corporation and its licensors retain all intellectual property and - * proprietary rights in and to this software and related documentation. - * Any use, reproduction, disclosure, or distribution of this software - * and related documentation without an express license agreement from - * NVIDIA Corporation is strictly prohibited. - * - * Please refer to the applicable NVIDIA end user license agreement (EULA) - * associated with this source code for terms and conditions that govern - * your use of this NVIDIA software. - * - */ -/* Template project which demonstrates the basics on how to setup a project - * example application. - * Device code. - */ - -#ifndef _CUDA_KERNEL_H_ -#define _CUDA_KERNEL_H_ +// When VERIFY is defined, the sum of squared errors is calculated between the +// identity matrix and the product A * incerse(A). For debugging... +//#define VERIFY 1 #include +#include +#include +#include "config.h" -//////////////////////////////////////////////////////////////////////////////// -//! Simple test kernel for device functionality -//! @param g_idata input data in global memory -//! @param g_odata output data in global memory -//////////////////////////////////////////////////////////////////////////////// -__global__ void -testKernel( float* g_idata, float* g_odata) -{ +void mathdispAI(const REAL *mat, int lda, int MAT_SIZE_h) { + fprintf(stderr, "\n"); + int i,j; + for (j=0;ji && k < n2 && AI[i*lda2+k]!=0) { + REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i]; + int n = n2 / 2; + for (int j = i+1; j < n; j++) { + AI[j*lda2+k] += multiplyer*AI[j*lda2+i]; + } + } +} + +__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k < n2) { + AI[i*lda2+k] /= diag; + } +} + +__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k > i && k < n2) { + REAL multiplyer = -AI[i*lda2+k]; + for (int j = 0; j < i; j++) { + AI[j*lda2+k] += multiplyer*AI[j*lda2+i]; + } + } +} + +//extern void invert(REAL * A, int n); +void invertge(REAL * AI_d, int lda, int n) { + int lda2 = lda * 2; + // perform elementary row operations till A in AI becomes identity matrix + for (int i = 0; i < n; i++) { + GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2); + CUDACHECK; + cudaThreadSynchronize(); + } + + + for (int i = n-1; i >= 0; i--) { + REAL diag = 1.0; + SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost)); + GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2); + CUDACHECK; + + GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2); + CUDACHECK; + cudaThreadSynchronize(); + CUDACHECK; + } +} // invertge + + +/* inverts nxn matrix A and stores result back in A */ +void invert(REAL * A, int n) { +fprintf(stderr,"starting inversion n = %d ", n); + volatile clock_t gputime, gputime0; + gputime=clock(); + gputime0 = gputime; + + int lda = ((n+15)&~15|16); +//lda=n; + REAL * AI = new REAL[n*lda*2]; + memset(AI,0,sizeof(REAL)*n*lda*2); + for (int i = 0; i < n; i++) { + memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n); + AI[lda*i*2+n+i] = 1; + } + + REAL * AI_d; + SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2)); + SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice)); + + invertge(AI_d, lda, n); + SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost)); + cudaFree(AI_d); + + + gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f); + fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime); +#ifdef VERIFY + // let's verify that + REAL error=0.0; + + // multiply inverse*xcopy, should be Identity matrix + for (int k = 0; k < n; k++) { + for (int j = 0; j < n; j++) { + REAL sum = 0; + for (int i = 0; i < n; i++) { + sum += AI[j*lda*2+n+i]*A[i*n+k]; + } + if (j!=k) { + error += sum * sum; + } else { + error += (1.0-sum) * (1.0-sum); + } + } + } + fprintf(stderr, " %6.2f SSE", error); +#endif + + for (int i = 0; i < n; i++) { + memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n); + } + free(AI); + fprintf(stderr," done!\n"); +} // invert \ No newline at end of file