From 442f7e3f07402f0f75443df8fc60d7b2d953b650 Mon Sep 17 00:00:00 2001 From: Tuan Le Date: Tue, 27 Jul 2010 20:18:42 +0000 Subject: [PATCH] - fix local work size and global work size to be compatible with GPU svn path=/trunk/boinc/; revision=22073 --- samples/atiopencl/atiopencl.cpp | 122 +++++++------------------------- samples/atiopencl/atiopencl.hpp | 14 ++-- 2 files changed, 34 insertions(+), 102 deletions(-) diff --git a/samples/atiopencl/atiopencl.cpp b/samples/atiopencl/atiopencl.cpp index 0cb04b76b4..ec90b13d2b 100644 --- a/samples/atiopencl/atiopencl.cpp +++ b/samples/atiopencl/atiopencl.cpp @@ -16,8 +16,8 @@ // along with BOINC. If not, see . // // This program serves as both -// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API -// and ATIStream OpenCL API. +// - An example BOINC-NVOpenCL application, illustrating the use of the BOINC API +// and NVIDIA OpenCL API. // - A program for testing various features of BOINC. // // The program reads the input nxn matrix from the "input" file, inverts the @@ -29,11 +29,10 @@ // -early_exit: exit(10) after 30 chars // -early_crash: crash after 30 chars // -// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues +// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues. // Contributor: Tuan Le (tuanle86@berkeley.edu) #include "atiopencl.hpp" - using std::string; int main(int argc, char * argv[]) { @@ -139,7 +138,7 @@ int main(int argc, char * argv[]) { NUM_ITERATIONS); } else { out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n", - lastInversion); + lastInversion); printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n", NUM_ITERATIONS-lastInversion,lastInversion+1); } @@ -323,6 +322,7 @@ void generate_random_input_file(int n) { } } fclose(infile); + free(input); } /* @@ -421,7 +421,7 @@ char * convert_to_string(const char *fileName) { char c; int i=0; - // look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or + // look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or // in "boinc/samples/atiopencl/release". Note that "atiopencl_kernels.cl" // is automatically copied to these directories along the building process. FILE *infile=fopen(fileName,"r"); @@ -457,6 +457,9 @@ int initialize_cl(void) { cl_int status = 0; size_t deviceListSize; + localThreads[0] = LOCAL_WORK_SIZE; + globalThreads[0] = GLOBAL_WORK_SIZE; + /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. @@ -686,60 +689,6 @@ void print_to_file(MFILE *out, float *h_odata, int n) { } } -/* - * Check if the device is able to support the requested number of work items. - */ -int check_device_capability(size_t *globalThreads, size_t *localThreads) { - cl_int status; - cl_uint maxDims; - size_t maxWorkGroupSize; - size_t maxWorkItemSizes[3]; - - /** - * Query device capabilities. Maximum - * work item matrixSizes and the maximmum - * work item sizes - */ - status = clGetDeviceInfo(devices[0], - CL_DEVICE_MAX_WORK_GROUP_SIZE, - sizeof(size_t), - (void*)&maxWorkGroupSize, - NULL); - if (status != CL_SUCCESS) { - printf("Error: Getting Device Info. (clGetDeviceInfo)\n"); - return 1; - } - - status = clGetDeviceInfo(devices[0], - CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, - sizeof(cl_uint), - (void*)&maxDims, - NULL); - if(status != CL_SUCCESS) { - printf("Error: Getting Device Info. (clGetDeviceInfo)\n"); - return 1; - } - - status = clGetDeviceInfo(devices[0], - CL_DEVICE_MAX_WORK_ITEM_SIZES, - sizeof(size_t)*maxDims, - (void*)maxWorkItemSizes, - NULL); - if (status != CL_SUCCESS) { - printf("Error: Getting Device Info. (clGetDeviceInfo)\n"); - return 1; - } - - globalThreads[0] = width*height; // # of threads associated with global matrixSize. - localThreads[0] = 1; // # of threads associated with local matrixSize. - - /*if (globalThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize) { - printf("Unsupported: Device does not support requested number of work items."); - return 1; - }*/ - return 0; -} - /* * \brief Run OpenCL program * @@ -748,18 +697,12 @@ int check_device_capability(size_t *globalThreads, size_t *localThreads) { */ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) { cl_int status; - size_t globalThreads[1]; //1 matrixSize - size_t localThreads[1]; //1 matrixSize cl_event events[2]; - if (check_device_capability(globalThreads,localThreads) == 1) { - return 1; - } - /* - * the input array to the kernel. This array will eventually be modified - * to the inverted array. - */ + * the input array to the kernel. This array will eventually be modified + * to the inverted array. + */ status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if (status != CL_SUCCESS) { printf("Error: Setting kernel argument. (input)\n"); @@ -822,11 +765,12 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) { inputBuffer, CL_TRUE, 0, - width * sizeof(cl_uint), + globalThreads[0] * sizeof(cl_float), AI, 0, NULL, &events[1]); + if(status != CL_SUCCESS) { printf("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n"); return 1; @@ -849,18 +793,12 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) { int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) { cl_int status; - size_t globalThreads[1]; //1 matrixSize - size_t localThreads[1]; //1 matrixSize cl_event events[2]; - if (check_device_capability(globalThreads,localThreads) == 1) { - return 1; - } - /* - * the input array to the kernel. This array will eventually be modified - * to the inverted array. - */ + * the input array to the kernel. This array will eventually be modified + * to the inverted array. + */ status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if (status != CL_SUCCESS) { printf("Error: Setting kernel argument. (AI)\n"); @@ -931,7 +869,7 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) { inputBuffer, CL_TRUE, 0, - width * sizeof(cl_uint), + globalThreads[0] * sizeof(cl_float), AI, 0, NULL, @@ -958,18 +896,12 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) { int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) { cl_int status; - size_t globalThreads[1]; //1 matrixSize - size_t localThreads[1]; //1 matrixSize cl_event events[2]; - if (check_device_capability(globalThreads,localThreads) == 1) { - return 1; - } - /* - * The input array to the kernel. This array will eventually be modified - * to the inverted array. - */ + * The input array to the kernel. This array will eventually be modified + * to the inverted array. + */ status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if (status != CL_SUCCESS) { printf("Error: Setting kernel argument. (input)\n"); @@ -1033,7 +965,7 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) { inputBuffer, CL_TRUE, 0, - width * sizeof(cl_uint), + globalThreads[0] * sizeof(cl_float), AI, 0, NULL, @@ -1079,7 +1011,7 @@ void invertge(cl_float * AI_d, int lda, int n) { /* inverts nxn matrix input and stores the result in output */ void invert(cl_float * input, cl_float *output, int n) { fprintf(stderr,"starting inversion n = %d ", n); - volatile clock_t gputime, gputime0; + volatile clock_t gputime; gputime=clock(); int lda = ((n+15)&~15|16); @@ -1097,7 +1029,7 @@ void invert(cl_float * input, cl_float *output, int n) { ///////////////////////////////////////////////////////////////// inputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, - sizeof(cl_uint) * width * height, + sizeof(cl_float) * globalThreads[0], AI_d, &status); if (status != CL_SUCCESS) { @@ -1113,12 +1045,12 @@ void invert(cl_float * input, cl_float *output, int n) { #ifdef VERIFY // let's verify that - REAL error=0.0; + cl_float error=0.0; // multiply inverse*xcopy, should be Identity matrix for (int k = 0; k < n; k++) { for (int j = 0; j < n; j++) { - REAL sum = 0; + cl_float sum = 0; for (int i = 0; i < n; i++) { sum += AI[j*lda*2+n+i]*A[i*n+k]; } @@ -1138,4 +1070,4 @@ void invert(cl_float * input, cl_float *output, int n) { } free(AI_d); fprintf(stderr," done!\n"); -} \ No newline at end of file +} diff --git a/samples/atiopencl/atiopencl.hpp b/samples/atiopencl/atiopencl.hpp index 552e46e975..6efd3d7ced 100644 --- a/samples/atiopencl/atiopencl.hpp +++ b/samples/atiopencl/atiopencl.hpp @@ -15,7 +15,7 @@ // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . // -// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues +// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues. // Contributor: Tuan Le (tuanle86@berkeley.edu) #ifndef ATIOPENCL_H_ @@ -40,6 +40,9 @@ #define KERNELS_FILENAME "atiopencl_kernels.cl" #define KERNELS_FILEPATH "../../atiopencl_kernels.cl" // for Linux and Mac #define CHECKPOINT_FILE "matrix_inversion_state" + +#define LOCAL_WORK_SIZE 1 +#define GLOBAL_WORK_SIZE 400 #define MATRIX_SIZE 10 #define NUM_ITERATIONS 501 // execute the kernel NUM_ITERATIONS times @@ -90,6 +93,9 @@ double cpu_time = 20, comp_result; bool isStateFileInUse = false; const char *source; +size_t globalThreads[1]; // 1D var for Total # of work items +size_t localThreads[1]; // 1D var for # of work items in the work group + /* * Input data is stored here. */ @@ -209,12 +215,6 @@ void cleanup_host(void); /* Write the result to output file */ void print_to_file(MFILE *out, float *h_odata, int n); -/* - * Check if the device is able to support the requested number of work items. - */ -int check_device_capability(size_t *globalThreads, - size_t *localThreads); - /* * Functions used to inverst matrix. Call kernels inside. */