From 442f7e3f07402f0f75443df8fc60d7b2d953b650 Mon Sep 17 00:00:00 2001
From: Tuan Le <tuanle86@berkeley.edu>
Date: Tue, 27 Jul 2010 20:18:42 +0000
Subject: [PATCH] Fix local and global work sizes to be compatible with the
 GPU

svn path=/trunk/boinc/; revision=22073
---
 samples/atiopencl/atiopencl.cpp | 122 +++++++-------------------------
 samples/atiopencl/atiopencl.hpp |  14 ++--
 2 files changed, 34 insertions(+), 102 deletions(-)

diff --git a/samples/atiopencl/atiopencl.cpp b/samples/atiopencl/atiopencl.cpp
index 0cb04b76b4..ec90b13d2b 100644
--- a/samples/atiopencl/atiopencl.cpp
+++ b/samples/atiopencl/atiopencl.cpp
@@ -16,8 +16,8 @@
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
 //
 // This program serves as both
-// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API
-//   and ATIStream OpenCL API.
+// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API
+//   and the ATI Stream OpenCL API.
 // - A program for testing various features of BOINC.
 //
 // The program reads the input nxn matrix from the "input" file, inverts the
@@ -29,11 +29,10 @@
 // -early_exit: exit(10) after 30 chars
 // -early_crash: crash after 30 chars
 //
-// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues
+// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
 // Contributor: Tuan Le (tuanle86@berkeley.edu)
 
 #include "atiopencl.hpp"
-
 using std::string;
 
 int main(int argc, char * argv[]) {
@@ -139,7 +138,7 @@ int main(int argc, char * argv[]) {
                NUM_ITERATIONS);
     } else {
         out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",
-			       lastInversion);
+                   lastInversion);
         printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",
                NUM_ITERATIONS-lastInversion,lastInversion+1);
     }
@@ -323,6 +322,7 @@ void generate_random_input_file(int n) {
         }
     }
     fclose(infile);
+    free(input);
 }
 
 /*
@@ -421,7 +421,7 @@ char * convert_to_string(const char *fileName) {
     char c;
     int i=0;
 
-	// look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or
+    // look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or
     // in "boinc/samples/atiopencl/release". Note that "atiopencl_kernels.cl"
     // is automatically copied to these directories along the building process.
     FILE *infile=fopen(fileName,"r");
@@ -457,6 +457,9 @@ int initialize_cl(void) {
     cl_int status = 0;
     size_t deviceListSize;
 
+    localThreads[0]  = LOCAL_WORK_SIZE;
+    globalThreads[0] = GLOBAL_WORK_SIZE;
+
     /*
      * Have a look at the available platforms and pick either
      * the AMD one if available or a reasonable default.
@@ -686,60 +689,6 @@ void print_to_file(MFILE *out, float *h_odata, int n) {
     }
 }
 
-/*
- * Check if the device is able to support the requested number of work items.
- */
-int check_device_capability(size_t *globalThreads, size_t *localThreads) {
-    cl_int   status;
-    cl_uint maxDims;
-    size_t maxWorkGroupSize;
-    size_t maxWorkItemSizes[3];
-
-    /**
-    * Query device capabilities. Maximum 
-    * work item matrixSizes and the maximmum
-    * work item sizes
-    */ 
-    status = clGetDeviceInfo(devices[0],
-                             CL_DEVICE_MAX_WORK_GROUP_SIZE,
-                             sizeof(size_t),
-                             (void*)&maxWorkGroupSize,
-                             NULL);
-    if (status != CL_SUCCESS) {  
-        printf("Error: Getting Device Info. (clGetDeviceInfo)\n");
-        return 1;
-    }
-
-    status = clGetDeviceInfo(devices[0],
-                             CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
-                             sizeof(cl_uint),
-                             (void*)&maxDims,
-                             NULL);
-    if(status != CL_SUCCESS) {  
-        printf("Error: Getting Device Info. (clGetDeviceInfo)\n");
-        return 1;
-    }
-
-    status = clGetDeviceInfo(devices[0],
-                             CL_DEVICE_MAX_WORK_ITEM_SIZES,
-                             sizeof(size_t)*maxDims,
-                             (void*)maxWorkItemSizes,
-                             NULL);
-    if (status != CL_SUCCESS) {  
-        printf("Error: Getting Device Info. (clGetDeviceInfo)\n");
-        return 1;
-    }
-
-    globalThreads[0] = width*height; // # of threads associated with global matrixSize.
-    localThreads[0]  = 1;  // # of threads associated with local matrixSize.
-
-    /*if (globalThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize) {
-        printf("Unsupported: Device does not support requested number of work items.");
-        return 1;
-    }*/
-    return 0;
-}
-
 /*
  * \brief Run OpenCL program 
  *		  
@@ -748,18 +697,12 @@ int check_device_capability(size_t *globalThreads, size_t *localThreads) {
  */
 int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
     cl_int status;
-    size_t globalThreads[1]; //1 matrixSize
-    size_t localThreads[1]; //1 matrixSize
     cl_event events[2];
 
-    if (check_device_capability(globalThreads,localThreads) == 1) {
-        return 1;
-    }
-
     /* 
-	 * the input array to the kernel. This array will eventually be modified
-	 * to the inverted array.
-	 */
+     * the input array to the kernel. This array will eventually be modified
+     * to the inverted array.
+     */
     status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
     if (status != CL_SUCCESS) { 
         printf("Error: Setting kernel argument. (input)\n");
@@ -822,11 +765,12 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
                                  inputBuffer,
                                  CL_TRUE,
                                  0,
-                                 width * sizeof(cl_uint),
+                                 globalThreads[0] * sizeof(cl_float),
                                  AI,
                                  0,
                                  NULL,
                                  &events[1]);
+
     if(status != CL_SUCCESS) { 
         printf("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
         return 1;
@@ -849,18 +793,12 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
 
 int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
     cl_int status;
-    size_t globalThreads[1]; //1 matrixSize
-    size_t localThreads[1]; //1 matrixSize
     cl_event events[2];
 
-    if (check_device_capability(globalThreads,localThreads) == 1) {
-        return 1;
-    }
-
     /* 
-	 * the input array to the kernel. This array will eventually be modified 
-	 * to the inverted array.  
-	 */
+     * the input array to the kernel. This array will eventually be modified 
+     * to the inverted array.  
+     */
     status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
     if (status != CL_SUCCESS) { 
         printf("Error: Setting kernel argument. (AI)\n");
@@ -931,7 +869,7 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
                                  inputBuffer,
                                  CL_TRUE,
                                  0,
-                                 width * sizeof(cl_uint),
+                                 globalThreads[0] * sizeof(cl_float),
                                  AI,
                                  0,
                                  NULL,
@@ -958,18 +896,12 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
 
 int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
     cl_int status;
-    size_t globalThreads[1]; //1 matrixSize
-    size_t localThreads[1]; //1 matrixSize
     cl_event events[2];
 
-    if (check_device_capability(globalThreads,localThreads) == 1) {
-        return 1;
-    }
-
     /* 
-	 * The input array to the kernel. This array will eventually be modified
-	 * to the inverted array.
-	 */
+     * The input array to the kernel. This array will eventually be modified
+     * to the inverted array.
+     */
     status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
     if (status != CL_SUCCESS) { 
         printf("Error: Setting kernel argument. (input)\n");
@@ -1033,7 +965,7 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
                                  inputBuffer,
                                  CL_TRUE,
                                  0,
-                                 width * sizeof(cl_uint),
+                                 globalThreads[0] * sizeof(cl_float),
                                  AI,
                                  0,
                                  NULL,
@@ -1079,7 +1011,7 @@ void invertge(cl_float * AI_d, int lda, int n) {
 /* inverts nxn matrix input and stores the result in output */
 void invert(cl_float * input, cl_float *output, int n) {
     fprintf(stderr,"starting inversion n = %d ", n);
-    volatile clock_t gputime, gputime0;
+    volatile clock_t gputime;
     gputime=clock();
 
     int lda = ((n+15)&~15|16);
@@ -1097,7 +1029,7 @@ void invert(cl_float * input, cl_float *output, int n) {
     /////////////////////////////////////////////////////////////////
     inputBuffer = clCreateBuffer(context,
                                  CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
-                                 sizeof(cl_uint) * width * height,
+                                 sizeof(cl_float) * globalThreads[0],
                                  AI_d,
                                  &status);
     if (status != CL_SUCCESS) { 
@@ -1113,12 +1045,12 @@ void invert(cl_float * input, cl_float *output, int n) {
 
 #ifdef VERIFY	
     // let's verify that
-    REAL error=0.0;
+    cl_float error=0.0;
 
     // multiply inverse*xcopy, should be Identity matrix
     for (int k = 0; k < n; k++) {
         for (int j = 0; j < n; j++) {
-            REAL sum = 0;
+            cl_float sum = 0;
             for (int i = 0; i < n; i++) {
                 sum += AI[j*lda*2+n+i]*A[i*n+k];
 	        }
@@ -1138,4 +1070,4 @@ void invert(cl_float * input, cl_float *output, int n) {
     }
     free(AI_d);
     fprintf(stderr," done!\n");
-}
\ No newline at end of file
+}
diff --git a/samples/atiopencl/atiopencl.hpp b/samples/atiopencl/atiopencl.hpp
index 552e46e975..6efd3d7ced 100644
--- a/samples/atiopencl/atiopencl.hpp
+++ b/samples/atiopencl/atiopencl.hpp
@@ -15,7 +15,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
 //
-// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues
+// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
 // Contributor: Tuan Le (tuanle86@berkeley.edu)
 
 #ifndef ATIOPENCL_H_
@@ -40,6 +40,9 @@
 #define KERNELS_FILENAME "atiopencl_kernels.cl"
 #define KERNELS_FILEPATH "../../atiopencl_kernels.cl" // for Linux and Mac
 #define CHECKPOINT_FILE "matrix_inversion_state"
+
+#define LOCAL_WORK_SIZE 1    // work items per work group (localThreads[0])
+#define GLOBAL_WORK_SIZE 400 // total work items (globalThreads[0])
 #define MATRIX_SIZE 10
 #define NUM_ITERATIONS 501 // execute the kernel NUM_ITERATIONS times
 
@@ -90,6 +93,9 @@ double cpu_time = 20, comp_result;
 bool isStateFileInUse = false;
 const char *source;
 
+size_t globalThreads[1]; // total number of work items (1-D); set in initialize_cl()
+size_t localThreads[1];  // work items per work group (1-D); set in initialize_cl()
+
 /*
  * Input data is stored here.
  */
@@ -209,12 +215,6 @@ void cleanup_host(void);
 /* Write the result to output file */
 void print_to_file(MFILE *out, float *h_odata, int n);
 
-/*
- * Check if the device is able to support the requested number of work items.
- */
-int check_device_capability(size_t *globalThreads,
-                            size_t *localThreads);
-
 /*
  *	Functions used to inverst matrix. Call kernels inside.
  */