mirror of https://github.com/BOINC/boinc.git
- fix local work size and global work size to be compatible with GPU
svn path=/trunk/boinc/; revision=22073
This commit is contained in:
parent
f311cce3dc
commit
442f7e3f07
|
@ -16,8 +16,8 @@
|
|||
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// This program serves as both
|
||||
// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API
|
||||
// and ATIStream OpenCL API.
|
||||
// - An example BOINC-NVOpenCL application, illustrating the use of the BOINC API
|
||||
// and NVIDIA OpenCL API.
|
||||
// - A program for testing various features of BOINC.
|
||||
//
|
||||
// The program reads the input nxn matrix from the "input" file, inverts the
|
||||
|
@ -29,11 +29,10 @@
|
|||
// -early_exit: exit(10) after 30 chars
|
||||
// -early_crash: crash after 30 chars
|
||||
//
|
||||
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues
|
||||
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
|
||||
// Contributor: Tuan Le (tuanle86@berkeley.edu)
|
||||
|
||||
#include "atiopencl.hpp"
|
||||
|
||||
using std::string;
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
|
@ -139,7 +138,7 @@ int main(int argc, char * argv[]) {
|
|||
NUM_ITERATIONS);
|
||||
} else {
|
||||
out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",
|
||||
lastInversion);
|
||||
lastInversion);
|
||||
printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",
|
||||
NUM_ITERATIONS-lastInversion,lastInversion+1);
|
||||
}
|
||||
|
@ -323,6 +322,7 @@ void generate_random_input_file(int n) {
|
|||
}
|
||||
}
|
||||
fclose(infile);
|
||||
free(input);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -421,7 +421,7 @@ char * convert_to_string(const char *fileName) {
|
|||
char c;
|
||||
int i=0;
|
||||
|
||||
// look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or
|
||||
// look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or
|
||||
// in "boinc/samples/atiopencl/release". Note that "atiopencl_kernels.cl"
|
||||
// is automatically copied to these directories along the building process.
|
||||
FILE *infile=fopen(fileName,"r");
|
||||
|
@ -457,6 +457,9 @@ int initialize_cl(void) {
|
|||
cl_int status = 0;
|
||||
size_t deviceListSize;
|
||||
|
||||
localThreads[0] = LOCAL_WORK_SIZE;
|
||||
globalThreads[0] = GLOBAL_WORK_SIZE;
|
||||
|
||||
/*
|
||||
* Have a look at the available platforms and pick either
|
||||
* the AMD one if available or a reasonable default.
|
||||
|
@ -686,60 +689,6 @@ void print_to_file(MFILE *out, float *h_odata, int n) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the device is able to support the requested number of work items.
|
||||
*/
|
||||
int check_device_capability(size_t *globalThreads, size_t *localThreads) {
|
||||
cl_int status;
|
||||
cl_uint maxDims;
|
||||
size_t maxWorkGroupSize;
|
||||
size_t maxWorkItemSizes[3];
|
||||
|
||||
/**
|
||||
* Query device capabilities. Maximum
|
||||
* work item dimensions and the maximum
|
||||
* work item sizes
|
||||
*/
|
||||
status = clGetDeviceInfo(devices[0],
|
||||
CL_DEVICE_MAX_WORK_GROUP_SIZE,
|
||||
sizeof(size_t),
|
||||
(void*)&maxWorkGroupSize,
|
||||
NULL);
|
||||
if (status != CL_SUCCESS) {
|
||||
printf("Error: Getting Device Info. (clGetDeviceInfo)\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
status = clGetDeviceInfo(devices[0],
|
||||
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
|
||||
sizeof(cl_uint),
|
||||
(void*)&maxDims,
|
||||
NULL);
|
||||
if(status != CL_SUCCESS) {
|
||||
printf("Error: Getting Device Info. (clGetDeviceInfo)\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
status = clGetDeviceInfo(devices[0],
|
||||
CL_DEVICE_MAX_WORK_ITEM_SIZES,
|
||||
sizeof(size_t)*maxDims,
|
||||
(void*)maxWorkItemSizes,
|
||||
NULL);
|
||||
if (status != CL_SUCCESS) {
|
||||
printf("Error: Getting Device Info. (clGetDeviceInfo)\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
globalThreads[0] = width*height; // # of threads associated with global matrixSize.
|
||||
localThreads[0] = 1; // # of threads associated with local matrixSize.
|
||||
|
||||
/*if (globalThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize) {
|
||||
printf("Unsupported: Device does not support requested number of work items.");
|
||||
return 1;
|
||||
}*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* \brief Run OpenCL program
|
||||
*
|
||||
|
@ -748,18 +697,12 @@ int check_device_capability(size_t *globalThreads, size_t *localThreads) {
|
|||
*/
|
||||
int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
|
||||
cl_int status;
|
||||
size_t globalThreads[1]; //1 matrixSize
|
||||
size_t localThreads[1]; //1 matrixSize
|
||||
cl_event events[2];
|
||||
|
||||
if (check_device_capability(globalThreads,localThreads) == 1) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* the input array to the kernel. This array will eventually be modified
|
||||
* to the inverted array.
|
||||
*/
|
||||
* the input array to the kernel. This array will eventually be modified
|
||||
* to the inverted array.
|
||||
*/
|
||||
status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
|
||||
if (status != CL_SUCCESS) {
|
||||
printf("Error: Setting kernel argument. (input)\n");
|
||||
|
@ -822,11 +765,12 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
|
|||
inputBuffer,
|
||||
CL_TRUE,
|
||||
0,
|
||||
width * sizeof(cl_uint),
|
||||
globalThreads[0] * sizeof(cl_float),
|
||||
AI,
|
||||
0,
|
||||
NULL,
|
||||
&events[1]);
|
||||
|
||||
if(status != CL_SUCCESS) {
|
||||
printf("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
|
||||
return 1;
|
||||
|
@ -849,18 +793,12 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
|
|||
|
||||
int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
|
||||
cl_int status;
|
||||
size_t globalThreads[1]; //1 matrixSize
|
||||
size_t localThreads[1]; //1 matrixSize
|
||||
cl_event events[2];
|
||||
|
||||
if (check_device_capability(globalThreads,localThreads) == 1) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* the input array to the kernel. This array will eventually be modified
|
||||
* to the inverted array.
|
||||
*/
|
||||
* the input array to the kernel. This array will eventually be modified
|
||||
* to the inverted array.
|
||||
*/
|
||||
status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
|
||||
if (status != CL_SUCCESS) {
|
||||
printf("Error: Setting kernel argument. (AI)\n");
|
||||
|
@ -931,7 +869,7 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
|
|||
inputBuffer,
|
||||
CL_TRUE,
|
||||
0,
|
||||
width * sizeof(cl_uint),
|
||||
globalThreads[0] * sizeof(cl_float),
|
||||
AI,
|
||||
0,
|
||||
NULL,
|
||||
|
@ -958,18 +896,12 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
|
|||
|
||||
int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
|
||||
cl_int status;
|
||||
size_t globalThreads[1]; //1 matrixSize
|
||||
size_t localThreads[1]; //1 matrixSize
|
||||
cl_event events[2];
|
||||
|
||||
if (check_device_capability(globalThreads,localThreads) == 1) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* The input array to the kernel. This array will eventually be modified
|
||||
* to the inverted array.
|
||||
*/
|
||||
* The input array to the kernel. This array will eventually be modified
|
||||
* to the inverted array.
|
||||
*/
|
||||
status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
|
||||
if (status != CL_SUCCESS) {
|
||||
printf("Error: Setting kernel argument. (input)\n");
|
||||
|
@ -1033,7 +965,7 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
|
|||
inputBuffer,
|
||||
CL_TRUE,
|
||||
0,
|
||||
width * sizeof(cl_uint),
|
||||
globalThreads[0] * sizeof(cl_float),
|
||||
AI,
|
||||
0,
|
||||
NULL,
|
||||
|
@ -1079,7 +1011,7 @@ void invertge(cl_float * AI_d, int lda, int n) {
|
|||
/* inverts nxn matrix input and stores the result in output */
|
||||
void invert(cl_float * input, cl_float *output, int n) {
|
||||
fprintf(stderr,"starting inversion n = %d ", n);
|
||||
volatile clock_t gputime, gputime0;
|
||||
volatile clock_t gputime;
|
||||
gputime=clock();
|
||||
|
||||
int lda = ((n+15)&~15|16);
|
||||
|
@ -1097,7 +1029,7 @@ void invert(cl_float * input, cl_float *output, int n) {
|
|||
/////////////////////////////////////////////////////////////////
|
||||
inputBuffer = clCreateBuffer(context,
|
||||
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
|
||||
sizeof(cl_uint) * width * height,
|
||||
sizeof(cl_float) * globalThreads[0],
|
||||
AI_d,
|
||||
&status);
|
||||
if (status != CL_SUCCESS) {
|
||||
|
@ -1113,12 +1045,12 @@ void invert(cl_float * input, cl_float *output, int n) {
|
|||
|
||||
#ifdef VERIFY
|
||||
// let's verify that
|
||||
REAL error=0.0;
|
||||
cl_float error=0.0;
|
||||
|
||||
// multiply inverse*xcopy, should be Identity matrix
|
||||
for (int k = 0; k < n; k++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
REAL sum = 0;
|
||||
cl_float sum = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
sum += AI[j*lda*2+n+i]*A[i*n+k];
|
||||
}
|
||||
|
@ -1138,4 +1070,4 @@ void invert(cl_float * input, cl_float *output, int n) {
|
|||
}
|
||||
free(AI_d);
|
||||
fprintf(stderr," done!\n");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues
|
||||
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
|
||||
// Contributor: Tuan Le (tuanle86@berkeley.edu)
|
||||
|
||||
#ifndef ATIOPENCL_H_
|
||||
|
@ -40,6 +40,9 @@
|
|||
#define KERNELS_FILENAME "atiopencl_kernels.cl"
|
||||
#define KERNELS_FILEPATH "../../atiopencl_kernels.cl" // for Linux and Mac
|
||||
#define CHECKPOINT_FILE "matrix_inversion_state"
|
||||
|
||||
#define LOCAL_WORK_SIZE 1
|
||||
#define GLOBAL_WORK_SIZE 400
|
||||
#define MATRIX_SIZE 10
|
||||
#define NUM_ITERATIONS 501 // execute the kernel NUM_ITERATIONS times
|
||||
|
||||
|
@ -90,6 +93,9 @@ double cpu_time = 20, comp_result;
|
|||
bool isStateFileInUse = false;
|
||||
const char *source;
|
||||
|
||||
size_t globalThreads[1]; // 1D var for Total # of work items
|
||||
size_t localThreads[1]; // 1D var for # of work items in the work group
|
||||
|
||||
/*
|
||||
* Input data is stored here.
|
||||
*/
|
||||
|
@ -209,12 +215,6 @@ void cleanup_host(void);
|
|||
/* Write the result to output file */
|
||||
void print_to_file(MFILE *out, float *h_odata, int n);
|
||||
|
||||
/*
|
||||
* Check if the device is able to support the requested number of work items.
|
||||
*/
|
||||
int check_device_capability(size_t *globalThreads,
|
||||
size_t *localThreads);
|
||||
|
||||
/*
|
||||
* Functions used to invert the matrix. They call the kernels internally.
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue