// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
//
// This program serves as both
// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API
// and ATIStream OpenCL API.
// - A program for testing various features of BOINC.
//
// The program reads the input nxn matrix from the "input" file, inverts the
// matrix NUM_ITERATIONS times and writes the result to the "output" file.
//
// command line options
// -run_slow: sleep 1 second after each character
// -cpu_time N: use about N CPU seconds after copying files
// -early_exit: exit(10) after 30 chars
// -early_crash: crash after 30 chars
//
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
// Contributor: Tuan Le (tuanle86@berkeley.edu)
#include "atiopencl.hpp"
#include "boinc_opencl.h"
using std::string;
// Program entry point.
// NOTE(review): part of this function's body is not readable in this copy
// (see the truncated loop header below). The visible remainder handles the
// early-exit/crash/sleep test switches, periodic checkpointing, progress
// reporting, writing the final result, releasing OpenCL and host resources,
// and an optional CPU-burning phase.
int main(int argc, char * argv[]) {
int i, retval, lastInversion=0, checkpointExists=0, matrixSize=0;
double fd;
char input_path[512], output_path[512], chkpt_path[512], buf[256];
MFILE out;
FILE* state, *infile;
generate_random_input_file(MATRIX_SIZE); //call this if you don't want to
//construct the input file manually
// NOTE(review): the next line appears truncated (everything between a '<'
// and a later '>' seems to have been lost); restore it from the upstream
// source before building.
for (i=0; i30) {
exit(-10);
}
// Test switch: deliberately crash once i exceeds 30.
if (early_crash && i>30) {
boinc_crash();
}
// Test switch: once i exceeds 30, set g_sleep and sleep forever in
// one-second steps.
if (early_sleep && i>30) {
g_sleep = true;
while (1) boinc_sleep(1);
}
if (boinc_time_to_checkpoint()) {
printf("Perform checkpointing at inversion # %d\n",i);
//we'll need to write the current matrix to the state file.
retval = do_checkpoint(out, i, input, matrixSize);
if (retval) {
fprintf(stderr,
"ERROR: %s APP: matrix_inversion checkpoint failed %d\n",
boinc_msg_prefix(buf, sizeof(buf)), retval
);
exit(retval);
}
boinc_checkpoint_completed();
}
// NOTE(review): i is an int; if NUM_ITERATIONS is an integer constant this
// is an integer division and fd stays 0 for every i < NUM_ITERATIONS - confirm.
fd = i/NUM_ITERATIONS;
// When a CPU burn is requested the inversion phase only covers the first
// half of the progress range; the burn loop below reports .5 to 1.
if (cpu_time) fd /= 2;
boinc_fraction_done(fd);
}
// Append the final result matrix to the output file.
out.printf("\n\n----------------- Final inversion #%d ----------------\n\n",
NUM_ITERATIONS);
print_to_file(&out,output,matrixSize);
retval = out.flush(); //force the output file to be closed.
if (retval) {
fprintf(stderr,
"ERROR: %s APP: matrix_inversion flush failed %d\n",
boinc_msg_prefix(buf, sizeof(buf)), retval
);
exit(1);
}
// Releases OpenCL resources
if (cleanup_cl()==1) {
printf("Error from cleanup_cl() !");
return 1;
}
// Release host resources
cleanup_host();
// burn up some CPU time if needed
//
if (cpu_time) {
printf("\nBurning up some CPU time ... \n");
double start = dtime();
// Loop until the elapsed time (as measured by dtime()) exceeds cpu_time.
for (int i=0; ; i++) {
double e = dtime()-start;
if (e > cpu_time) break;
// Progress for this phase runs from .5 up to 1.
fd = .5 + .5*(e/cpu_time);
boinc_fraction_done(fd);
if (boinc_time_to_checkpoint()) {
// All inversions are finished here, so the checkpoint records
// NUM_ITERATIONS as the inversion number.
retval = do_checkpoint(out, NUM_ITERATIONS, input, matrixSize);
if (retval) {
fprintf(stderr,
"ERROR: %s APP: maxtrix_inversion checkpoint failed %d\n",
boinc_msg_prefix(buf, sizeof(buf)), retval
);
exit(1);
}
boinc_checkpoint_completed();
}
comp_result = do_a_giga_flop(i);
}
}
boinc_fraction_done(1);
#ifdef APP_GRAPHICS
update_shmem();
#endif
// In standalone mode, wait for the user before exiting.
if (boinc_is_standalone()) {
printf("\nDone! Please press ENTER to exit. ");
getchar();
}
boinc_finish(0);
}
#ifdef _WIN32
/*
 * Windows entry point. Hands the process command line to
 * parse_command_line(), uses its return value as argc and the array it
 * fills as argv, then delegates to main().
 */
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
    char* argVector[100];
    LPSTR rawCommandLine = GetCommandLine();
    int argCount = parse_command_line(rawCommandLine, argVector);
    return main(argCount, argVector);
}
#endif
/*** BOINC FUNCTION DEFINITIONS ***/
/*
 * Do a billion floating-point ops: 500,000,000 iterations, each performing
 * one addition and one multiplication on a running value.
 *
 * foo scales the starting value; the final running value is returned.
 */
static double do_a_giga_flop(int foo) {
    double acc = 3.14159*foo;
    int remaining = 500000000;
    while (remaining > 0) {
        acc += 5.12313123;
        acc *= 0.5398394834;
        --remaining;
    }
    return acc;
}
/* Save the computation state into checkpoint file */
int do_checkpoint(MFILE& mf, int n, cl_float *input, int matrixSize) {
int retval;
string resolved_name;
FILE* f = fopen("temp", "w");
if (!f) return 1;
fprintf(f, "%d", n); //write inversion number
fprintf(f, " ");
fprintf(f, "%d", matrixSize); //write matrixSize
fprintf(f, " ");
for (int i=0;i height, the matrix is
* non-invertible.
*/
/*
 * Count the numeric tokens on the first line of infile.
 *
 * The file is rewound first. A token is counted when, after skipping
 * whitespace, the next character is a digit, '.' or '-'; the rest of the
 * token (digits, '.' and '-') is then consumed. Counting stops at the first
 * newline or at end of file.
 *
 * Returns the number of tokens counted, or 0 if infile is NULL.
 */
int get_matrix_size(FILE *infile) {
    int w = 0;
    // Must be int, not char: fgetc() returns either an unsigned char value
    // or EOF, and a char cannot represent both reliably. It also keeps the
    // arguments to isspace()/isdigit() in their valid range.
    int c;

    if (infile == NULL) {
        return 0;
    }
    fseek(infile, 0, SEEK_SET);

    while (true) {
        // Skip whitespace in front of the next token.
        do {
            c = fgetc(infile);
            if (c == EOF || c == '\n') {
                return w;
            }
        } while (isspace(c));

        // Only characters that can start a number count as a column.
        if (isdigit(c) || c == '.' || c == '-') {
            ++w;
        }

        // Consume the remainder of the token.
        do {
            c = fgetc(infile);
            if (c == EOF || c == '\n') {
                return w;
            }
        } while (isdigit(c) || c == '.' || c == '-');
    }
}
/*
* \brief Host Initialization
* Allocate and initialize memory
* on the host. Print input array.
*/
int initialize_host(FILE *infile) {
input = NULL;
output = NULL;
if (width!=height) {
fprintf(stderr, "Error: non nxn matrix cannot be invertiable.\n");
return 1;
}
/////////////////////////////////////////////////////////////////
// Allocate and initialize memory used by host
/////////////////////////////////////////////////////////////////
cl_uint sizeInBytes = width * height * sizeof(cl_float);
input = (cl_float *) malloc(sizeInBytes);
if (input == NULL) {
fprintf(stderr, "Error: Failed to allocate input memory on host\n");
return 1;
}
output = (cl_float *) malloc(sizeInBytes);
if(output == NULL) {
fprintf(stderr, "Error: Failed to allocate output memory on host\n");
return 1;
}
//fillRandom(input,width,height);
fetch_elements_into_host_memory(infile,input);
return 0;
}
/*
* Read the float values from input file into "input" array.
*/
void fetch_elements_into_host_memory(FILE *infile, cl_float *input) {
float num=0;
int i=0;
if (!isStateFileInUse) {
fseek(infile,0,SEEK_SET);
}
while (fscanf(infile,"%f",&num)==1) {
input[i]=num;
++i;
}
}
/*
 * Converts the contents of a file into a string.
 *
 * Tries fileName first and falls back to KERNELS_FILEPATH if that cannot be
 * opened. Returns a malloc()ed, NUL-terminated copy of the file contents;
 * the caller owns it and releases it with free().
 *
 * If neither file can be opened, or memory cannot be allocated, a message
 * is written to stderr and the process exits with a non-zero status, so the
 * failure is not reported as success.
 */
char * convert_to_string(const char *fileName) {
    // Look for the kernel file under the given name first (per the original
    // note, the build copies it next to the executable in debug/release).
    FILE *infile = (fileName != NULL) ? fopen(fileName, "r") : NULL;
    if (!infile) {  // not found. This typically happens on Linux or Mac.
        // Fall back to the path inside the source tree.
        infile = fopen(KERNELS_FILEPATH, "r");
        if (!infile) {
            fprintf(stderr, "File open Error!");
            exit(1);
        }
    }

    // First pass: count the characters as fgetc() delivers them, so the
    // count matches what the second pass will read.
    size_t count = 0;
    while (fgetc(infile) != EOF) {
        count++;
    }

    char *s = (char *) malloc(count + 1);   // add 1 for string terminator.
    if (s == NULL) {
        fprintf(stderr, "Error: Failed to allocate memory for file contents\n");
        fclose(infile);
        exit(1);
    }

    // Second pass: copy. c is an int so that EOF is distinguishable from
    // every byte value, and the copy never exceeds the counted size.
    fseek(infile, 0, SEEK_SET);
    size_t i = 0;
    int c;
    while (i < count && (c = fgetc(infile)) != EOF) {
        s[i++] = (char)c;
    }
    s[i] = '\0';

    fclose(infile);
    return s;
}
/*
 * \brief OpenCL related initialization
 *        Create Context, Device list, Command Queue
 *        Load CL file, compile, link CL source
 *        Build program and kernel objects
 *
 * Standalone mode: enumerates the platforms and picks the AMD one if
 * present, otherwise the last platform enumerated; creates a context for
 * all device types on it and uses the first device of that context.
 * Otherwise: obtains the device and platform from boinc_get_opencl_ids()
 * and creates a context for that single device.
 *
 * Returns 0 on success, 1 on any failure (a message is written to stderr).
 */
// Note: OpenCL memory buffer objects will be created in invert
//       function before kernel calls are made.
int initialize_cl(void) {
    cl_int status = 0;
    size_t deviceListSize;
    int retval;

    devices = NULL;
    localThreads[0] = LOCAL_WORK_SIZE;
    globalThreads[0] = GLOBAL_WORK_SIZE;
    cl_platform_id platform = NULL;
    cl_device_id device = NULL;

    if (boinc_is_standalone()) {
        /*
         * Have a look at the available platforms and pick either
         * the AMD one if available or a reasonable default.
         */
        cl_uint numPlatforms;
        status = clGetPlatformIDs(0, NULL, &numPlatforms);
        if (status != CL_SUCCESS) {
            fprintf(stderr,
                "Error: Getting Platforms. (clGetPlatformsIDs) returned %d\n",
                status
            );
            return 1;
        }

        if (numPlatforms > 0) {
            cl_platform_id* platforms = (cl_platform_id *)
                malloc(sizeof(cl_platform_id)*numPlatforms);
            if (platforms == NULL) {
                fprintf(stderr, "Error: Failed to allocate memory for platform list\n");
                return 1;
            }
            status = clGetPlatformIDs(numPlatforms, platforms, NULL);
            if (status != CL_SUCCESS) {
                fprintf(stderr,
                    "Error: Getting Platform Ids. (clGetPlatformsIDs) returned %d\n",
                    status
                );
                free(platforms);
                return 1;
            }
            for (unsigned int i=0; i < numPlatforms; ++i) {
                char pbuff[100];
                status = clGetPlatformInfo(platforms[i],
                                           CL_PLATFORM_VENDOR,
                                           sizeof(pbuff),
                                           pbuff,
                                           NULL);
                if (status != CL_SUCCESS) {
                    fprintf(stderr,
                        "Error: Getting Platform Info.(clGetPlatformInfo)returned %d\n",
                        status
                    );
                    free(platforms);
                    return 1;
                }
                // Remember every candidate so that a non-AMD system still
                // ends up with a usable platform.
                platform = platforms[i];
                if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
                    break;
                }
            }
            // The list came from malloc(), so it must be released with
            // free(), not delete.
            free(platforms);
        }

        if (NULL == platform) {
            fprintf(stderr, "ERROR: NULL platform found so Exiting Application.");
            return 1;
        }

        /*
         * If we could find our platform, use it. Otherwise use just available platform.
         */
        cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
                                         (cl_context_properties)platform,
                                         0
                                       };

        /////////////////////////////////////////////////////////////////
        // Create an OpenCL context
        /////////////////////////////////////////////////////////////////
        context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status);
        if (status != CL_SUCCESS) {
            fprintf(stderr,
                "Error: Creating Context. (clCreateContextFromType) returned %d\n",
                status
            );
            return 1;
        }

        /* First, get the size of device list data */
        status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
        if (status != CL_SUCCESS) {
            fprintf(stderr,
                "Error: Getting Context Info (device list size, clGetContextInfo)returned %d\n",
                status
            );
            return 1;
        }

        /////////////////////////////////////////////////////////////////
        // Detect OpenCL devices
        /////////////////////////////////////////////////////////////////
        // An empty device list would make devices[0] below invalid.
        if (deviceListSize < sizeof(cl_device_id)) {
            fprintf(stderr, "Error: No devices found.\n");
            return 1;
        }
        devices = (cl_device_id *)malloc(deviceListSize);
        if (devices == 0) {
            fprintf(stderr, "Error: No devices found.\n");
            return 1;
        }

        /* Now, get the device list data */
        status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
        if (status != CL_SUCCESS) {
            fprintf(stderr,
                "Error: Getting Context Info (device list, clGetContextInfo) returned %d\n",
                status
            );
            return 1;
        }
        device = devices[0];
    } else {    // NOT stand_alone
        retval = boinc_get_opencl_ids(&device, &platform);
        if (retval) {
            fprintf(stderr,
                "Error: boinc_get_opencl_ids() failed with error %d\n",
                retval
            );
            return 1;
        }
        cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
                                         (cl_context_properties)platform,
                                         0
                                       };
        context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
        if (status != CL_SUCCESS) {
            fprintf(stderr, "Error: clCreateContext() returned %d\n", status);
            return 1;
        }
    }

    /////////////////////////////////////////////////////////////////
    // Create an OpenCL command queue
    /////////////////////////////////////////////////////////////////
    commandQueue = clCreateCommandQueue(context, device, 0, &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: Creating Command Queue. (clCreateCommandQueue) returned %d\n",
            status
        );
        return 1;
    }

    /////////////////////////////////////////////////////////////////
    // Load CL file, build CL program object, create CL kernel object
    /////////////////////////////////////////////////////////////////
    source = convert_to_string(KERNELS_FILENAME);
    size_t sourceSize[] = { strlen(source) };
    program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: Creating program from source (clCreateProgramWithSource) returned %d\n",
            status
        );
        return 1;
    }

    /*
     * Create a cl program executable for the selected device. &device is
     * used rather than the devices array because that array is only filled
     * in standalone mode; in the other path it is NULL, which is not a
     * valid device list when num_devices is 1.
     */
    status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: Building Program (clBuildProgram) returned %d\n",
            status
        );
        return 1;
    }

    /* get a kernel object handle for a kernel with the given name */
    GEStep1A_kernel = clCreateKernel(program, "GEStep1A", &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: clCreateKernel (GEStep1A) returned %d\n",
            status
        );
        return 1;
    }

    GEStep2_kernel = clCreateKernel(program, "GEStep2", &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: clCreateKernel (GEStep2) returned %d\n",
            status
        );
        return 1;
    }

    GEStep3_kernel = clCreateKernel(program, "GEStep3", &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: clCreateKernel (GEStep3) returned %d\n",
            status
        );
        return 1;
    }

    return 0;
}
/*
 * \brief Release OpenCL resources (Context, Memory etc.)
 *
 * Releases the three kernels, the program, inputBuffer, the command queue
 * and the context, in that order. Every release is attempted even if an
 * earlier one failed, so one failure no longer leaves the remaining objects
 * unreleased. Each failure is reported on stderr.
 *
 * Returns 0 if every release succeeded, 1 if at least one failed.
 */
int cleanup_cl(void) {
    cl_int status;
    int failed = 0;

    status = clReleaseKernel(GEStep1A_kernel);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: In clReleaseKernel (GEStep1A_kernel) returned %d\n",
            status
        );
        failed = 1;
    }

    status = clReleaseKernel(GEStep2_kernel);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: In clReleaseKernel (GEStep2_kernel) returned %d\n",
            status
        );
        failed = 1;
    }

    status = clReleaseKernel(GEStep3_kernel);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: In clReleaseKernel (GEStep3_kernel) returned %d\n",
            status
        );
        failed = 1;
    }

    status = clReleaseProgram(program);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: clReleaseProgram returned %d\n",
            status
        );
        failed = 1;
    }

    status = clReleaseMemObject(inputBuffer);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: In clReleaseMemObject (inputBuffer) returned %d\n",
            status
        );
        failed = 1;
    }

    status = clReleaseCommandQueue(commandQueue);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: In clReleaseCommandQueue returned %d\n",
            status
        );
        failed = 1;
    }

    status = clReleaseContext(context);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: In clReleaseContext returned %d\n",
            status
        );
        failed = 1;
    }

    return failed;
}
/*
 * \brief Releases program's resources
 *
 * Frees the global host allocations (input, output, devices, source) and
 * resets each pointer to NULL. free(NULL) is a no-op, so pointers that were
 * never allocated need no separate check.
 */
void cleanup_host(void) {
    free(input);
    input = NULL;

    free(output);
    output = NULL;

    free(devices);
    devices = NULL;

    free((char *)source);
    source = NULL;
}
/*
 * Write the result to output file.
 *
 * Prints the first n*n elements of h_odata through out, each formatted as
 * "%15f " and with a newline after every n-th element.
 */
void print_to_file(MFILE *out, float *h_odata, int n) {
    const int total = n*n;
    int column = 0;
    for (int idx = 0; idx < total; ++idx) {
        out->printf("%15f ",h_odata[idx]);
        if (++column == n) {
            // End of a row.
            out->printf("\n");
            column = 0;
        }
    }
}
/*
* \brief Run OpenCL program
*
* Bind host variables to kernel arguments
* Run the CL kernel
*/
int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
cl_int status;
cl_event events[2];
/*
* the input array to the kernel. This array will eventually be modified
* to the inverted array.
*/
status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (input) returned %d\n",
status
);
return 1;
}
/*i*/
status = clSetKernelArg(GEStep1A_kernel, 1, sizeof(int), (void *)&i);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (i) returned %d\n",
status
);
return 1;
}
/*n2*/
status = clSetKernelArg(GEStep1A_kernel, 2, sizeof(int), (void *)&n2);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (n2) returned %d\n",
status
);
return 1;
}
/*lda2*/
status = clSetKernelArg(GEStep1A_kernel, 3, sizeof(int), (void *)&lda2);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (lda2) returned %d\n",
status
);
return 1;
}
/*
* Enqueue a kernel run call.
*/
status = clEnqueueNDRangeKernel(commandQueue,
GEStep1A_kernel,
1,
NULL,
globalThreads,
localThreads,
0,
NULL,
&events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
status
);
return 1;
}
/* wait for the kernel call to finish execution */
status = clWaitForEvents(1, &events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n",
status
);
return 1;
}
status = clReleaseEvent(events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Release event object. (clReleaseEvent) returned %d\n",
status
);
return 1;
}
/* Enqueue readBuffer*/ //Note: we are reading back from inputBuffer since AI is modified directly in kernel
status = clEnqueueReadBuffer(commandQueue,
inputBuffer,
CL_TRUE,
0,
globalThreads[0] * sizeof(cl_float),
AI,
0,
NULL,
&events[1]);
if(status != CL_SUCCESS) {
fprintf(stderr,
"Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n",
status
);
return 1;
}
/* Wait for the read buffer to finish execution */
status = clWaitForEvents(1, &events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n",
status
);
return 1;
}
status = clReleaseEvent(events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Release event object. (clReleaseEvent) returned %d\n",
status
);
return 1;
}
return 0;
}
int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
cl_int status;
cl_event events[2];
/*
* the input array to the kernel. This array will eventually be modified
* to the inverted array.
*/
status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (AI) returned %d\n",
status
);
return 1;
}
/*diag*/
status = clSetKernelArg(GEStep2_kernel, 1, sizeof(cl_float), (void *)&diag);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (diag) returned %d\n",
status
);
return 1;
}
/*i*/
status = clSetKernelArg(GEStep2_kernel, 2, sizeof(int), (void *)&i);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (i) returned %d\n",
status
);
return 1;
}
/*n2*/
status = clSetKernelArg(GEStep2_kernel, 3, sizeof(int), (void *)&n2);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (n2) returned %d\n",
status
);
return 1;
}
/*lda2*/
status = clSetKernelArg(GEStep2_kernel, 4, sizeof(int), (void *)&lda2);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (lda2) returned %d\n",
status
);
return 1;
}
/*
* Enqueue a kernel run call.
*/
status = clEnqueueNDRangeKernel(commandQueue,
GEStep2_kernel,
1,
NULL,
globalThreads,
localThreads,
0,
NULL,
&events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
status
);
return 1;
}
/* wait for the kernel call to finish execution */
status = clWaitForEvents(1, &events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n",
status
);
return 1;
}
status = clReleaseEvent(events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Release event object. (clReleaseEvent) returned %d\n",
status
);
return 1;
}
/* Enqueue readBuffer*/
//Note: we are reading back from inputBuffer since AI is modified directly in kernel
status = clEnqueueReadBuffer(commandQueue,
inputBuffer,
CL_TRUE,
0,
globalThreads[0] * sizeof(cl_float),
AI,
0,
NULL,
&events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status);
return 1;
}
/* Wait for the read buffer to finish execution */
status = clWaitForEvents(1, &events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n",
status
);
return 1;
}
status = clReleaseEvent(events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Release event object. (clReleaseEvent) returned %d\n",
status
);
return 1;
}
return 0;
}
int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
cl_int status;
cl_event events[2];
/*
* The input array to the kernel. This array will eventually be modified
* to the inverted array.
*/
status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (input) returned %d\n",
status
);
return 1;
}
/*i*/
status = clSetKernelArg(GEStep3_kernel, 1, sizeof(int), (void *)&i);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (i) returned %d\n",
status
);
return 1;
}
/*n2*/
status = clSetKernelArg(GEStep3_kernel, 2, sizeof(int), (void *)&n2);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (n2) returned %d\n",
status
);
return 1;
}
/*lda2*/
status = clSetKernelArg(GEStep3_kernel, 3, sizeof(int), (void *)&lda2);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Setting kernel argument. (lda2) returned %d\n",
status
);
return 1;
}
/*
* Enqueue a kernel run call.
*/
status = clEnqueueNDRangeKernel(commandQueue,
GEStep3_kernel,
1,
NULL,
globalThreads,
localThreads,
0,
NULL,
&events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
status
);
return 1;
}
/* wait for the kernel call to finish execution */
status = clWaitForEvents(1, &events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n",
status
);
return 1;
}
status = clReleaseEvent(events[0]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Release event object. (clReleaseEvent) returned %d\n",
status
);
return 1;
}
/* Enqueue readBuffer*/
//Note: we are reading back from inputBuffer since AI is modified directly in kernel
status = clEnqueueReadBuffer(commandQueue,
inputBuffer,
CL_TRUE,
0,
globalThreads[0] * sizeof(cl_float),
AI,
0,
NULL,
&events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n",
status
);
return 1;
}
/* Wait for the read buffer to finish execution */
status = clWaitForEvents(1, &events[1]);
if (status != CL_SUCCESS) {
fprintf(stderr,
"Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n",
status
);
return 1;
}
status = clReleaseEvent(events[1]);
if(status != CL_SUCCESS) {
fprintf(stderr,
"Error: Release event object. (clReleaseEvent) returned %d\n",
status
);
return 1;
}
return 0;
}
/*
 * Drive the three elimination kernels over the work array AI_d.
 *
 * First runs GEStep1A once per row index 0..n-1, then walks the rows from
 * n-1 down to 0, reading the diagonal element of each row from AI_d and
 * running GEStep2 followed by GEStep3. Per the original note, the row
 * operations continue until A in AI becomes the identity matrix.
 *
 * The return codes of the kernel wrappers are not checked here.
 */
void invertge(cl_float * AI_d, int lda, int n) {
    const int rowStride = lda * 2;   // floats between consecutive rows
    const int rowLength = n * 2;     // n2 argument passed to every kernel

    int row = 0;
    while (row < n) {
        run_GEStep1A_kernel(AI_d, row, rowLength, rowStride);
        ++row;
    }

    for (row = n-1; row >= 0; --row) {
        cl_float pivot = AI_d[row*rowStride+row];
        run_GEStep2_kernel(AI_d, pivot, row, rowLength, rowStride);
        run_GEStep3_kernel(AI_d, row, rowLength, rowStride);
    }
}
/*
 * Inverts the nxn matrix "input" and stores the result in "output".
 *
 * Builds a zero-filled work array AI_d of n rows, each lda*2 floats wide,
 * holding a row of the input in its first n columns and the matching
 * identity row starting at column n. It wraps the array in the global
 * inputBuffer, runs invertge() on it and copies columns n..2n-1 of every
 * row to "output". Timing figures are written to stderr.
 *
 * On allocation or buffer-creation failure, a message is written to stderr
 * and the process exits with a non-zero status.
 *
 * NOTE(review): the buffer created here is not released in this function;
 * only cleanup_cl() releases inputBuffer - confirm how earlier buffers are
 * meant to be released when invert() is called repeatedly.
 */
void invert(cl_float * input, cl_float *output, int n) {
    printf("starting inversion n = %d ", n);
    volatile clock_t gputime;
    gputime=clock();

    // Row pitch: n rounded up to a multiple of 16, with the 16 bit forced
    // on (same value as the original unparenthesised expression).
    int lda = (((n+15) & ~15) | 16);
    const size_t workBytes = sizeof(cl_float)*n*lda*2;

    cl_float * AI_d = (cl_float *)malloc(workBytes);
    if (AI_d == NULL) {
        fprintf(stderr, "Error: Failed to allocate work matrix on host\n");
        exit(1);
    }
    memset(AI_d,0,workBytes);
    for (int i = 0; i < n; i++) {
        memcpy(&AI_d[lda*i*2], &input[n*i], sizeof(cl_float)*n);
        AI_d[lda*i*2+n+i] = 1;
    }

    cl_int status;

    /////////////////////////////////////////////////////////////////
    // Create OpenCL memory buffer
    /////////////////////////////////////////////////////////////////
    // NOTE(review): the buffer covers globalThreads[0] floats while AI_d
    // holds n*lda*2 floats; with CL_MEM_USE_HOST_PTR the allocation should
    // be at least as large as the buffer - confirm against GLOBAL_WORK_SIZE.
    inputBuffer = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 sizeof(cl_float) * globalThreads[0],
                                 AI_d,
                                 &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr,
            "Error: clCreateBuffer (inputBuffer) returned %d\n",
            status
        );
        free(AI_d);
        exit(1);
    }

    // Note: there's no output buffer. In kernel, AI_d is modified directly.
    // Thus, we should read the result back to host from inputBuffer as well.
    invertge(AI_d, lda, n);
    gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
    fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);

#ifdef VERIFY
    // let's verify that
    cl_float error=0.0;
    // multiply inverse*input, should be Identity matrix. The inverse lives
    // in columns n..2n-1 of AI_d; the original matrix is "input".
    for (int k = 0; k < n; k++) {
        for (int j = 0; j < n; j++) {
            cl_float sum = 0;
            for (int i = 0; i < n; i++) {
                sum += AI_d[j*lda*2+n+i]*input[i*n+k];
            }
            if (j!=k) {
                error += sum * sum;
            } else {
                error += (1.0-sum) * (1.0-sum);
            }
        }
    }
    fprintf(stderr, " %6.2f SSE", error);
#endif

    //copy the result to output
    for (int i = 0; i < n; i++) {
        memcpy(&output[n*i], &AI_d[lda*i*2+n], sizeof(cl_float)*n);
    }
    free(AI_d);
    fprintf(stderr," done!\n");
}