// This file is part of BOINC. // http://boinc.berkeley.edu // Copyright (C) 2008 University of California // // BOINC is free software; you can redistribute it and/or modify it // under the terms of the GNU Lesser General Public License // as published by the Free Software Foundation, // either version 3 of the License, or (at your option) any later version. // // BOINC is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with BOINC. If not, see . // // This program serves as both // - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API // and ATIStream OpenCL API. // - A program for testing various features of BOINC. // // The program reads the input nxn matrix from the "input" file, inverts the // matrix NUM_ITERATIONS times and write to "output" file. // // command line options // -run_slow: sleep 1 second after each character // -cpu_time N: use about N CPU seconds after copying files // -early_exit: exit(10) after 30 chars // -early_crash: crash after 30 chars // // See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues. // Contributor: Tuan Le (tuanle86@berkeley.edu) #include "atiopencl.hpp" #include "boinc_opencl.h" using std::string; int main(int argc, char * argv[]) { int i, retval, lastInversion=0, checkpointExists=0, matrixSize=0; double fd; char input_path[512], output_path[512], chkpt_path[512], buf[256]; MFILE out; FILE* state, *infile; generate_random_input_file(MATRIX_SIZE); //call this if you don't want to //construct the input file manually for (i=0; i30) { exit(-10); } if (early_crash && i>30) { boinc_crash(); } if (early_sleep && i>30) { g_sleep = true; while (1) boinc_sleep(1); } if (boinc_time_to_checkpoint()) { printf("Perform checkpointing at inversion # %d\n",i); //we'll need to write the current matrix to the state file. retval = do_checkpoint(out, i, input, matrixSize); if (retval) { fprintf(stderr, "ERROR: %s APP: matrix_inversion checkpoint failed %d\n", boinc_msg_prefix(buf, sizeof(buf)), retval ); exit(retval); } boinc_checkpoint_completed(); } fd = i/NUM_ITERATIONS; if (cpu_time) fd /= 2; boinc_fraction_done(fd); } out.printf("\n\n----------------- Final inversion #%d ----------------\n\n", NUM_ITERATIONS); print_to_file(&out,output,matrixSize); retval = out.flush(); //force the output file to be closed. if (retval) { fprintf(stderr, "ERROR: %s APP: matrix_inversion flush failed %d\n", boinc_msg_prefix(buf, sizeof(buf)), retval ); exit(1); } // Releases OpenCL resources if (cleanup_cl()==1) { printf("Error from cleanup_cl() !"); return 1; } // Release host resources cleanup_host(); // burn up some CPU time if needed // if (cpu_time) { printf("\nBurning up some CPU time ... \n"); double start = dtime(); for (int i=0; ; i++) { double e = dtime()-start; if (e > cpu_time) break; fd = .5 + .5*(e/cpu_time); boinc_fraction_done(fd); if (boinc_time_to_checkpoint()) { retval = do_checkpoint(out, NUM_ITERATIONS, input, matrixSize); if (retval) { fprintf(stderr, "ERROR: %s APP: maxtrix_inversion checkpoint failed %d\n", boinc_msg_prefix(buf, sizeof(buf)), retval ); exit(1); } boinc_checkpoint_completed(); } comp_result = do_a_giga_flop(i); } } boinc_fraction_done(1); #ifdef APP_GRAPHICS update_shmem(); #endif if (boinc_is_standalone()) { printf("\nDone! Please press ENTER to exit. "); getchar(); } boinc_finish(0); } #ifdef _WIN32 int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) { LPSTR command_line; char* argv[100]; int argc; command_line = GetCommandLine(); argc = parse_command_line( command_line, argv ); return main(argc, argv); } #endif /*** BOINC FUNCTION DEFINITIONS ***/ /* Do a billion floating-point ops */ static double do_a_giga_flop(int foo) { double x = 3.14159*foo; int i; for (i=0; i<500000000; i++) { x += 5.12313123; x *= 0.5398394834; } return x; } /* Save the computation state into checkpoint file */ int do_checkpoint(MFILE& mf, int n, cl_float *input, int matrixSize) { int retval; string resolved_name; FILE* f = fopen("temp", "w"); if (!f) return 1; fprintf(f, "%d", n); //write inversion number fprintf(f, " "); fprintf(f, "%d", matrixSize); //write matrixSize fprintf(f, " "); for (int i=0;i height, the matrix is * non-invertible. */ int get_matrix_size(FILE *infile) { int w=0; char c; fseek(infile,0,SEEK_SET); while (true) { do { c=fgetc(infile); if (c == EOF || c == '\n') { goto exitLoop; } } while (isspace(c)); if (isdigit(c) || c=='.' || c=='-') { ++w; } do { c=fgetc(infile); if (c == EOF || c == '\n') { goto exitLoop; } } while (isdigit(c) || c=='.' || c=='-'); if (c==EOF || c == '\n') { break; } } exitLoop: return w; } /* * \brief Host Initialization * Allocate and initialize memory * on the host. Print input array. */ int initialize_host(FILE *infile) { input = NULL; output = NULL; if (width!=height) { fprintf(stderr, "Error: non nxn matrix cannot be invertiable.\n"); return 1; } ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// cl_uint sizeInBytes = width * height * sizeof(cl_float); input = (cl_float *) malloc(sizeInBytes); if (input == NULL) { fprintf(stderr, "Error: Failed to allocate input memory on host\n"); return 1; } output = (cl_float *) malloc(sizeInBytes); if(output == NULL) { fprintf(stderr, "Error: Failed to allocate output memory on host\n"); return 1; } //fillRandom(input,width,height); fetch_elements_into_host_memory(infile,input); return 0; } /* * Read the float values from input file into "input" array. */ void fetch_elements_into_host_memory(FILE *infile, cl_float *input) { float num=0; int i=0; if (!isStateFileInUse) { fseek(infile,0,SEEK_SET); } while (fscanf(infile,"%f",&num)==1) { input[i]=num; ++i; } } /* * Converts the contents of a file into a string */ char * convert_to_string(const char *fileName) { int count=0; char *s; char c; int i=0; // look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or // in "boinc/samples/atiopencl/release". Note that "atiopencl_kernels.cl" // is automatically copied to these directories along the building process. FILE *infile=fopen(fileName,"r"); if (!infile) { //not found. This typically happens on Linux or Mac. //look for "atiopencl_kernels.cl" in "boinc/sample/atiopencl/" instead. infile = fopen(KERNELS_FILEPATH,"r"); if (!infile) { fprintf(stderr, "File open Error!"); exit(0); } } fseek(infile,0,SEEK_SET); while (fgetc(infile)!=EOF) count++; s=(char *) malloc(sizeof(char)*(count+1)); //add 1 for string terminator. fseek(infile,0,SEEK_SET); while ((c=fgetc(infile))!=EOF) { s[i++]=c; } s[i]='\0'; fclose(infile); return s; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Load CL file, compile, link CL source * Build program and kernel objects */ // Note: OpenCL memory buffer objects will be created in invert // function before kernel calls are made. int initialize_cl(void) { cl_int status = 0; size_t deviceListSize; bool standalone = false; int retval; devices = NULL; localThreads[0] = LOCAL_WORK_SIZE; globalThreads[0] = GLOBAL_WORK_SIZE; cl_platform_id platform = NULL; cl_device_id device; if (boinc_is_standalone()) { /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Platforms. (clGetPlatformsIDs) returned %d\n", status ); return 1; } if (numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*numPlatforms); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Platform Ids. (clGetPlatformsIDs) returned %d\n", status ); return 1; } for (unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Platform Info.(clGetPlatformInfo)returned %d\n", status ); return 1; } platform = platforms[i]; if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } if(NULL == platform) { fprintf(stderr, "ERROR: NULL platform found so Exiting Application."); return 1; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Creating Context. (clCreateContextFromType) returned %d\n", status ); return 1; } /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Context Info (device list size, clGetContextInfo)returned %d\n", status ); return 1; } ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// devices = (cl_device_id *)malloc(deviceListSize); if (devices == 0) { fprintf(stderr, "Error: No devices found.\n"); return 1; } /* Now, get the device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Context Info (device list, clGetContextInfo) returned %d\n", status ); return 1; } device = devices[0]; } else { // NOT stand_alone retval = boinc_get_opencl_ids(&device, &platform); if (retval) { fprintf(stderr, "Error: boinc_get_opencl_ids() failed with error %d\n", retval ); return 1; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContext(cps, 1, &device, NULL, NULL, &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clCreateContext() returned %d\n", status); return 1; } } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// commandQueue = clCreateCommandQueue(context, device, 0, &status); if(status != CL_SUCCESS) { fprintf(stderr, "Error: Creating Command Queue. (clCreateCommandQueue) returned %d\n", status ); return 1; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// source = convert_to_string(KERNELS_FILENAME); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Loading Binary into cl_program (clCreateProgramWithBinary) returned %d\n", status ); return 1; } /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Building Program (clBuildProgram) returned %d\n", status ); return 1; } /* get a kernel object handle for a kernel with the given name */ GEStep1A_kernel = clCreateKernel(program, "GEStep1A", &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clCreateKernel (GEStep1A) returned %d\n", status ); return 1; } GEStep2_kernel = clCreateKernel(program, "GEStep2", &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clCreateKernel (GEStep2) returned %d\n", status ); return 1; } GEStep3_kernel = clCreateKernel(program, "GEStep3", &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clCreateKernel (GEStep3) returned %d\n", status ); return 1; } return 0; } /* * \brief Release OpenCL resources (Context, Memory etc.) */ int cleanup_cl(void) { cl_int status; status = clReleaseKernel(GEStep1A_kernel); if (status != CL_SUCCESS) { fprintf(stderr, "Error: In clReleaseKernel (GEStep1A_kernel) returned %d\n", status ); return 1; } status = clReleaseKernel(GEStep2_kernel); if (status != CL_SUCCESS) { fprintf(stderr, "Error: In clReleaseKernel (GEStep2_kernel) returned %d\n", status ); return 1; } status = clReleaseKernel(GEStep3_kernel); if (status != CL_SUCCESS) { fprintf(stderr, "Error: In clReleaseKernel (GEStep3_kernel) returned %d\n", status ); return 1; } status = clReleaseProgram(program); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clReleaseProgram returned %d\n", status ); return 1; } status = clReleaseMemObject(inputBuffer); if (status != CL_SUCCESS) { fprintf(stderr, "Error: In clReleaseMemObject (inputBuffer) returned %d\n", status ); return 1; } status = clReleaseCommandQueue(commandQueue); if (status != CL_SUCCESS) { fprintf(stderr, "Error: In clReleaseCommandQueue returned %d\n", status ); return 1; } status = clReleaseContext(context); if (status != CL_SUCCESS) { fprintf(stderr, "Error: In clReleaseContext returned %d\n", status ); return 1; } return 0; } /* * \brief Releases program's resources */ void cleanup_host(void) { if (input != NULL) { free(input); input = NULL; } if (output != NULL) { free(output); output = NULL; } if (devices != NULL) { free(devices); devices = NULL; } if (source != NULL) { free((char *)source); source = NULL; } } /* * Write the result to output file */ void print_to_file(MFILE *out, float *h_odata, int n) { int count=0; int move=0; int num_elements=n*n; while (num_elements>0) { out->printf("%15f ",h_odata[move]); ++count; ++move; if (count==n) { out->printf("\n"); count=0; } --num_elements; } } /* * \brief Run OpenCL program * * Bind host variables to kernel arguments * Run the CL kernel */ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) { cl_int status; cl_event events[2]; /* * the input array to the kernel. This array will eventually be modified * to the inverted array. */ status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (input) returned %d\n", status ); return 1; } /*i*/ status = clSetKernelArg(GEStep1A_kernel, 1, sizeof(int), (void *)&i); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (i) returned %d\n", status ); return 1; } /*n2*/ status = clSetKernelArg(GEStep1A_kernel, 2, sizeof(int), (void *)&n2); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (n2) returned %d\n", status ); return 1; } /*lda2*/ status = clSetKernelArg(GEStep1A_kernel, 3, sizeof(int), (void *)&lda2); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (lda2) returned %d\n", status ); return 1; } /* * Enqueue a kernel run call. */ status = clEnqueueNDRangeKernel(commandQueue, GEStep1A_kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n", status ); return 1; } /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n", status ); return 1; } status = clReleaseEvent(events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Release event object. (clReleaseEvent) returned %d\n", status ); return 1; } /* Enqueue readBuffer*/ //Note: we are reading back from inputBuffer since AI is modified directly in kernel status = clEnqueueReadBuffer(commandQueue, inputBuffer, CL_TRUE, 0, globalThreads[0] * sizeof(cl_float), AI, 0, NULL, &events[1]); if(status != CL_SUCCESS) { fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status ); return 1; } /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n", status ); return 1; } status = clReleaseEvent(events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Release event object. (clReleaseEvent) returned %d\n", status ); return 1; } return 0; } int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) { cl_int status; cl_event events[2]; /* * the input array to the kernel. This array will eventually be modified * to the inverted array. */ status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (AI) returned %d\n", status ); return 1; } /*diag*/ status = clSetKernelArg(GEStep2_kernel, 1, sizeof(cl_float), (void *)&diag); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (diag) returned %d\n", status ); return 1; } /*i*/ status = clSetKernelArg(GEStep2_kernel, 2, sizeof(int), (void *)&i); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (i) returned %d\n", status ); return 1; } /*n2*/ status = clSetKernelArg(GEStep2_kernel, 3, sizeof(int), (void *)&n2); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (n2) returned %d\n", status ); return 1; } /*lda2*/ status = clSetKernelArg(GEStep2_kernel, 4, sizeof(int), (void *)&lda2); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (lda2) returned %d\n", status ); return 1; } /* * Enqueue a kernel run call. */ status = clEnqueueNDRangeKernel(commandQueue, GEStep2_kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n", status ); return 1; } /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n", status ); return 1; } status = clReleaseEvent(events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Release event object. (clReleaseEvent) returned %d\n", status ); return 1; } /* Enqueue readBuffer*/ //Note: we are reading back from inputBuffer since AI is modified directly in kernel status = clEnqueueReadBuffer(commandQueue, inputBuffer, CL_TRUE, 0, globalThreads[0] * sizeof(cl_float), AI, 0, NULL, &events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status); return 1; } /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n", status ); return 1; } status = clReleaseEvent(events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Release event object. (clReleaseEvent) returned %d\n", status ); return 1; } return 0; } int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) { cl_int status; cl_event events[2]; /* * The input array to the kernel. This array will eventually be modified * to the inverted array. */ status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (input) returned %d\n", status ); return 1; } /*i*/ status = clSetKernelArg(GEStep3_kernel, 1, sizeof(int), (void *)&i); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (i) returned %d\n", status ); return 1; } /*n2*/ status = clSetKernelArg(GEStep3_kernel, 2, sizeof(int), (void *)&n2); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (n2) returned %d\n", status ); return 1; } /*lda2*/ status = clSetKernelArg(GEStep3_kernel, 3, sizeof(int), (void *)&lda2); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Setting kernel argument. (lda2) returned %d\n", status ); return 1; } /* * Enqueue a kernel run call. */ status = clEnqueueNDRangeKernel(commandQueue, GEStep3_kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n", status ); return 1; } /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n", status ); return 1; } status = clReleaseEvent(events[0]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Release event object. (clReleaseEvent) returned %d\n", status ); return 1; } /* Enqueue readBuffer*/ //Note: we are reading back from inputBuffer since AI is modified directly in kernel status = clEnqueueReadBuffer(commandQueue, inputBuffer, CL_TRUE, 0, globalThreads[0] * sizeof(cl_float), AI, 0, NULL, &events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status ); return 1; } /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if (status != CL_SUCCESS) { fprintf(stderr, "Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n", status ); return 1; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { fprintf(stderr, "Error: Release event object. (clReleaseEvent) returned %d\n", status ); return 1; } return 0; } void invertge(cl_float * AI_d, int lda, int n) { int lda2 = lda * 2; // perform elementary row operations till A in AI becomes identity matrix for (int i = 0; i < n; i++) { // execute kernel run_GEStep1A_kernel(AI_d,i,n*2, lda2); } for (int i = n-1; i >= 0; i--) { cl_float diag = 1.0; diag=AI_d[i*lda2+i]; // execute kernels run_GEStep2_kernel(AI_d,diag,i,n*2, lda2); run_GEStep3_kernel(AI_d,i,n*2, lda2); } } /* inverts nxn matrix input and stores the result in output */ void invert(cl_float * input, cl_float *output, int n) { printf("starting inversion n = %d ", n); volatile clock_t gputime; gputime=clock(); int lda = ((n+15)&~15|16); cl_float * AI_d = (cl_float *)malloc(sizeof(cl_float)*n*lda*2); memset(AI_d,0,sizeof(cl_float)*n*lda*2); for (int i = 0; i < n; i++) { memcpy(&AI_d[lda*i*2], &input[n*i], sizeof(cl_float)*n); AI_d[lda*i*2+n+i] = 1; } cl_int status; ///////////////////////////////////////////////////////////////// // Create OpenCL memory buffer ///////////////////////////////////////////////////////////////// inputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * globalThreads[0], AI_d, &status); if (status != CL_SUCCESS) { fprintf(stderr, "Error: clCreateBuffer (inputBuffer) returned %d\n", status ); exit(0); } // Note: there's no output buffer. In kernel, AI_d is modified directly. // Thus, we should read the result back to host from inputBuffer as well. invertge(AI_d, lda, n); gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f); fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime); #ifdef VERIFY // let's verify that cl_float error=0.0; // multiply inverse*xcopy, should be Identity matrix for (int k = 0; k < n; k++) { for (int j = 0; j < n; j++) { cl_float sum = 0; for (int i = 0; i < n; i++) { sum += AI[j*lda*2+n+i]*A[i*n+k]; } if (j!=k) { error += sum * sum; } else { error += (1.0-sum) * (1.0-sum); } } } fprintf(stderr, " %6.2f SSE", error); #endif //copy the result to output for (int i = 0; i < n; i++) { memcpy(&output[n*i], &AI_d[lda*i*2+n], sizeof(cl_float)*n); } free(AI_d); fprintf(stderr," done!\n"); }