diff --git a/samples/openclapp/INIT_DATA test files/ATI-AMD/init_data.xml b/samples/openclapp/INIT_DATA test files/ATI-AMD/init_data.xml
new file mode 100644
index 0000000000..0c4320fc2d
--- /dev/null
+++ b/samples/openclapp/INIT_DATA test files/ATI-AMD/init_data.xml	
@@ -0,0 +1,4 @@
+<app_init_data>
+<gpu_type>ATI</gpu_type>
+<gpu_device_num>0</gpu_device_num>
+</app_init_data>
diff --git a/samples/openclapp/INIT_DATA test files/INTEL_GPU/init_data.xml b/samples/openclapp/INIT_DATA test files/INTEL_GPU/init_data.xml
new file mode 100644
index 0000000000..34bdc9c241
--- /dev/null
+++ b/samples/openclapp/INIT_DATA test files/INTEL_GPU/init_data.xml	
@@ -0,0 +1,4 @@
+<app_init_data>
+<gpu_type>intel_gpu</gpu_type>
+<gpu_device_num>0</gpu_device_num>
+</app_init_data>
diff --git a/samples/openclapp/INIT_DATA test files/NVIDIA/init_data.xml b/samples/openclapp/INIT_DATA test files/NVIDIA/init_data.xml
new file mode 100644
index 0000000000..ac3bb1bb7c
--- /dev/null
+++ b/samples/openclapp/INIT_DATA test files/NVIDIA/init_data.xml	
@@ -0,0 +1,4 @@
+<app_init_data>
+<gpu_type>NVIDIA</gpu_type>
+<gpu_device_num>0</gpu_device_num>
+</app_init_data>
diff --git a/samples/openclapp/Makefile_AMD b/samples/openclapp/Makefile_AMD
new file mode 100644
index 0000000000..8bbe87f0f3
--- /dev/null
+++ b/samples/openclapp/Makefile_AMD
@@ -0,0 +1,44 @@
+# makefile for openclapp BOINC example appication using AMD GPU on Linux.
+
+BOINC_DIR = ../..
+BOINC_API_DIR = $(BOINC_DIR)/api
+BOINC_LIB_DIR = $(BOINC_DIR)/lib 
+
+CXXFLAGS = -g \
+    -I$(BOINC_DIR) \
+    -I$(BOINC_LIB_DIR) \
+    -I$(BOINC_API_DIR) \
+    -I$(AMDAPPSDKROOT)/include \
+    -L. \
+    -L$(BOINC_LIB_DIR) \
+    -L$(BOINC_API_DIR) \
+    -L$(AMDAPPSDKROOT)/lib/x86 \
+    -L/lib/i386-linux-gnu
+## The above 2 lines work around an Ubuntu LD_LIBRARY_PATH bug described in: 
+## <https://help.ubuntu.com/community/EnvironmentVariables#File-location_related_variables>
+
+PROGS = openclapp \
+
+all: $(PROGS)
+
+libstdc++.a:
+	ln -s `g++ -print-file-name=libstdc++.a`
+
+clean:
+	/bin/rm -f $(PROGS) *.o libstdc++.a
+
+distclean:
+	/bin/rm -f $(PROGS) *.o libstdc++.a
+
+install: openclapp
+
+openclapp: openclapp.o boinc_opencl.o libstdc++.a
+	$(CXX) $(CXXFLAGS) -o openclapp openclapp.o boinc_opencl.o \
+        libstdc++.a -lOpenCL -lboinc_api -lboinc -lpthread
+
+openclapp.o: openclapp.cpp openclapp.hpp
+	$(CXX) $(CXXFLAGS) -c openclapp.cpp
+
+boinc_opencl.o: $(BOINC_API_DIR)/boinc_opencl.cpp $(BOINC_API_DIR)/boinc_opencl.h
+	$(CXX) $(CXXFLAGS) -c $(BOINC_API_DIR)/boinc_opencl.cpp
+
diff --git a/samples/openclapp/Makefile_NVIDIA b/samples/openclapp/Makefile_NVIDIA
new file mode 100644
index 0000000000..66ea2c6253
--- /dev/null
+++ b/samples/openclapp/Makefile_NVIDIA
@@ -0,0 +1,44 @@
+# makefile for openclapp BOINC example appication using NVIDIA GPU on Linux.
+
+BOINC_DIR = ../..
+BOINC_API_DIR = $(BOINC_DIR)/api
+BOINC_LIB_DIR = $(BOINC_DIR)/lib 
+
+CXXFLAGS = -g \
+    -I$(BOINC_DIR) \
+    -I$(BOINC_LIB_DIR) \
+    -I$(BOINC_API_DIR) \
+    -I/usr/local/cuda/include \
+    -L$(BOINC_LIB_DIR) \
+    -L$(BOINC_API_DIR) \
+    -L. \
+    -L/usr/local/cuda/lib \
+    -L/lib/i386-linux-gnu
+## The above 2 lines work around an Ubuntu LD_LIBRARY_PATH bug described in: 
+## <https://help.ubuntu.com/community/EnvironmentVariables#File-location_related_variables>
+
+PROGS = openclapp \
+
+all: $(PROGS)
+
+libstdc++.a:
+	ln -s `g++ -print-file-name=libstdc++.a`
+
+clean:
+	/bin/rm -f $(PROGS) *.o libstdc++.a
+
+distclean:
+	/bin/rm -f $(PROGS) *.o libstdc++.a
+
+install: openclapp
+
+openclapp: openclapp.o boinc_opencl.o libstdc++.a
+	$(CXX) $(CXXFLAGS) -o openclapp openclapp.o boinc_opencl.o \
+        libstdc++.a -lOpenCL -lboinc_api -lboinc -lpthread
+
+openclapp.o: openclapp.cpp openclapp.hpp
+	$(CXX) $(CXXFLAGS) -c openclapp.cpp
+
+boinc_opencl.o: $(BOINC_API_DIR)/boinc_opencl.cpp $(BOINC_API_DIR)/boinc_opencl.h
+	$(CXX) $(CXXFLAGS) -c $(BOINC_API_DIR)/boinc_opencl.cpp
+
diff --git a/samples/openclapp/Makefile_mac b/samples/openclapp/Makefile_mac
index a677fb272d..f352b110b8 100644
--- a/samples/openclapp/Makefile_mac
+++ b/samples/openclapp/Makefile_mac
@@ -1,9 +1,10 @@
-# makefile for atiopencl BOINC example appication on Mac OS X 10.7
+# makefile for openclapp BOINC example appication on Mac OS X 10.7
 # To build:
-#   cd to the boinc/samples/atiopencl directory
+#   cd to the boinc/samples/openclapp directory
 #   make -f Makefile_mac [clean] all
 #
 
+
 BOINC_DIR = ../..
 BOINC_API_DIR = $(BOINC_DIR)/api
 BOINC_LIB_DIR = $(BOINC_DIR)/lib 
@@ -14,10 +15,9 @@ CXXFLAGS = -g \
     -I$(BOINC_DIR) \
     -I$(BOINC_LIB_DIR) \
     -I$(BOINC_API_DIR) \
-    -I$(BOINC_MAC_CONFIG_DIR) \
-    -L.
+    -I$(BOINC_MAC_CONFIG_DIR)
 
-PROGS = atiopencl \
+PROGS = openclapp \
 
 all: $(PROGS)
 
@@ -27,17 +27,17 @@ clean:
 distclean:
 	/bin/rm -f $(PROGS) *.o
 
-install: atiopencl
+install: openclapp
 
-atiopencl: atiopencl.o boinc_opencl.o
-	$(CXX) $(CXXFLAGS) -o atiopencl atiopencl.o \
+openclapp: openclapp.o boinc_opencl.o
+	$(CXX) $(CXXFLAGS) -o openclapp openclapp.o \
         boinc_opencl.o \
         -framework OpenCL \
         -lboinc_api -L$(BOINC_BUILD_DIR) \
         -lboinc -L$(BOINC_BUILD_DIR)
 
-atiopencl.o: atiopencl.cpp atiopencl.hpp
-	$(CXX) $(CXXFLAGS) -c atiopencl.cpp 
+openclapp.o: openclapp.cpp openclapp.hpp
+	$(CXX) $(CXXFLAGS) -c openclapp.cpp 
 
 boinc_opencl.o: $(BOINC_API_DIR)/boinc_opencl.cpp $(BOINC_API_DIR)/boinc_opencl.h
 	$(CXX) $(CXXFLAGS) -c $(BOINC_API_DIR)/boinc_opencl.cpp
diff --git a/samples/openclapp/ReadMe.txt b/samples/openclapp/ReadMe.txt
new file mode 100644
index 0000000000..29e89ce6ea
--- /dev/null
+++ b/samples/openclapp/ReadMe.txt
@@ -0,0 +1,56 @@
+Windows projects.
+
+To build for Mac:
+make -f Makefile_mac
+
+To build for Linux (assuming you have installed the appropriate GPU computing SDK):
+For ATI/AMD:
+make -f Makefile_AMD
+
+For NVIDIA:
+make -f Makefile_NVIDIA
+
+For intel Ivy Bridge: modify one of the above make files for the appropriate paths to the OpenCL headers and libraries.
+
+Adjust the -I and -L arguments for Linux if the OpenCL headers and libraries are in non-standard locations.
+
+To run:
+This same sample is designed to run with AMD, NVIDIA and Intel Ivy Bridge GPUs.  It is supplied with 3 minimal init_data.xml files, one for each of these 3 vendors (GPU "types".)  Copy the appropriate init_data.xml file into the directory containing the openclapp executable.  Then run from the Terminal:
+$ cd to/the/directory/containing/executable/and/init_data.xml/file
+$ ./openclapp [options]
+
+command line options
+ -run_slow: sleep 1 second after each character
+ -cpu_time N: use about N CPU seconds after copying files
+ -early_exit: exit(10) after 30 iterations
+ -early_crash: crash after 30 iterations
+ -early_sleep: go into infinite sleep after 30 iterations
+
+==============================================================
+
+Important notes about the sample code:
+
+Since a computer can have multiple GPUs, the application must use the GPU assigned by the BOINC client.  To do this, it must call the following API:
+int boinc_get_opencl_ids(
+    int argc, char** argv, int type,
+    cl_device_id* device, cl_platform_id* platform
+);
+
+The arguments are as follows:
+argc, argv: the argv and argc received by the application's main() from the BOINC client.
+type: may be PROC_TYPE_NVIDIA_GPU, PROC_TYPE_AMD_GPU or PROC_TYPE_INTEL_GPU.
+device: a pointer to the variable to receive the cl_device_id of the desired GPU.
+platform: a pointer to the variable to receive the cl_platform_id of the desired GPU.
+
+Currently, BOINC expects projects to provide separate production applications for each GPU vendor (GPU type), with a separate "plan class" for each.  BOINC currently supports GPUs from the three major vendors: AMD (ATI). NVIDIA or Intel (Ivy Bridge or later).  BOINC refers to the vendors as gpu "types."  
+
+Because older clients do not write the <gpu_type> field into the init_data.xml file, your application must pass the appropriate GPU type as the third argument in the boinc_get_opencl_ids() call, or it will not be compatible with older clients.  
+
+However, to avoid redundancy, this one sample is designed to work with OpenCl-capable GPUs from any of the three vendors.  To accomplish this, it does not pass a valid type in the boinc_get_opencl_ids() call; it requires init_data.xml file to have a valid <gpu_type> field, and so would not be compatible with older clients.  This shortcut is not acceptable for production OpenCL applications; you _must_ pass in a type of either PROC_TYPE_NVIDIA_GPU, PROC_TYPE_AMD_GPU or PROC_TYPE_INTEL_GPU.
+
+==============================================================
+
+What is the difference between a GPU's gpu_device_num and its gpu_opencl_dev_index?
+
+In most cases, they are identical.  But on Macs which have CUDA installed, Mac OpenCL does not always recognize all NVIDIA GPUs recognized by CUDA.  In that case, the gpu_device_num  is the device's position among all the CUDA-capable GPUs, and the gpu_opencl_dev_index is the device's position among all the OpenCL-capable GPUs.
+
diff --git a/samples/openclapp/openclapp.cpp b/samples/openclapp/openclapp.cpp
index faee54c6ac..b283e354e7 100644
--- a/samples/openclapp/openclapp.cpp
+++ b/samples/openclapp/openclapp.cpp
@@ -1,6 +1,6 @@
 // This file is part of BOINC.
 // http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2013 University of California
 //
 // BOINC is free software; you can redistribute it and/or modify it
 // under the terms of the GNU Lesser General Public License
@@ -16,23 +16,30 @@
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
 //
 // This program serves as both
-// - An example BOINC-ATIOpenCL application, illustrating the use of the BOINC API
-//   and ATIStream OpenCL API.
+// - An example BOINC-OpenCL application, illustrating the use of the BOINC API
+//   and OpenCL API.
 // - A program for testing various features of BOINC.
 //
 // The program reads the input nxn matrix from the "input" file, inverts the
 // matrix NUM_ITERATIONS times and write to "output" file.
 //
+// To run, place the executable in the same directory as an init_data.xml
+// file specifying the gpu_type (vendor) and gpu_device_num, then invoke
+// form the command line as follows:
+// $ cd to/the/directory/containing/executable/and/init_data.mxl/file
+// $ ./openclapp [options]
+//
 // command line options
 // -run_slow: sleep 1 second after each character
 // -cpu_time N: use about N CPU seconds after copying files
-// -early_exit: exit(10) after 30 chars
-// -early_crash: crash after 30 chars
+// -early_exit: exit(10) after 30 iterations
+// -early_crash: crash after 30 iterations
+// -early_sleep: go into infinite sleep after 30 iterations
 //
 // See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
-// Contributor: Tuan Le (tuanle86@berkeley.edu)
+// Original contributor: Tuan Le (tuanle86@berkeley.edu)
 
-#include "atiopencl.hpp"
+#include "openclapp.hpp"
 #include "boinc_opencl.h"
 
 using std::string;
@@ -46,7 +53,6 @@ int main(int argc, char * argv[]) {
 
     generate_random_input_file(MATRIX_SIZE); //call this if you don't want to
                                              //construct the input file manually
-    
 	for (i=0; i<argc; i++) {
         if (!strcmp(argv[i], "-early_exit")) early_exit = true;
         if (!strcmp(argv[i], "-early_crash")) early_crash = true;
@@ -61,7 +67,8 @@ int main(int argc, char * argv[]) {
     if (retval) {
         fprintf(stderr, 
             "ERROR: %s boinc_init returned %d\n",
-            boinc_msg_prefix(buf, sizeof(buf)), retval );
+            boinc_msg_prefix(buf, sizeof(buf)), retval
+        );
         exit(retval);
     }
     
@@ -122,7 +129,7 @@ int main(int argc, char * argv[]) {
     shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
     if (!shmem) {
         fprintf(stderr, 
-            "%s failed to create shared mem segment\n",
+            "ERROR: %s failed to create shared mem segment\n",
             boinc_msg_prefix(buf, sizeof(buf))
         );
     }
@@ -150,7 +157,7 @@ int main(int argc, char * argv[]) {
     }
 
     // Initialize OpenCL resources
-    if (initialize_cl()==1) {
+    if (initialize_cl(argc, argv) != 0) {
         return 1;
     }
 
@@ -213,7 +220,7 @@ int main(int argc, char * argv[]) {
 
     // Releases OpenCL resources 
     if (cleanup_cl()==1) {
-        printf("Error from cleanup_cl() !");
+        fprintf(stderr, "Error from cleanup_cl() !");
         return 1;
     }
 
@@ -432,15 +439,15 @@ char * convert_to_string(const char *fileName) {
     char c;
     int i=0;
 
-    // look for "atiopencl_kernels.cl" in "boinc/samples/atiopencl/debug" or
-    // in "boinc/samples/atiopencl/release". Note that "atiopencl_kernels.cl"
+    // look for "openclapp_kernels.cl" in "boinc/samples/openclapp/debug" or
+    // in "boinc/samples/openclapp/release". Note that "openclapp_kernels.cl"
     // is automatically copied to these directories along the building process.
     FILE *infile=fopen(fileName,"r");
     if (!infile) { //not found. This typically happens on Linux or Mac.
-        //look for "atiopencl_kernels.cl" in "boinc/sample/atiopencl/" instead.
+        //look for "openclapp_kernels.cl" in "boinc/sample/openclapp/" instead.
         infile = fopen(KERNELS_FILEPATH,"r"); 
         if (!infile) {
-            fprintf(stderr, "File open Error!");
+            fprintf(stderr, "ERROR: Failed to open file %s!", fileName);
             exit(0);
         }
     }
@@ -465,148 +472,49 @@ char * convert_to_string(const char *fileName) {
 
  // Note: OpenCL memory buffer objects will be created in invert
  //       function before kernel calls are made.
-int initialize_cl(void) {
+int initialize_cl(int argc, char * argv[]) {
     cl_int status = 0;
-    size_t deviceListSize;
-    bool standalone = false;
-    int retval;
-
-    devices = NULL;
+    int retval = 0;
 
     localThreads[0]  = LOCAL_WORK_SIZE;
     globalThreads[0] = GLOBAL_WORK_SIZE;
     cl_platform_id platform = NULL;
     cl_device_id device;
 
-    if (boinc_is_standalone()) {
-        /*
-         * Have a look at the available platforms and pick either
-         * the AMD one if available or a reasonable default.
-         */
+    // IMPORTANT NOTE: production applications should always specify
+    // the GPU type (vendor) in the call to boinc_get_opencl_ids as
+    // the third argument: it must be either PROC_TYPE_NVIDIA_GPU,
+    // PROC_TYPE_AMD_GPU or PROC_TYPE_INTEL_GPU.  This is to support
+    // older versions of the BOINC client which do not include the 
+    // <gpu-type> field in the init_data.xml file.  
+    //
+    // This sample passes -1 for the type argument to allow using
+    // just one sample for any GPU vendor (AMD, NVIDIA or Intel.)  
+    // As a result, the init_data.xml file for this sample must 
+    // specify the GPU type (vendor) and either gpu_device_num (the 
+    // GPU's index from that vendor) or gpu_opencl_dev_index (the 
+    // GPU's index among OpenCL-capable devices from that vendor.)
+    //
+    // See the ReadMe file for more details, including an explanation
+    // of the difference between the gpu_device_num and the
+    // gpu_opencl_dev_index.
+    retval = boinc_get_opencl_ids(argc, argv, -1, &device, &platform);
+    if (retval) {
+        fprintf(stderr, 
+            "Error: boinc_get_opencl_ids() failed with error %d\n", 
+            retval
+        );
+        return 1;
+    }
 
-        cl_uint numPlatforms;
-        status = clGetPlatformIDs(0, NULL, &numPlatforms);
-        if(status != CL_SUCCESS) {
-            fprintf(stderr, 
-                "Error: Getting Platforms. (clGetPlatformsIDs) returned %d\n", 
-                status
-            );
-            return 1;
-        }
-        
-        if (numPlatforms > 0) {
-            cl_platform_id* platforms = (cl_platform_id *)
-                                            malloc(sizeof(cl_platform_id)*numPlatforms);
-            status = clGetPlatformIDs(numPlatforms, platforms, NULL);
-            if (status != CL_SUCCESS) {
-                fprintf(stderr, 
-                    "Error: Getting Platform Ids. (clGetPlatformsIDs) returned %d\n", 
-                    status
-                );
-
-                return 1;
-            }
-            for (unsigned int i=0; i < numPlatforms; ++i) {
-                char pbuff[100];
-                status = clGetPlatformInfo(platforms[i],
-                                           CL_PLATFORM_VENDOR,
-                                           sizeof(pbuff),
-                                           pbuff,
-                                           NULL);
-                if (status != CL_SUCCESS) {
-                    fprintf(stderr, 
-                        "Error: Getting Platform Info.(clGetPlatformInfo)returned %d\n", 
-                        status
-                    );
-
-                    return 1;
-                }
-                platform = platforms[i];
-                if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
-                    break;
-                }
-            }
-            delete platforms;
-        }
-
-        if(NULL == platform) {
-            fprintf(stderr, "ERROR: NULL platform found so Exiting Application.");
-            return 1;
-        }
-
-        /*
-         * If we could find our platform, use it. Otherwise use just available platform.
-         */
-        cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
-                                         (cl_context_properties)platform,
-                                         0 
-                                       };
-
-        /////////////////////////////////////////////////////////////////
-        // Create an OpenCL context
-        /////////////////////////////////////////////////////////////////
-        context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status);
-        if (status != CL_SUCCESS) {  
-            fprintf(stderr, 
-                "Error: Creating Context. (clCreateContextFromType) returned %d\n", 
-                status
-            );
-
-            return 1; 
-        }
-
-        /* First, get the size of device list data */
-        status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
-        if (status != CL_SUCCESS) {  
-            fprintf(stderr, 
-                "Error: Getting Context Info (device list size, clGetContextInfo)returned %d\n", 
-                status
-            );
-
-            return 1;
-        }
-
-        /////////////////////////////////////////////////////////////////
-        // Detect OpenCL devices
-        /////////////////////////////////////////////////////////////////
-        devices = (cl_device_id *)malloc(deviceListSize);
-        if (devices == 0) {
-            fprintf(stderr, "Error: No devices found.\n");
-            return 1;
-        }
-
-        /* Now, get the device list data */
-        status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
-        if (status != CL_SUCCESS) { 
-            fprintf(stderr, 
-                "Error: Getting Context Info (device list, clGetContextInfo) returned %d\n", 
-                status
-            );
-
-            return 1;
-        }
-        
-        device = devices[0];
-
-    } else {    // NOT stand_alone
-        retval = boinc_get_opencl_ids(&device, &platform);
-        if (retval) {
-            fprintf(stderr, 
-                "Error: boinc_get_opencl_ids() failed with error %d\n", 
-                retval
-            );
-            return 1;
-        }
-        
-       cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
-                                         (cl_context_properties)platform,
-                                         0 
-                                       };
-        context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
-        if (status != CL_SUCCESS) {  
-            fprintf(stderr, "Error: clCreateContext() returned %d\n", status);
-            return 1; 
-        }
+   cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
+                                     (cl_context_properties)platform,
+                                     0 
+                                   };
+    context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
+    if (status != CL_SUCCESS) {
+        fprintf(stderr, "Error: clCreateContext() returned %d\n", status);
+        return 1; 
     }
 
     /////////////////////////////////////////////////////////////////
@@ -618,7 +526,6 @@ int initialize_cl(void) {
             "Error: Creating Command Queue. (clCreateCommandQueue) returned %d\n", 
             status
         );
-
         return 1;
     }
     
@@ -638,7 +545,7 @@ int initialize_cl(void) {
     }
 
     /* create a cl program executable for all the devices specified */
-    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
+    status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
     if (status != CL_SUCCESS)  {
         fprintf(stderr, 
             "Error: Building Program (clBuildProgram) returned %d\n", 
@@ -724,15 +631,6 @@ int cleanup_cl(void) {
         return 1; 
     }
 
-    status = clReleaseMemObject(inputBuffer);
-    if (status != CL_SUCCESS) {
-        fprintf(stderr, 
-            "Error: In clReleaseMemObject (inputBuffer) returned %d\n", 
-            status
-        );
-        return 1; 
-    }
-
     status = clReleaseCommandQueue(commandQueue);
     if (status != CL_SUCCESS) {
         fprintf(stderr, 
@@ -768,11 +666,6 @@ void cleanup_host(void) {
         output = NULL;
     }
 
-    if (devices != NULL) {
-        free(devices);
-        devices = NULL;
-    }
-
     if (source != NULL) {
         free((char *)source);
         source = NULL;
@@ -806,7 +699,6 @@ void print_to_file(MFILE *out, float *h_odata, int n) {
  */
 int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
     cl_int status;
-    cl_event events[2];
 
     /* 
      * the input array to the kernel. This array will eventually be modified
@@ -862,7 +754,8 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
                                     localThreads,
                                     0,
                                     NULL,
-                                    &events[0]);
+                                    NULL);
+
     if (status != CL_SUCCESS) { 
         fprintf(stderr, 
             "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n", 
@@ -871,35 +764,18 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
         return 1;
     }
 
-    /* wait for the kernel call to finish execution */
-    status = clWaitForEvents(1, &events[0]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
-    status = clReleaseEvent(events[0]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Release event object. (clReleaseEvent) returned %d\n", 
-            status
-        );
-        return 1;
-    }
+    clFinish(commandQueue);
 
     /* Enqueue readBuffer*/  //Note: we are reading back from inputBuffer since AI is modified directly in kernel
     status = clEnqueueReadBuffer(commandQueue,
                                  inputBuffer,
-                                 CL_TRUE,
+                                 CL_FALSE,
                                  0,
                                  globalThreads[0] * sizeof(cl_float),
                                  AI,
                                  0,
                                  NULL,
-                                 &events[1]);
+                                 NULL);
 
     if(status != CL_SUCCESS) { 
         fprintf(stderr, 
@@ -909,30 +785,11 @@ int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
         return 1;
     }
 
-    /* Wait for the read buffer to finish execution */
-    status = clWaitForEvents(1, &events[1]);
-    if (status != CL_SUCCESS) {
-        fprintf(stderr, 
-            "Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
-    status = clReleaseEvent(events[1]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Release event object. (clReleaseEvent) returned %d\n", 
-            status
-        );
-        return 1;
-    }
     return 0;
 }
 
 int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
     cl_int status;
-    cl_event events[2];
 
     /* 
      * the input array to the kernel. This array will eventually be modified 
@@ -998,8 +855,9 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
                                     localThreads,
                                     0,
                                     NULL,
-                                    &events[0]);
-    if (status != CL_SUCCESS) { 
+                                    NULL);
+
+    if (status != CL_SUCCESS) {
         fprintf(stderr, 
             "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n", 
             status
@@ -1007,65 +865,29 @@ int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
         return 1;
     }
 
-    /* wait for the kernel call to finish execution */
-    status = clWaitForEvents(1, &events[0]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
-    status = clReleaseEvent(events[0]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Release event object. (clReleaseEvent) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
+    clFinish(commandQueue);
+    
     /* Enqueue readBuffer*/
 	//Note: we are reading back from inputBuffer since AI is modified directly in kernel
     status = clEnqueueReadBuffer(commandQueue,
                                  inputBuffer,
-                                 CL_TRUE,
+                                 CL_FALSE,
                                  0,
                                  globalThreads[0] * sizeof(cl_float),
                                  AI,
                                  0,
                                  NULL,
-                                 &events[1]);
+                                 NULL);
     if (status != CL_SUCCESS) { 
         fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status);
         return 1;
     }
     
-    /* Wait for the read buffer to finish execution */
-    status = clWaitForEvents(1, &events[1]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
-    status = clReleaseEvent(events[1]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Release event object. (clReleaseEvent) returned %d\n", 
-            status
-        );
-        return 1;
-    }
     return 0;
 }
 
 int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
     cl_int status;
-    cl_event events[2];
 
     /* 
      * The input array to the kernel. This array will eventually be modified
@@ -1121,8 +943,9 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
                                     localThreads,
                                     0,
                                     NULL,
-                                    &events[0]);
-    if (status != CL_SUCCESS) { 
+                                    NULL);
+
+    if (status != CL_SUCCESS) {
         fprintf(stderr, 
             "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n", 
             status
@@ -1130,24 +953,7 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
         return 1;
     }
 
-    /* wait for the kernel call to finish execution */
-    status = clWaitForEvents(1, &events[0]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Waiting for kernel run to finish. (clWaitForEvents) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
-    status = clReleaseEvent(events[0]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Release event object. (clReleaseEvent) returned %d\n", 
-            status
-        );
-        return 1;
-    }
+    clFinish(commandQueue);
 
     /* Enqueue readBuffer*/
 	//Note: we are reading back from inputBuffer since AI is modified directly in kernel
@@ -1159,7 +965,7 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
                                  AI,
                                  0,
                                  NULL,
-                                 &events[1]);
+                                 NULL);
     if (status != CL_SUCCESS) { 
         fprintf(stderr, 
             "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", 
@@ -1168,25 +974,6 @@ int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
         return 1;
     }
 
-    /* Wait for the read buffer to finish execution */
-    status = clWaitForEvents(1, &events[1]);
-    if (status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Waiting for read buffer call to finish. (clWaitForEvents) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
-    status = clReleaseEvent(events[1]);
-    if(status != CL_SUCCESS) { 
-        fprintf(stderr, 
-            "Error: Release event object. (clReleaseEvent) returned %d\n", 
-            status
-        );
-        return 1;
-    }
-
 	return 0;
 }
 
@@ -1213,7 +1000,7 @@ void invert(cl_float * input, cl_float *output, int n) {
     volatile clock_t gputime;
     gputime=clock();
 
-    int lda = ((n+15)&~15|16);
+    int lda = (((n+15)&(~15))|16);
     cl_float * AI_d = (cl_float *)malloc(sizeof(cl_float)*n*lda*2);
     memset(AI_d,0,sizeof(cl_float)*n*lda*2);
     for (int i = 0; i < n; i++) {
@@ -1238,14 +1025,15 @@ void invert(cl_float * input, cl_float *output, int n) {
         );
         exit(0);
     }
+
     // Note: there's no output buffer. In kernel, AI_d is modified directly.
 	// Thus, we should read the result back to host from inputBuffer as well.
-
     invertge(AI_d, lda, n);
-    gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
+    gputime=clock()-gputime;
+    fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
     fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
 
-#ifdef VERIFY	
+#ifdef VERIFY
     // let's verify that
     cl_float error=0.0;
 
@@ -1270,6 +1058,15 @@ void invert(cl_float * input, cl_float *output, int n) {
     for (int i = 0; i < n; i++) {
         memcpy(&output[n*i], &AI_d[lda*i*2+n], sizeof(cl_float)*n);
     }
+
+    status = clReleaseMemObject(inputBuffer);
+    if (status != CL_SUCCESS) {
+        fprintf(stderr, 
+            "Error: In clReleaseMemObject (inputBuffer) returned %d\n", 
+            status
+        );
+    }
+
     free(AI_d);
     fprintf(stderr," done!\n");
 }
diff --git a/samples/openclapp/openclapp.hpp b/samples/openclapp/openclapp.hpp
index 6efd3d7ced..d291d70641 100644
--- a/samples/openclapp/openclapp.hpp
+++ b/samples/openclapp/openclapp.hpp
@@ -1,6 +1,6 @@
 // This file is part of BOINC.
 // http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2013 University of California
 //
 // BOINC is free software; you can redistribute it and/or modify it
 // under the terms of the GNU Lesser General Public License
@@ -16,10 +16,10 @@
 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
 //
 // See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
-// Contributor: Tuan Le (tuanle86@berkeley.edu)
+// Original contributor: Tuan Le (tuanle86@berkeley.edu)
 
-#ifndef ATIOPENCL_H_
-#define ATIOPENCL_H_
+#ifndef OPENCLAPP_H_
+#define OPENCLAPP_H_
 
 #ifdef __APPLE__
 #include <OpenCL/opencl.h>
@@ -37,8 +37,8 @@
 
 #define INPUT_FILENAME "input"
 #define OUTPUT_FILENAME "output"
-#define KERNELS_FILENAME "atiopencl_kernels.cl"
-#define KERNELS_FILEPATH "../../atiopencl_kernels.cl" // for Linux and Mac
+#define KERNELS_FILENAME "openclapp_kernels.cl"
+#define KERNELS_FILEPATH "../../openclapp_kernels.cl" // for Linux and Mac
 #define CHECKPOINT_FILE "matrix_inversion_state"
 
 #define LOCAL_WORK_SIZE 1
@@ -115,7 +115,6 @@ cl_uint height;
 cl_mem   inputBuffer; //in this sample app, we will read the result
                       //from the device back to host from inputBuffer as well.
 cl_context          context;
-cl_device_id        *devices;
 cl_command_queue    commandQueue;
 
 cl_program program;
@@ -185,7 +184,7 @@ void update_shmem() {
  * Calls are made to set up OpenCL memory buffers that this program uses
  * and to load the programs into memory and get kernel handles.
  */
-int initialize_cl(void);
+int initialize_cl(int argc, char * argv[]);
 
 int initialize_host(FILE *infile);
 
@@ -226,4 +225,4 @@ void invertge(cl_float * AI_d,
               int lda,
               int n);
 
-#endif  /* #ifndef ATIOPENCL_H_ */
+#endif  /* #ifndef OPENCLAPP_H_ */
diff --git a/samples/openclapp/openclapp_kernels.cl b/samples/openclapp/openclapp_kernels.cl
index a6b4a73498..e93ce7d454 100644
--- a/samples/openclapp/openclapp_kernels.cl
+++ b/samples/openclapp/openclapp_kernels.cl
@@ -1,6 +1,6 @@
 // This file is part of BOINC.
 // http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2013 University of California
 //
 // BOINC is free software; you can redistribute it and/or modify it
 // under the terms of the GNU Lesser General Public License
@@ -21,6 +21,7 @@
 // Contributor: Tuan Le (tuanle86@berkeley.edu)
 
 __kernel void GEStep1A(__global float * AI, int i, int n2, int lda2) {
+    //int k = get_group_id(0) * get_local_size(0) + get_local_id(0);
     int k=get_global_id(0);
     if (k>i && k < n2 && AI[i*lda2+k]!=0) {
         float multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];