diff --git a/samples/nvopencl/nvopencl.cpp b/samples/nvopencl/nvopencl.cpp
index 0ae00c167d..9ce7bd7dec 100644
--- a/samples/nvopencl/nvopencl.cpp
+++ b/samples/nvopencl/nvopencl.cpp
@@ -458,8 +458,7 @@ int initialize_cl(void) {
     size_t deviceListSize;
 
     localThreads[0]  = LOCAL_WORK_SIZE;
-    // rounded up to the nearest multiple of the LocalWorkSize
-    globalThreads[0] = shrRoundUp((int)(localThreads[0]),width*height);
+    globalThreads[0] = shrRoundUp(GLOBAL_WORK_SIZE,width*height);
 
     /*
      * Have a look at the available platforms and pick either
diff --git a/samples/nvopencl/nvopencl.hpp b/samples/nvopencl/nvopencl.hpp
index 031613077b..2390a4731e 100644
--- a/samples/nvopencl/nvopencl.hpp
+++ b/samples/nvopencl/nvopencl.hpp
@@ -37,7 +37,8 @@
 #define KERNELS_FILEPATH "../../nvopencl_kernels.cl" // for Linux and Mac
 #define CHECKPOINT_FILE "matrix_inversion_state"
 
-#define LOCAL_WORK_SIZE 400
+#define LOCAL_WORK_SIZE 1 // assigned to localThreads[0] in initialize_cl()
+#define GLOBAL_WORK_SIZE 400 // passed together with width*height to shrRoundUp() to compute globalThreads[0]
 #define MATRIX_SIZE 10
 #define NUM_ITERATIONS 501 // execute the kernel NUM_ITERATIONS times