diff --git a/samples/nvcuda/cuda.cu b/samples/nvcuda/cuda.cu
index 9cda2419e4..2ed6683838 100644
--- a/samples/nvcuda/cuda.cu
+++ b/samples/nvcuda/cuda.cu
@@ -1,37 +1,445 @@
 /*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
  *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software 
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
  *
- * Please refer to the applicable NVIDIA end user license agreement (EULA) 
- * associated with this source code for terms and conditions that govern 
- * your use of this NVIDIA software.
- * 
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-/* Template project which demonstrates the basics on how to setup a project 
-* example application.
-* Host code.
-*/
+/*
+ * cuda.cu
+ * Copyright (C) 2010 Tuan Le
+ * tuanle86@berkeley.edu
+ */
 
-// includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
+#ifdef _WIN32
+#include "boinc_win.h"
+#else
+#include "config.h"
+#include <cstdio>
+#include <cctype>
+#include <ctime>
+#include <cstring>
+#include <cstdlib>
+#include <csignal>
+#include <unistd.h>
+#endif
 
-// includes, kernels
+#include <cuda_runtime.h>
+#include <cublas.h>
+#include "cuda_config.h"
 #include "cuda_kernel.cu"
 
-////////////////////////////////////////////////////////////////////////////////
-// Program main
-////////////////////////////////////////////////////////////////////////////////
-int
-main( int argc, char** argv) 
-{
+#include "str_util.h"
+#include "util.h"
+#include "filesys.h"
+#include "boinc_api.h"
+#include "mfile.h"
+#include "graphics2.h"
+
+#ifdef APP_GRAPHICS
+#include "uc2.h"
+UC_SHMEM* shmem;
+#endif
+
+using std::string;
+
+#define CHECKPOINT_FILE "matrix_inversion_state"
+#define INPUT_FILENAME "input"
+#define OUTPUT_FILENAME "output"
+#define MATRIX_SIZE 10
+
+
+// execute the kernel NUM_ITERATIONS times
+#define NUM_ITERATIONS 19
+
+bool run_slow = false;
+bool early_exit = false;
+bool early_crash = false;
+bool early_sleep = false;
+double cpu_time = 20, comp_result;
+
+// do a billion floating-point ops
+// (note: I needed to add an arg to this;
+// otherwise the MS C++ compiler optimizes away
+// all but the first call to it!)
+//
+static double do_a_giga_flop(int foo) {
+    double x = 3.14159*foo;
+    int i;
+    for (i=0; i<500000000; i++) {
+        x += 5.12313123;
+        x *= 0.5398394834;
+    }
+    return x;
 }
 
+int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) {
+    int retval;
+    string resolved_name;
+
+    FILE* f = fopen("temp", "w");
+    if (!f) return 1;
+    fprintf(f, "%d", n); //write inversion number
+    fprintf(f, " ");
+    fprintf(f, "%d", dimension); //write dimension
+    fprintf(f, " ");
+    for (int i=0;i<dimension*dimension;++i) {
+		fprintf(f, " ");
+		fprintf(f, "%f", h_idata[i]);
+	}
+    fclose(f);
+    retval = mf.flush(); //not really necessary since we have not yet written anything to output file when doing checkpointing
+    if (retval) return retval;
+    boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name); //resolved_name is a string object
+    retval = boinc_rename("temp", resolved_name.c_str()); //c_str() will convert a string object to a char string with null terminator equivalent.
+		//	because we do rename. Thus temp does not appear, but the CHECKPOINT_FILE appears instead on the disk.
+    
+    if (retval) return retval;
+    return 0; //return 0 to indicate success.
+}
+
+#ifdef APP_GRAPHICS
+void update_shmem() {
+    if (!shmem) return;
+
+    // always do this; otherwise a graphics app will immediately
+    // assume we're not alive
+    shmem->update_time = dtime();
+
+    // Check whether a graphics app is running,
+    // and don't bother updating shmem if so.
+    // This doesn't matter here,
+    // but may be worth doing if updating shmem is expensive.
+    //
+    if (shmem->countdown > 0) {
+        // the graphics app sets this to 5 every time it renders a frame
+        shmem->countdown--;
+    } else {
+        return;
+    }
+    shmem->fraction_done = boinc_get_fraction_done();
+    shmem->cpu_time = boinc_worker_thread_cpu_time();;
+    boinc_get_status(&shmem->status);
+}
+#endif
+
+void print(REAL *A, int n); //on commandline
+void generateRandomInputFile(int n);
+int getMatrixDimension(FILE *infile);
+int countElementsInMatrix(FILE *infile);
+void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata);
+void printToFile(MFILE *out, float *h_odata, int dimension);
+
+int main(int argc, char** argv)
+{   
+    int i, retval, lastInversion=0, checkpointExists=0, dimension=0;
+    double fd;
+    char input_path[512], output_path[512], chkpt_path[512];
+    REAL* h_idata;
+    unsigned int mem_size;
+    MFILE out;
+    FILE* state, *infile;
+    
+    //generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually
+
+    for (i=0; i<argc; i++) {
+        if (!strcmp(argv[i], "-early_exit")) early_exit = true;
+        if (!strcmp(argv[i], "-early_crash")) early_crash = true;
+        if (!strcmp(argv[i], "-early_sleep")) early_sleep = true;
+        if (!strcmp(argv[i], "-run_slow")) run_slow = true;
+        if (!strcmp(argv[i], "-cpu_time")) {
+            cpu_time = atof(argv[++i]);
+        }
+    }
+	
+	retval = boinc_init();
+    if (retval) {
+        fprintf(stderr, "%s boinc_init returned %d\n",
+            boinc_msg_prefix(), retval
+        );
+        exit(retval);
+    }
+    
+    // open the input file (resolve logical name first)
+    //
+    boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
+    infile = boinc_fopen(input_path, "r");
+    if (!infile) {
+        fprintf(stderr,
+            "%s Couldn't find input file, resolved name %s.\n",
+            boinc_msg_prefix(), input_path
+        );
+        getchar();
+        exit(-1);
+    }
+    
+    boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
+    
+    // See if there's a valid checkpoint file.
+    // If so retrieve the current matrix and inversion number
+    //
+    boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
+    state = boinc_fopen(chkpt_path, "r");
+    if (state) {
+		printf("Checkpoint file is detected. Read from checkpoint file ... \n");
+		checkpointExists=fscanf(state, "%d", &lastInversion); 
+		if (checkpointExists == 1) {
+			printf("Last inversion # is : %d\n",lastInversion);	
+			fscanf(state,"%d",&dimension);
+			cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
+			for (int i=0;i<dimension*dimension;++i) {
+				fscanf(state, "%f", &h_idata[i]);
+				//printf("--%f\n",h_idata[i]);
+			}
+		}
+        fclose(state);
+    } else {
+		printf("There's no valid checkpoint file!\n");
+	}
+    
+	retval = out.open(output_path, "wb");
+    
+    if (retval) {
+        fprintf(stderr, "%s APP: matrix_inversion output open failed:\n",
+            boinc_msg_prefix()
+        );
+        fprintf(stderr, "%s resolved name %s, retval %d\n",
+            boinc_msg_prefix(), output_path, retval
+        );
+        perror("open");
+        exit(1);
+    }
+
+#ifdef APP_GRAPHICS
+    // create shared mem segment for graphics, and arrange to update it
+    //
+    shmem = (UC_SHMEM*)boinc_graphics_make_shmem("uppercase", sizeof(UC_SHMEM));
+    if (!shmem) {
+        fprintf(stderr, "%s failed to create shared mem segment\n",
+            boinc_msg_prefix()
+        );
+    }
+    update_shmem();
+    boinc_register_timer_callback(update_shmem);
+#endif
+
+    if (checkpointExists != 1) {
+		dimension=getMatrixDimension(infile);
+		printf("Matrix dimension: %d\n",dimension);
+		cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
+		fetchElementsIntoHostMemory(infile,h_idata);
+		out.printf("\n----------------- Before being inversed ----------------\n\n");
+		printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",NUM_ITERATIONS);
+	} else {
+		out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",lastInversion);
+		printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",NUM_ITERATIONS-lastInversion,lastInversion+1);
+	}
+	printToFile(&out,h_idata,dimension);
+	
+	
+	
+	for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
+		invert(h_idata,dimension);
+		
+		printf("Finish inversion #%d\n",i);
+		
+		if (run_slow) {
+            boinc_sleep(1.);
+        }
+
+        if (early_exit && i>30) {
+            exit(-10);
+        }
+
+        if (early_crash && i>30) {
+            boinc_crash();
+        }
+        if (early_sleep && i>30) {
+            g_sleep = true;
+            while (1) boinc_sleep(1);
+        }
+		
+		if (boinc_time_to_checkpoint()) {
+		//if (i==7) {
+			printf("Perform checkpointing at inversion # %d\n",i);
+			
+			//we'll need to write the current matrix to the state file.
+			retval = do_checkpoint(out, i, h_idata, dimension); 
+            if (retval) {
+                fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n",
+                    boinc_msg_prefix(), retval
+                );
+                exit(retval);
+            }
+            boinc_checkpoint_completed();
+        }
+
+        fd = i/NUM_ITERATIONS;
+        if (cpu_time) fd /= 2;
+        boinc_fraction_done(fd);
+
+	}
+    
+    out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS);
+	printToFile(&out,h_idata,dimension);
+
+    cudaFreeHost( h_idata );
+    retval = out.flush(); //force the output file to be closed.
+    if (retval) {
+        fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n",
+            boinc_msg_prefix(), retval
+        );
+        exit(1);
+    }
+    
+    // burn up some CPU time if needed
+    //
+    if (cpu_time) {
+        double start = dtime();
+        for (int i=0; ; i++) {
+            double e = dtime()-start;
+            if (e > cpu_time) break;
+            fd = .5 + .5*(e/cpu_time);
+            boinc_fraction_done(fd);
+
+            if (boinc_time_to_checkpoint()) {
+                retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension);
+                if (retval) {
+                    fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n",
+                        boinc_msg_prefix(), retval
+                    );
+                    exit(1);
+                }
+                boinc_checkpoint_completed();
+            }
+            comp_result = do_a_giga_flop(i);
+        }
+    }
+    
+    boinc_fraction_done(1);
+#ifdef APP_GRAPHICS
+    update_shmem();
+#endif
+
+	printf("Done!");
+    getchar();
+    boinc_finish(0);
+}
+
+#ifdef _WIN32
+int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
+    LPSTR command_line;
+    char* argv[100];
+    int argc;
+
+    command_line = GetCommandLine();
+    argc = parse_command_line( command_line, argv );
+    return main(argc, argv);
+}
+#endif
+
+void print(REAL *h_idata, int n) {
+	int j=0;
+	for (int i=0;i<n*n;++i) {
+		printf("%f    ",h_idata[i]);
+		if (j+1==n) {
+			printf("\n");
+			j=0;
+		} else {
+			++j;
+		}
+	}
+}
+
+// nxn matrix
+void generateRandomInputFile(int n) {
+	FILE *infile;
+	
+	infile=fopen(INPUT_FILENAME,"w");
+	REAL *h_idata = new REAL[n*n];
+	srand(n);
+    for( int i = 0; i < n; i++ ) {
+    	for (int j = 0; j < n; j++) {
+    		h_idata[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
+    	}
+		h_idata[i*n+i] += sqrt((float)n);
+    }
+    int j=0;
+	for (int i=0;i<n*n;++i) {
+		fprintf(infile,"%15f",h_idata[i]);
+		if (j+1==n) {
+			fprintf(infile,"\n");
+			j=0;
+		} else {
+			++j;
+		}
+	}
+    fclose(infile);
+}
+
+int getMatrixDimension(FILE *infile) {
+	int w=0;
+	char c;
+	
+	fseek(infile,0,SEEK_SET);
+	while (true) {
+		do {
+			c=fgetc(infile);
+			if (c == EOF || c == '\n') {
+				goto exitLoop;
+			}
+		} while (isspace(c));
+		
+		if (isdigit(c) || c=='.' || c=='-') {
+			++w;
+		}
+		
+		do {
+			c=fgetc(infile);
+			if (c == EOF || c == '\n') {
+				goto exitLoop;
+			}
+		} while (isdigit(c) || c=='.' || c=='-');
+		
+		if (c==EOF || c == '\n') {
+			break;
+		}
+	}
+	exitLoop:
+	return w;
+}
+
+void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata) {
+	float num=0;
+	int i=0;
+	fseek(infile,0,SEEK_SET);
+	while (fscanf(infile,"%f",&num)==1) {
+		h_idata[i]=num;
+		++i;
+	}
+}
+
+void printToFile(MFILE *out, float *h_odata, int dimension) {
+	int count=0;
+    int move=0;
+    int num_elements=dimension*dimension;
+    while (num_elements>0) {
+		out->printf("%15f    ",h_odata[move]);
+		++count;
+		++move;
+		if (count==dimension) {
+			out->printf("\n");
+			count=0;
+		}
+		--num_elements;
+	}
+}
\ No newline at end of file
diff --git a/samples/nvcuda/cuda_config.h b/samples/nvcuda/cuda_config.h
new file mode 100644
index 0000000000..ce1603686b
--- /dev/null
+++ b/samples/nvcuda/cuda_config.h
@@ -0,0 +1,27 @@
+
+#ifdef DOUBLE_PRECISION
+#define REAL            double
+#define jREAL           jdouble
+#define jREALArray           jdoubleArray
+#else
+#define REAL            float
+#define jREAL           jfloat
+#define jREALArray           jfloatArray
+#endif
+
+
+
+inline void __cudaSafeCall( int err, const char *file, const int line )
+{
+  do {
+    if( err != 0) {
+      fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n",
+	      file, line, cudaGetErrorString((cudaError_t) err) );
+      exit(-1);
+    }
+  } while (0);
+}
+
+#define SAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__)
+#define CUDACHECK {cudaError_t error = cudaGetLastError(); if(error != 0){fprintf(stderr, "Error code %d: %s file %s line %i.\n",error,cudaGetErrorString(error),__FILE__,__LINE__);}}
+
diff --git a/samples/nvcuda/cuda_kernel.cu b/samples/nvcuda/cuda_kernel.cu
index cef8c6cafe..f9714ef5ba 100644
--- a/samples/nvcuda/cuda_kernel.cu
+++ b/samples/nvcuda/cuda_kernel.cu
@@ -1,36 +1,137 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation. 
- * Any use, reproduction, disclosure, or distribution of this software 
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
- *
- * Please refer to the applicable NVIDIA end user license agreement (EULA) 
- * associated with this source code for terms and conditions that govern 
- * your use of this NVIDIA software.
- * 
- */
 
-/* Template project which demonstrates the basics on how to setup a project 
- * example application.
- * Device code.
- */
-
-#ifndef _CUDA_KERNEL_H_
-#define _CUDA_KERNEL_H_
+// When VERIFY is defined, the sum of squared errors is calculated between the
+// identity matrix and the product A * incerse(A). For debugging...
+//#define VERIFY 1
 
 #include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "config.h"
 
-////////////////////////////////////////////////////////////////////////////////
-//! Simple test kernel for device functionality
-//! @param g_idata  input data in global memory
-//! @param g_odata  output data in global memory
-////////////////////////////////////////////////////////////////////////////////
-__global__ void
-testKernel( float* g_idata, float* g_odata) 
-{
+void mathdispAI(const REAL *mat, int lda, int MAT_SIZE_h) {
+    fprintf(stderr, "\n");
+	int i,j;
+    for (j=0;j<MAT_SIZE_h;j++)  {
+		for (i=0;i<MAT_SIZE_h*2;i++) {
+	    	fprintf(stderr, "%6.3f",mat[j*lda*2+i]);
+	}
+	fprintf(stderr, "\n");
+    }
+    fprintf(stderr, "\n");
+} // mathdisp2
+
+void mathdispAId(const REAL * AId, int lda, int n) {
+	REAL * AI = new REAL[n*lda*2];
+	cudaMemcpy(AI,AId,sizeof(REAL)*n*lda*2,cudaMemcpyDeviceToHost);
+	mathdispAI(AI, lda, n);
+	delete [] AI;
 }
 
-#endif // #ifndef _CUDA_KERNEL_H_
+__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {
+	int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (k>i && k < n2 && AI[i*lda2+k]!=0) {
+    REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];
+	int n = n2 / 2;
+    for (int j = i+1; j < n; j++) {
+      AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
+    }
+  }
+}
+
+__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {
+	int k = blockIdx.x * blockDim.x + threadIdx.x;
+	if (k < n2) {
+		AI[i*lda2+k] /= diag;
+	}
+}
+
+__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k > i && k < n2) {
+    REAL multiplyer = -AI[i*lda2+k];
+    for (int j = 0; j < i; j++) {
+      AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
+    }
+  }
+}
+
+//extern void invert(REAL * A, int n);
+void invertge(REAL * AI_d, int lda, int n) {
+	int lda2 = lda * 2;
+  // perform elementary row operations till A in AI becomes identity matrix
+  for (int i = 0; i < n; i++) {
+    GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2);
+    CUDACHECK;
+    cudaThreadSynchronize();
+  }
+
+
+  for (int i = n-1; i >= 0; i--) {
+    REAL diag = 1.0;
+    SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));
+    GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2);
+    CUDACHECK;
+
+    GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2);
+    CUDACHECK;
+    cudaThreadSynchronize();
+    CUDACHECK;
+  }
+} // invertge
+
+
+/* inverts nxn matrix A and stores result back in A */
+void invert(REAL * A, int n) {
+fprintf(stderr,"starting inversion n = %d ", n);
+    volatile clock_t gputime, gputime0;
+    gputime=clock();
+    gputime0 = gputime;
+
+    int lda = ((n+15)&~15|16);
+//lda=n;
+	REAL * AI = new REAL[n*lda*2];
+	memset(AI,0,sizeof(REAL)*n*lda*2);
+	for (int i = 0; i < n; i++) {
+		memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n);
+		AI[lda*i*2+n+i] = 1;
+	}
+
+	REAL * AI_d;
+	SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2));
+	SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice));
+
+	invertge(AI_d, lda, n);
+	SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost));
+	cudaFree(AI_d);
+
+
+	gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
+    	fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
+#ifdef VERIFY	
+	// let's verify that
+	REAL error=0.0;
+
+	// multiply inverse*xcopy, should be Identity matrix
+	for (int k = 0; k < n; k++) {
+	  for (int j = 0; j < n; j++) {
+	    REAL sum = 0;
+	    for (int i = 0; i < n; i++) {
+	      sum += AI[j*lda*2+n+i]*A[i*n+k];
+	    }
+	    if (j!=k) {
+	      error += sum * sum;
+	    } else {
+	      error += (1.0-sum) * (1.0-sum);
+	    }
+	  }
+	}
+    fprintf(stderr, " %6.2f SSE", error);
+#endif	
+
+	for (int i = 0; i < n; i++) {
+		memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n);
+	}
+	free(AI);
+	fprintf(stderr," done!\n");
+} // invert
\ No newline at end of file