Makefile_mac completed

cuda_mac.c and cuda_kernel_mac.cu are targeted on Mac svn path=/trunk/boinc/; revision=21862
2010-07-01 17:53:11 +00:00 · 2010-07-01 17:53:11 +00:00 · fc87ee85d4
parent d59af26596
commit fc87ee85d4
6 changed files with 1120 additions and 2 deletions
--- a/samples/nvcuda/Makefile_mac
+++ b/samples/nvcuda/Makefile_mac
@ -0,0 +1,16 @@
+# Tuan Le
+# University of California, Berkeley
+# Berkeley Space Sciences Lab
+# tuanle86@berkeley.ed
+
+# Add source files here
+EXECUTABLE	:= example_app_nvcuda_mac
+# Cuda source files (compiled with cudacc)
+CUFILES		:= cuda_kernel_mac.cu
+# C/C++ source files (compiled with gcc / c++)
+CCFILES		:= cuda_mac.c \
+
+################################################################################
+# Rules and targets
+
+include common_mac.mk
--- a/samples/nvcuda/common_mac.mk
+++ b/samples/nvcuda/common_mac.mk
@ -0,0 +1,479 @@
+# Tuan Le
+# University of California, Berkeley
+# Berkeley Space Sciences Lab
+# tuanle86@berkeley.edu
+
+.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx
+
+# Add new SM Versions here as devices with new Compute Capability are released
+SM_VERSIONS   := 10 11 12 13 20
+
+CUDA_INSTALL_PATH ?= /usr/local/cuda
+
+ifdef cuda-install
+	CUDA_INSTALL_PATH := $(cuda-install)
+endif
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
+
+# 'linux' is output for Linux system, 'darwin' for OS X
+DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
+ifneq ($(DARWIN),)
+   SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "<string>10\.6" /System/Library/CoreServices/SystemVersion.plist)))
+endif
+
+# detect 32-bit or 64-bit platform
+HP_64 = $(shell uname -m | grep 64)
+OSARCH= $(shell uname -m)
+
+# Basic directory setup for SDK
+# (override directories only if they are not already defined)
+SRCDIR     ?= 
+ROOTDIR    ?= /Developer/GPU\ Computing
+ROOTBINDIR ?= ../../samples/nvcuda
+BINDIR     ?= $(ROOTBINDIR)/$(OSLOWER)
+ROOTOBJDIR ?= obj
+
+# BOINC directory
+BOINC_DIR = ../..
+BOINC_API_DIR = $(BOINC_DIR)/api
+BOINC_LIB_DIR = $(BOINC_DIR)/lib
+BOINC_BUILD_DIR = $(BOINC_DIR)/mac_build/build/Deployment
+BOINC_CONFIG_DIR =  $(BOINC_DIR)/clientgui/mac
+FRAMEWORKS_DIR = /System/Library/Frameworks
+
+LIBDIR     := $(ROOTDIR)/C/lib
+COMMONDIR  := $(ROOTDIR)/C/common
+SHAREDDIR  := $(ROOTDIR)/shared/
+
+# Includes
+INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc -I$(BOINC_CONFIG_DIR) -I$(BOINC_DIR) -I$(BOINC_LIB_DIR) -I$(BOINC_API_DIR)
+
+# Compilers
+NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc
+CXX        := g++ 
+CC         := gcc
+#Tuan edits LINK to have #(INCLUDES)
+LINK       := g++ -fPIC $(INCLUDES)
+
+# Warning flags
+CXXWARN_FLAGS := \
+	-W -Wall \
+	-Wimplicit \
+	-Wswitch \
+	-Wformat \
+	-Wchar-subscripts \
+	-Wparentheses \
+	-Wmultichar \
+	-Wtrigraphs \
+	-Wpointer-arith \
+	-Wcast-align \
+	-Wreturn-type \
+	-Wno-unused-function \
+	$(SPACE)
+
+CWARN_FLAGS := $(CXXWARN_FLAGS) \
+	-Wstrict-prototypes \
+	-Wmissing-prototypes \
+	-Wmissing-declarations \
+	-Wnested-externs \
+	-Wmain \
+
+# architecture flag for nvcc and gcc compilers build
+CUBIN_ARCH_FLAG :=
+CXX_ARCH_FLAGS  :=
+NVCCFLAGS       :=
+LIB_ARCH        := $(OSARCH)
+
+# Determining the necessary Cross-Compilation Flags
+# 32-bit OS, but we target 64-bit cross compilation
+ifeq ($(x86_64),1) 
+    NVCCFLAGS       += -m64
+    LIB_ARCH         = x86_64
+    CUDPPLIB_SUFFIX  = x86_64
+    ifneq ($(DARWIN),)
+         CXX_ARCH_FLAGS += -arch x86_64
+    else
+         CXX_ARCH_FLAGS += -m64
+    endif
+else 
+# 64-bit OS, and we target 32-bit cross compilation
+    ifeq ($(i386),1)
+        NVCCFLAGS       += -m32
+        LIB_ARCH         = i386
+        CUDPPLIB_SUFFIX  = i386
+        ifneq ($(DARWIN),)
+             CXX_ARCH_FLAGS += -arch i386
+        else
+             CXX_ARCH_FLAGS += -m32
+        endif
+    else 
+        ifeq "$(strip $(HP_64))" ""
+            LIB_ARCH        = i386
+            CUDPPLIB_SUFFIX = i386
+            NVCCFLAGS      += -m32
+            ifneq ($(DARWIN),)
+               CXX_ARCH_FLAGS += -arch i386
+            else
+               CXX_ARCH_FLAGS += -m32
+            endif
+        else
+            LIB_ARCH        = x86_64
+            CUDPPLIB_SUFFIX = x86_64
+            NVCCFLAGS      += -m64
+            ifneq ($(DARWIN),)
+               CXX_ARCH_FLAGS += -arch x86_64
+            else
+               CXX_ARCH_FLAGS += -m64
+            endif
+        endif
+    endif
+endif
+
+
+ifeq ($(noinline),1)
+   NVCCFLAGS   += -Xopencc -noinline
+   # Compiler-specific flags, when using noinline, we don't build for SM1x
+   GENCODE_SM10 := 
+   GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+else
+   # Compiler-specific flags (by default, we always use sm_10 and sm_20), unless we use the SMVERSION template
+   GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\"
+   GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+endif
+
+CXXFLAGS  += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS)
+CFLAGS    += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS)
+LINKFLAGS +=
+LINK      += $(LINKFLAGS) $(CXX_ARCH_FLAGS)
+
+# This option for Mac allows CUDA applications to work without requiring to set DYLD_LIBRARY_PATH
+ifneq ($(DARWIN),)
+   LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib
+endif
+
+# Common flags
+COMMONFLAGS += $(INCLUDES) -DUNIX
+
+# Debug/release configuration
+ifeq ($(dbg),1)
+	COMMONFLAGS += -g
+	NVCCFLAGS   += -D_DEBUG
+	CXXFLAGS    += -D_DEBUG
+	CFLAGS      += -D_DEBUG
+	BINSUBDIR   := debug
+	LIBSUFFIX   := D
+else 
+	COMMONFLAGS += -O2 
+	BINSUBDIR   := release
+	LIBSUFFIX   := 
+	NVCCFLAGS   += --compiler-options -fno-strict-aliasing
+	CXXFLAGS    += -fno-strict-aliasing
+	CFLAGS      += -fno-strict-aliasing
+endif
+
+# architecture flag for cubin build
+CUBIN_ARCH_FLAG :=
+
+# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
+ifeq ($(USEGLLIB),1)
+    ifneq ($(DARWIN),)
+        OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries 
+        OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
+    else
+# this case for linux platforms
+	OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu
+# check if x86_64 flag has been set, otherwise, check HP_64 is i386/x86_64
+        ifeq ($(x86_64),1) 
+	       OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
+        else
+             ifeq ($(i386),)
+                 ifeq "$(strip $(HP_64))" ""
+	             OPENGLLIB += -lGLEW -L/usr/X11R6/lib
+                 else
+	             OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
+                 endif
+             endif
+        endif
+# check if i386 flag has been set, otehrwise check HP_64 is i386/x86_64
+        ifeq ($(i386),1)
+	       OPENGLLIB += -lGLEW -L/usr/X11R6/lib
+        else
+             ifeq ($(x86_64),)
+                 ifeq "$(strip $(HP_64))" ""
+	             OPENGLLIB += -lGLEW -L/usr/X11R6/lib
+                 else
+	             OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
+                 endif
+             endif
+        endif
+    endif
+endif
+
+ifeq ($(USEGLUT),1)
+    ifneq ($(DARWIN),)
+	OPENGLLIB += -framework GLUT
+    else
+        ifeq ($(x86_64),1)
+	     OPENGLLIB += -lglut -L/usr/lib64 
+        endif
+        ifeq ($(i386),1)
+	     OPENGLLIB += -lglut -L/usr/lib 
+        endif
+
+        ifeq ($(x86_64),)
+            ifeq ($(i386),)  
+	        OPENGLLIB += -lglut
+            endif
+        endif
+    endif
+endif
+
+ifeq ($(USEPARAMGL),1)
+	PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX)
+endif
+
+ifeq ($(USERENDERCHECKGL),1)
+	RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX)
+endif
+
+ifeq ($(USECUDPP), 1)
+    CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)$(LIBSUFFIX)
+
+    ifeq ($(emu), 1)
+        CUDPPLIB := $(CUDPPLIB)_emu
+    endif
+endif
+
+ifeq ($(USENVCUVID), 1)
+     ifneq ($(DARWIN),)
+         NVCUVIDLIB := -L../../common/lib/darwin -lnvcuvid
+     endif
+endif
+
+# Libs
+ifneq ($(DARWIN),)
+    LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB) 
+else
+  ifeq "$(strip $(HP_64))" ""
+    ifeq ($(x86_64),1)
+       LIB       := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib 
+    else
+       LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
+    endif
+  else
+    ifeq ($(i386),1)
+       LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
+    else
+       LIB       := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
+    endif
+  endif
+endif
+
+# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB
+ifeq ($(USECUDADYNLIB),1)
+     LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic 
+else
+# static linking, we will statically link against CUDA and CUDART
+  ifeq ($(USEDRVAPI),1)
+     LIB += -lcuda   ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 
+  else
+     ifeq ($(emu),1) 
+         LIB += -lcudartemu
+     else 
+         LIB += -lcudart
+     endif
+     LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
+  endif
+endif
+
+ifeq ($(USECUFFT),1)
+  ifeq ($(emu),1)
+    LIB += -lcufftemu
+  else
+    LIB += -lcufft
+  endif
+endif
+
+ifeq ($(USECUBLAS),1)
+  ifeq ($(emu),1)
+    LIB += -lcublasemu
+  else
+    LIB += -lcublas
+  endif
+endif
+
+# Lib/exe configuration
+ifneq ($(STATIC_LIB),)
+	TARGETDIR := $(LIBDIR)
+	TARGET   := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
+	LINKLINE  = ar rucv $(TARGET) $(OBJS)
+else
+	ifneq ($(OMIT_CUTIL_LIB),1)
+
+#TUAN add - lboinc and -lboinc_api below
+		LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX) -lboinc_api -L$(BOINC_BUILD_DIR) -lboinc -L$(BOINC_BUILD_DIR)
+	endif
+	# Device emulation configuration
+	ifeq ($(emu), 1)
+		NVCCFLAGS   += -deviceemu
+		CUDACCFLAGS += 
+		BINSUBDIR   := emu$(BINSUBDIR)
+		# consistency, makes developing easier
+		CXXFLAGS		+= -D__DEVICE_EMULATION__
+		CFLAGS			+= -D__DEVICE_EMULATION__
+	endif
+	TARGETDIR := $(BINDIR)/$(BINSUBDIR)
+	TARGET    := $(TARGETDIR)/$(EXECUTABLE)
+	LINKLINE  = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
+endif
+
+# check if verbose 
+ifeq ($(verbose), 1)
+	VERBOSE :=
+else
+	VERBOSE := @
+endif
+
+################################################################################
+# Check for input flags and set compiler flags appropriately
+################################################################################
+ifeq ($(fastmath), 1)
+	NVCCFLAGS += -use_fast_math
+endif
+
+ifeq ($(nvccverbose), 1)
+        NVCCFLAGS += -v
+endif
+
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+	NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
+endif
+
+ifdef maxregisters
+	NVCCFLAGS += -maxrregcount $(maxregisters)
+endif
+
+# Add cudacc flags
+NVCCFLAGS += $(CUDACCFLAGS)
+
+# Add common flags
+NVCCFLAGS += $(COMMONFLAGS)
+CXXFLAGS  += $(COMMONFLAGS)
+CFLAGS    += $(COMMONFLAGS)
+
+ifeq ($(nvcc_warn_verbose),1)
+	NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) 
+	NVCCFLAGS += --compiler-options -fno-strict-aliasing
+endif
+
+################################################################################
+# Set up object files
+################################################################################
+OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR)
+OBJS +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))
+OBJS +=  $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES)))
+OBJS +=  $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES)))
+
+################################################################################
+# Set up cubin output files
+################################################################################
+CUBINDIR := $(SRCDIR)data
+CUBINS +=  $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))
+
+################################################################################
+# Set up PTX output files
+################################################################################
+PTXDIR := $(SRCDIR)data
+PTXBINS +=  $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES)))
+
+################################################################################
+# Rules
+################################################################################
+$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)
+	$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<
+
+$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)
+	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile
+$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)
+	$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
+
+# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile
+$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
+	$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<
+
+$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory
+	$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $<
+
+#
+# The following definition is a template that gets instantiated for each SM
+# version (sm_10, sm_13, etc.) stored in SMVERSIONS.  It does 2 things:
+# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.
+# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding 
+#    .cu file.
+#
+# The intended use for this is to allow Makefiles that use common.mk to compile
+# files to different Compute Capability targets (aka SM arch version).  To do
+# so, in the Makefile, list files for each SM arch separately, like so:
+# This will be used over the default rule abov
+#
+# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
+# CUFILES_sm_12 := anothercudakernel_sm12.cu
+#
+define SMVERSION_template
+#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))
+OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1))))
+$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)
+#	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS)  $(1)
+   # if we have noinline enabled, we only turn this enable this for SM 2.x architectures
+   ifeq ($(noinline),1)
+	$(VERBOSE)$(NVCC) $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)
+   else
+	$(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)
+   endif
+endef
+
+# This line invokes the above template for each arch version stored in
+# SM_VERSIONS.  The call funtion invokes the template, and the eval
+# function interprets it as make commands.
+$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))
+
+###### TUAN This below line causes mmintrin.h and xmmintrin.h error.
+
+$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile_mac
+	$(VERBOSE)$(LINKLINE)
+
+cubindirectory:
+	$(VERBOSE)mkdir -p $(CUBINDIR)
+
+ptxdirectory:
+	$(VERBOSE)mkdir -p $(PTXDIR)
+
+makedirectories:
+	$(VERBOSE)mkdir -p $(LIBDIR)
+	$(VERBOSE)mkdir -p $(OBJDIR)
+	$(VERBOSE)mkdir -p $(TARGETDIR)
+
+
+tidy :
+	$(VERBOSE)find . | egrep "#" | xargs rm -f
+	$(VERBOSE)find . | egrep "\~" | xargs rm -f
+
+clean : tidy
+#TUAN	$(VERBOSE)rm -f $(OBJS)
+	$(VERBOSE)rm -f $(CUBINS)
+	$(VERBOSE)rm -f $(PTXBINS)
+	$(VERBOSE)rm -f $(TARGET)
+	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
+	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm
+	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm
+	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin
+	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp
+
+clobber : clean
+	$(VERBOSE)rm -rf $(ROOTOBJDIR)
--- a/samples/nvcuda/cuda_kernel_mac.cu
+++ b/samples/nvcuda/cuda_kernel_mac.cu
@ -0,0 +1,122 @@
+/*
+ * Tuan Le
+ * University of California, Berkeley
+ * Berkeley Space Sciences Lab
+ * tuanle86@berkeley.edu
+ */
+
+// When VERIFY is defined, the sum of squared errors is calculated between the
+// identity matrix and the product A * incerse(A). For debugging...
+//#define VERIFY 1
+
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "cuda_config.h"
+
+__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {
+	int k = blockIdx.x * blockDim.x + threadIdx.x;
+	if (k>i && k < n2 && AI[i*lda2+k]!=0) {
+		REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];
+		int n = n2 / 2;
+		for (int j = i+1; j < n; j++) {
+			AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
+		}
+	}
+}
+
+__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {
+	int k = blockIdx.x * blockDim.x + threadIdx.x;
+	if (k < n2) {
+		AI[i*lda2+k] /= diag;
+	}
+}
+
+__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {
+	int k = blockIdx.x * blockDim.x + threadIdx.x;
+	if (k > i && k < n2) {
+		REAL multiplyer = -AI[i*lda2+k];
+		for (int j = 0; j < i; j++) {
+			AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
+		}
+	}
+}
+
+//extern void invert(REAL * A, int n);
+void invertge(REAL * AI_d, int lda, int n) {
+	int lda2 = lda * 2;
+	// perform elementary row operations till A in AI becomes identity matrix
+	for (int i = 0; i < n; i++) {
+		GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2);
+		CUDACHECK;
+		cudaThreadSynchronize();
+	}
+
+	for (int i = n-1; i >= 0; i--) {
+		REAL diag = 1.0;
+		SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));
+		GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2);
+		CUDACHECK;
+
+		GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2);
+		CUDACHECK;
+		cudaThreadSynchronize();
+		CUDACHECK;
+	}
+} // invertge
+
+
+/* inverts nxn matrix A and stores result back in A */
+extern void invert(REAL * A, int n) {
+	fprintf(stderr,"starting inversion n = %d ", n);
+	volatile clock_t gputime, gputime0;
+	gputime=clock();
+	gputime0 = gputime;
+
+	int lda = ((n+15)&~15|16);
+	//lda=n;
+	REAL * AI = (REAL *)malloc(sizeof(REAL)*(n*lda*2));
+	memset(AI,0,sizeof(REAL)*n*lda*2);
+	for (int i = 0; i < n; i++) {
+		memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n);
+		AI[lda*i*2+n+i] = 1;
+	}
+
+	REAL * AI_d;
+	SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2));
+	SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice));
+
+	invertge(AI_d, lda, n);
+	SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost));
+	cudaFree(AI_d);
+
+
+	gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
+    	fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
+#ifdef VERIFY	
+	// let's verify that
+	REAL error=0.0;
+
+	// multiply inverse*xcopy, should be Identity matrix
+	for (int k = 0; k < n; k++) {
+	  for (int j = 0; j < n; j++) {
+	    REAL sum = 0;
+	    for (int i = 0; i < n; i++) {
+	      sum += AI[j*lda*2+n+i]*A[i*n+k];
+	    }
+	    if (j!=k) {
+	      error += sum * sum;
+	    } else {
+	      error += (1.0-sum) * (1.0-sum);
+	    }
+	  }
+	}
+	fprintf(stderr, " %6.2f SSE", error);
+#endif	
+
+	for (int i = 0; i < n; i++) {
+		memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n);
+	}
+	free(AI);
+	fprintf(stderr," done!\n");
+} // invert
--- a/samples/nvcuda/cuda_mac.c
+++ b/samples/nvcuda/cuda_mac.c
@ -0,0 +1,440 @@
+/*
+ * Tuan Le
+ * University of California, Berkeley
+ * Berkeley Space Sciences Lab
+ * tuanle86@berkeley.edu
+ */
+
+#ifdef _WIN32
+#include "boinc_win.h"
+#else
+#include "config.h"
+#include <cstdio>
+#include <cctype>
+#include <ctime>
+#include <cstring>
+#include <cstdlib>
+#include <csignal>
+#include <unistd.h>
+#endif
+
+#include <cuda_runtime.h>
+#include <cublas.h>
+#include "cuda_config.h"
+
+#include "str_util.h"
+#include "util.h"
+#include "filesys.h"
+#include "boinc_api.h"
+#include "mfile.h"
+#include "graphics2.h"
+
+struct UC_SHMEM {
+    double update_time;
+    double fraction_done;
+    double cpu_time;
+    BOINC_STATUS status;
+    int countdown;
+	// graphics app sets this to 5 repeatedly,
+	// main program decrements it once/sec.
+	// If it's zero, don't bother updating shmem
+};
+
+#ifdef APP_GRAPHICS
+#include "uc2.h"
+UC_SHMEM* shmem;
+#endif
+
+using std::string;
+
+#define CHECKPOINT_FILE "matrix_inversion_state"
+#define INPUT_FILENAME "input"
+#define OUTPUT_FILENAME "output"
+#define MATRIX_SIZE 10
+
+
+// execute the kernel NUM_ITERATIONS times
+#define NUM_ITERATIONS 51
+
+bool run_slow = false;
+bool early_exit = false;
+bool early_crash = false;
+bool early_sleep = false;
+double cpu_time = 20, comp_result;
+
+// do a billion floating-point ops
+// (note: I needed to add an arg to this;
+// otherwise the MS C++ compiler optimizes away
+// all but the first call to it!)
+//
+static double do_a_giga_flop(int foo) {
+    double x = 3.14159*foo;
+    int i;
+    for (i=0; i<500000000; i++) {
+        x += 5.12313123;
+        x *= 0.5398394834;
+    }
+    return x;
+}
+
+int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) {
+    int retval;
+    string resolved_name;
+	
+    FILE* f = fopen("temp", "w");
+    if (!f) return 1;
+    fprintf(f, "%d", n); //write inversion number
+    fprintf(f, " ");
+    fprintf(f, "%d", dimension); //write dimension
+    fprintf(f, " ");
+    for (int i=0;i<dimension*dimension;++i) {
+		fprintf(f, " ");
+		fprintf(f, "%f", h_idata[i]);
+	}
+    fclose(f);
+    retval = mf.flush(); //not really necessary since we have not yet written anything to output file when doing checkpointing
+    if (retval) return retval;
+    boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name); //resolved_name is a string object
+    retval = boinc_rename("temp", resolved_name.c_str()); //c_str() will convert a string object to a char string with null terminator equivalent.
+	//	because we do rename. Thus temp does not appear, but the CHECKPOINT_FILE appears instead on the disk.
+    
+    if (retval) return retval;
+    return 0; //return 0 to indicate success.
+}
+
+#ifdef APP_GRAPHICS
+void update_shmem() {
+    if (!shmem) return;
+	
+    // always do this; otherwise a graphics app will immediately
+    // assume we're not alive
+    shmem->update_time = dtime();
+	
+    // Check whether a graphics app is running,
+    // and don't bother updating shmem if so.
+    // This doesn't matter here,
+    // but may be worth doing if updating shmem is expensive.
+    //
+    if (shmem->countdown > 0) {
+        // the graphics app sets this to 5 every time it renders a frame
+        shmem->countdown--;
+    } else {
+        return;
+    }
+    shmem->fraction_done = boinc_get_fraction_done();
+    shmem->cpu_time = boinc_worker_thread_cpu_time();;
+    boinc_get_status(&shmem->status);
+}
+#endif
+
+void print(REAL *A, int n); //on commandline
+void generateRandomInputFile(int n);
+int getMatrixDimension(FILE *infile);
+int countElementsInMatrix(FILE *infile);
+void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata);
+void printToFile(MFILE *out, float *h_odata, int dimension);
+extern void invert(REAL * A, int n);
+
+int main(int argc, char** argv)
+{   
+    int i, retval, lastInversion=0, checkpointExists=0, dimension=0;
+    double fd;
+    char input_path[512], output_path[512], chkpt_path[512], buf[256];
+    REAL* h_idata;
+    MFILE out;
+    FILE* state, *infile;
+    
+    generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually
+	
+    for (i=0; i<argc; i++) {
+        if (!strcmp(argv[i], "-early_exit")) early_exit = true;
+        if (!strcmp(argv[i], "-early_crash")) early_crash = true;
+        if (!strcmp(argv[i], "-early_sleep")) early_sleep = true;
+        if (!strcmp(argv[i], "-run_slow")) run_slow = true;
+        if (!strcmp(argv[i], "-cpu_time")) {
+            cpu_time = atof(argv[++i]);
+        }
+    }
+	
+	retval = boinc_init();
+    if (retval) {
+        fprintf(stderr, "%s boinc_init returned %d\n",
+				boinc_msg_prefix(buf), retval
+				);
+        exit(retval);
+    }
+    
+    // open the input file (resolve logical name first)
+    //
+    boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
+    infile = boinc_fopen(input_path, "r");
+    if (!infile) {
+        fprintf(stderr,
+				"%s Couldn't find input file, resolved name %s.\n",
+				boinc_msg_prefix(buf), input_path
+				);
+        getchar();
+        exit(-1);
+    }
+    
+    boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
+    
+    // See if there's a valid checkpoint file.
+    // If so retrieve the current matrix and inversion number
+    //
+    boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
+    state = boinc_fopen(chkpt_path, "r");
+    if (state) {
+		printf("Checkpoint file is detected. Read from checkpoint file ... \n");
+		checkpointExists=fscanf(state, "%d", &lastInversion); 
+		if (checkpointExists == 1) {
+			printf("Last inversion # is : %d\n",lastInversion);	
+			fscanf(state,"%d",&dimension);
+			cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
+			for (int i=0;i<dimension*dimension;++i) {
+				fscanf(state, "%f", &h_idata[i]);
+				//printf("--%f\n",h_idata[i]);
+			}
+		}
+        fclose(state);
+    } else {
+		printf("There's no valid checkpoint file!\n");
+	}
+    
+	retval = out.open(output_path, "wb");
+    
+    if (retval) {
+        fprintf(stderr, "%s APP: matrix_inversion output open failed:\n",
+				boinc_msg_prefix(buf)
+				);
+        fprintf(stderr, "%s resolved name %s, retval %d\n",
+				boinc_msg_prefix(buf), output_path, retval
+				);
+        perror("open");
+        exit(1);
+    }
+	
+#ifdef APP_GRAPHICS
+    // create shared mem segment for graphics, and arrange to update it
+    //
+    shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
+    if (!shmem) {
+        fprintf(stderr, "%s failed to create shared mem segment\n",
+				boinc_msg_prefix(buf)
+				);
+    }
+    update_shmem();
+    boinc_register_timer_callback(update_shmem);
+#endif
+	
+    if (checkpointExists != 1) {
+		dimension=getMatrixDimension(infile);
+		printf("Matrix dimension: %d\n",dimension);
+		cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
+		fetchElementsIntoHostMemory(infile,h_idata);
+		out.printf("\n----------------- Before being inversed ----------------\n\n");
+		printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",NUM_ITERATIONS);
+	} else {
+		out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",lastInversion);
+		printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",NUM_ITERATIONS-lastInversion,lastInversion+1);
+	}
+	printToFile(&out,h_idata,dimension);
+	
+	
+	
+	for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
+		invert(h_idata,dimension);
+		
+		printf("Finish inversion #%d\n",i);
+		
+		if (run_slow) {
+            boinc_sleep(1.);
+        }
+		
+        if (early_exit && i>30) {
+            exit(-10);
+        }
+		
+        if (early_crash && i>30) {
+            boinc_crash();
+        }
+        if (early_sleep && i>30) {
+            g_sleep = true;
+            while (1) boinc_sleep(1);
+        }
+		
+		if (boinc_time_to_checkpoint()) {
+			//if (i==7) {
+			printf("Perform checkpointing at inversion # %d\n",i);
+			
+			//we'll need to write the current matrix to the state file.
+			retval = do_checkpoint(out, i, h_idata, dimension); 
+            if (retval) {
+                fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n",
+						boinc_msg_prefix(buf), retval
+						);
+                exit(retval);
+            }
+            boinc_checkpoint_completed();
+        }
+		
+        fd = i/NUM_ITERATIONS;
+        if (cpu_time) fd /= 2;
+        boinc_fraction_done(fd);
+		
+	}
+    
+    out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS);
+	printToFile(&out,h_idata,dimension);
+	
+    cudaFreeHost( h_idata );
+    retval = out.flush(); //force the output file to be closed.
+    if (retval) {
+        fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n",
+				boinc_msg_prefix(buf), retval
+				);
+        exit(1);
+    }
+    
+    // burn up some CPU time if needed
+    //
+    if (cpu_time) {
+        double start = dtime();
+        for (int i=0; ; i++) {
+            double e = dtime()-start;
+            if (e > cpu_time) break;
+            fd = .5 + .5*(e/cpu_time);
+            boinc_fraction_done(fd);
+			
+            if (boinc_time_to_checkpoint()) {
+                retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension);
+                if (retval) {
+                    fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n",
+							boinc_msg_prefix(buf), retval
+							);
+                    exit(1);
+                }
+                boinc_checkpoint_completed();
+            }
+            comp_result = do_a_giga_flop(i);
+        }
+    }
+    
+    boinc_fraction_done(1);
+#ifdef APP_GRAPHICS
+    update_shmem();
+#endif
+	
+	printf("Done!");
+    getchar();
+    boinc_finish(0);
+}
+
+#ifdef _WIN32
+int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
+    LPSTR command_line;
+    char* argv[100];
+    int argc;
+	
+    command_line = GetCommandLine();
+    argc = parse_command_line( command_line, argv );
+    return main(argc, argv);
+}
+#endif
+
+void print(REAL *h_idata, int n) {
+	int j=0;
+	for (int i=0;i<n*n;++i) {
+		printf("%f    ",h_idata[i]);
+		if (j+1==n) {
+			printf("\n");
+			j=0;
+		} else {
+			++j;
+		}
+	}
+}
+
+// nxn matrix
+void generateRandomInputFile(int n) {
+	FILE *infile;
+	
+	infile=fopen(INPUT_FILENAME,"w");
+	REAL *h_idata = new REAL[n*n];
+	srand(n);
+    for( int i = 0; i < n; i++ ) {
+    	for (int j = 0; j < n; j++) {
+    		h_idata[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
+    	}
+		h_idata[i*n+i] += sqrt((float)n);
+    }
+    int j=0;
+	for (int i=0;i<n*n;++i) {
+		fprintf(infile,"%15f",h_idata[i]);
+		if (j+1==n) {
+			fprintf(infile,"\n");
+			j=0;
+		} else {
+			++j;
+		}
+	}
+    fclose(infile);
+}
+
+int getMatrixDimension(FILE *infile) {
+	int w=0;
+	char c;
+	
+	fseek(infile,0,SEEK_SET);
+	while (true) {
+		do {
+			c=fgetc(infile);
+			if (c == EOF || c == '\n') {
+				goto exitLoop;
+			}
+		} while (isspace(c));
+		
+		if (isdigit(c) || c=='.' || c=='-') {
+			++w;
+		}
+		
+		do {
+			c=fgetc(infile);
+			if (c == EOF || c == '\n') {
+				goto exitLoop;
+			}
+		} while (isdigit(c) || c=='.' || c=='-');
+		
+		if (c==EOF || c == '\n') {
+			break;
+		}
+	}
+exitLoop:
+	return w;
+}
+
+void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata) {
+	float num=0;
+	int i=0;
+	fseek(infile,0,SEEK_SET);
+	while (fscanf(infile,"%f",&num)==1) {
+		h_idata[i]=num;
+		++i;
+	}
+}
+
+void printToFile(MFILE *out, float *h_odata, int dimension) {
+	int count=0;
+    int move=0;
+    int num_elements=dimension*dimension;
+    while (num_elements>0) {
+		out->printf("%15f    ",h_odata[move]);
+		++count;
+		++move;
+		if (count==dimension) {
+			out->printf("\n");
+			count=0;
+		}
+		--num_elements;
+	}
+}
--- a/samples/nvcuda/readme.txt
+++ b/samples/nvcuda/readme.txt
@ -6,11 +6,21 @@ tuanle86@berkeley.edu
 ----------------------- Compiler ----------------------------

 You will need to install gcc 4.3 and g++ 4.3. It appears that Cuda 3.0 SDK has not yet worked
-with gcc 4.4 and g++ 4.4
+with gcc 4.4 and g++ 4.4. Please follow these steps:
+
+sudo aptitude install gcc-4.3 g++-4.3
+cd ~/NVIDIA_GPU_Computing_SDK/C (go to SDK source dir. You should find the right path on your machine)
+mkdir mygcc
+cd mygcc
+ln -s $(which g++-4.3) g++
+ln -s $(which gcc-4.3) gcc
+
+Note: mygcc directory must be created since I have made symlinks to the 4.3 compiler to mygcc directory
+in my current Makefile.

 ----------------------- Linux Makefile ----------------------

-Makefile needs to be edited on your machine before running. Please follow these steps:
+common.mk file needs to be edited on your machine before running. Please follow these steps:

 1) Open "boinc/samples/nvcuda/common.mk" with gedit
 2) Ctrl+f and search for "tuanle". You will find the following:
--- a/samples/nvcuda/readme_mac.txt
+++ b/samples/nvcuda/readme_mac.txt
@ -0,0 +1,51 @@
+Tuan Le
+University of California, Berkeley
+Berkeley Space Sciences Lab
+tuanle86@berkeley.edu
+
+----------------------- Mac OS Makefile ----------------------
+By default, the NVIDIA Cuda SDK for MacOS is installed at $ROOT/Developer/ under the directory name "GPU Computing".
+If it is installed somewhere else, then common_mac.mk file needs to be edited by the following steps:
+
+1) Open "boinc/samples/nvcuda/common_mac.mk"
+2) Look for "ROOTDIR  ?= /DeveloperGPU\ Computing
+3) Replace this path by appropriate path on your machine.
+4) Done!
+
+----------------------- Mac Troubleshooting ----------------
+
+Error: ./example_app_nvcuda_mac: library not found for -lcutil_i386. 
+Solution: run make from "CPU Computing" directory to build the cutil_i386 library.
+
+Note: It appears that nvcc compiler has trouble compiling .cu file that contains both BOINC and 
+      Cuda code. Any attempt to compile such .cu files will result in errors like the following:
+
+	/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(55): error: identifier "__builtin_ia32_emms" is undefined
+
+	/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(68): error: identifier "__builtin_ia32_vec_init_v2si" is undefined
+
+	/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(111): error: identifier "__builtin_ia32_vec_ext_v2si" is undefined
+
+	/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(150): error: identifier "__builtin_ia32_packsswb" is undefined
+
+	/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(165): error: identifier "__builtin_ia32_packssdw" is undefined
+      
+        ...........
+      
+      The solution that I came up with is to compile BOINC and Cuda seperately. Put BOINC code in .c file and CUDA code in .cu file.
+      Then, write an external function in .cu file that will make kernel calls, do computation and return a result that could be used
+      in .c file.     
+
+----------------------- Run Executable file ------------------
+
+"make -f Makefile_mac" command will create an executable file in "boinc/samples/nvcuda/darwin/release/".
+If your machine doesn't have CUDA-enabled GPU, then the executable file for this sample app in 
+the release directory will stop execution after some output statements on the terminal. In this case,
+it's best to run in emurelease mode. To generate an executable file in emurelease mode, type 
+"make -f Makefile_mac emu=1". The executable file is then created in "boinc/samples/nvcuda/darwin/emurelease/".
+Note that NVIDIA Cuda SDK version older than version 3.0 will no longer support emulation mode.
+
+Also, you need to define some environment variables before running the executable file.
+
+export PATH=/usr/local/cuda/bin:$PATH
+export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH