diff --git a/samples/nvcuda/Makefile_mac b/samples/nvcuda/Makefile_mac new file mode 100644 index 0000000000..272e457c95 --- /dev/null +++ b/samples/nvcuda/Makefile_mac @@ -0,0 +1,16 @@ +# Tuan Le +# University of California, Berkeley +# Berkeley Space Sciences Lab +# tuanle86@berkeley.ed + +# Add source files here +EXECUTABLE := example_app_nvcuda_mac +# Cuda source files (compiled with cudacc) +CUFILES := cuda_kernel_mac.cu +# C/C++ source files (compiled with gcc / c++) +CCFILES := cuda_mac.c \ + +################################################################################ +# Rules and targets + +include common_mac.mk \ No newline at end of file diff --git a/samples/nvcuda/common_mac.mk b/samples/nvcuda/common_mac.mk new file mode 100644 index 0000000000..0ea1ead834 --- /dev/null +++ b/samples/nvcuda/common_mac.mk @@ -0,0 +1,479 @@ +# Tuan Le +# University of California, Berkeley +# Berkeley Space Sciences Lab +# tuanle86@berkeley.edu + +.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx + +# Add new SM Versions here as devices with new Compute Capability are released +SM_VERSIONS := 10 11 12 13 20 + +CUDA_INSTALL_PATH ?= /usr/local/cuda + +ifdef cuda-install + CUDA_INSTALL_PATH := $(cuda-install) +endif + +# detect OS +OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) +OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) + +# 'linux' is output for Linux system, 'darwin' for OS X +DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) +ifneq ($(DARWIN),) + SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "10\.6" /System/Library/CoreServices/SystemVersion.plist))) +endif + +# detect 32-bit or 64-bit platform +HP_64 = $(shell uname -m | grep 64) +OSARCH= $(shell uname -m) + +# Basic directory setup for SDK +# (override directories only if they are not already defined) +SRCDIR ?= +ROOTDIR ?= /Developer/GPU\ Computing +ROOTBINDIR ?= ../../samples/nvcuda +BINDIR ?= $(ROOTBINDIR)/$(OSLOWER) +ROOTOBJDIR ?= obj + +# BOINC directory +BOINC_DIR = ../.. +BOINC_API_DIR = $(BOINC_DIR)/api +BOINC_LIB_DIR = $(BOINC_DIR)/lib +BOINC_BUILD_DIR = $(BOINC_DIR)/mac_build/build/Deployment +BOINC_CONFIG_DIR = $(BOINC_DIR)/clientgui/mac +FRAMEWORKS_DIR = /System/Library/Frameworks + +LIBDIR := $(ROOTDIR)/C/lib +COMMONDIR := $(ROOTDIR)/C/common +SHAREDDIR := $(ROOTDIR)/shared/ + +# Includes +INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc -I$(BOINC_CONFIG_DIR) -I$(BOINC_DIR) -I$(BOINC_LIB_DIR) -I$(BOINC_API_DIR) + +# Compilers +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CXX := g++ +CC := gcc +#Tuan edits LINK to have #(INCLUDES) +LINK := g++ -fPIC $(INCLUDES) + +# Warning flags +CXXWARN_FLAGS := \ + -W -Wall \ + -Wimplicit \ + -Wswitch \ + -Wformat \ + -Wchar-subscripts \ + -Wparentheses \ + -Wmultichar \ + -Wtrigraphs \ + -Wpointer-arith \ + -Wcast-align \ + -Wreturn-type \ + -Wno-unused-function \ + $(SPACE) + +CWARN_FLAGS := $(CXXWARN_FLAGS) \ + -Wstrict-prototypes \ + -Wmissing-prototypes \ + -Wmissing-declarations \ + -Wnested-externs \ + -Wmain \ + +# architecture flag for nvcc and gcc compilers build +CUBIN_ARCH_FLAG := +CXX_ARCH_FLAGS := +NVCCFLAGS := +LIB_ARCH := $(OSARCH) + +# Determining the necessary Cross-Compilation Flags +# 32-bit OS, but we target 64-bit cross compilation +ifeq ($(x86_64),1) + NVCCFLAGS += -m64 + LIB_ARCH = x86_64 + CUDPPLIB_SUFFIX = x86_64 + ifneq ($(DARWIN),) + CXX_ARCH_FLAGS += -arch x86_64 + else + CXX_ARCH_FLAGS += -m64 + endif +else +# 64-bit OS, and we target 32-bit cross compilation + ifeq ($(i386),1) + NVCCFLAGS += -m32 + LIB_ARCH = i386 + CUDPPLIB_SUFFIX = i386 + ifneq ($(DARWIN),) + CXX_ARCH_FLAGS += -arch i386 + else + CXX_ARCH_FLAGS += -m32 + endif + else + ifeq "$(strip $(HP_64))" "" + LIB_ARCH = i386 + CUDPPLIB_SUFFIX = i386 + NVCCFLAGS += -m32 + ifneq ($(DARWIN),) + CXX_ARCH_FLAGS += -arch i386 + else + CXX_ARCH_FLAGS += -m32 + endif + else + LIB_ARCH = x86_64 + CUDPPLIB_SUFFIX = x86_64 + NVCCFLAGS += -m64 + ifneq ($(DARWIN),) + CXX_ARCH_FLAGS += -arch x86_64 + else + CXX_ARCH_FLAGS += -m64 + endif + endif + endif +endif + + +ifeq ($(noinline),1) + NVCCFLAGS += -Xopencc -noinline + # Compiler-specific flags, when using noinline, we don't build for SM1x + GENCODE_SM10 := + GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\" +else + # Compiler-specific flags (by default, we always use sm_10 and sm_20), unless we use the SMVERSION template + GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\" + GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\" +endif + +CXXFLAGS += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS) +CFLAGS += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS) +LINKFLAGS += +LINK += $(LINKFLAGS) $(CXX_ARCH_FLAGS) + +# This option for Mac allows CUDA applications to work without requiring to set DYLD_LIBRARY_PATH +ifneq ($(DARWIN),) + LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib +endif + +# Common flags +COMMONFLAGS += $(INCLUDES) -DUNIX + +# Debug/release configuration +ifeq ($(dbg),1) + COMMONFLAGS += -g + NVCCFLAGS += -D_DEBUG + CXXFLAGS += -D_DEBUG + CFLAGS += -D_DEBUG + BINSUBDIR := debug + LIBSUFFIX := D +else + COMMONFLAGS += -O2 + BINSUBDIR := release + LIBSUFFIX := + NVCCFLAGS += --compiler-options -fno-strict-aliasing + CXXFLAGS += -fno-strict-aliasing + CFLAGS += -fno-strict-aliasing +endif + +# architecture flag for cubin build +CUBIN_ARCH_FLAG := + +# OpenGL is used or not (if it is used, then it is necessary to include GLEW) +ifeq ($(USEGLLIB),1) + ifneq ($(DARWIN),) + OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries + OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a + else +# this case for linux platforms + OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu +# check if x86_64 flag has been set, otherwise, check HP_64 is i386/x86_64 + ifeq ($(x86_64),1) + OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 + else + ifeq ($(i386),) + ifeq "$(strip $(HP_64))" "" + OPENGLLIB += -lGLEW -L/usr/X11R6/lib + else + OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 + endif + endif + endif +# check if i386 flag has been set, otehrwise check HP_64 is i386/x86_64 + ifeq ($(i386),1) + OPENGLLIB += -lGLEW -L/usr/X11R6/lib + else + ifeq ($(x86_64),) + ifeq "$(strip $(HP_64))" "" + OPENGLLIB += -lGLEW -L/usr/X11R6/lib + else + OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 + endif + endif + endif + endif +endif + +ifeq ($(USEGLUT),1) + ifneq ($(DARWIN),) + OPENGLLIB += -framework GLUT + else + ifeq ($(x86_64),1) + OPENGLLIB += -lglut -L/usr/lib64 + endif + ifeq ($(i386),1) + OPENGLLIB += -lglut -L/usr/lib + endif + + ifeq ($(x86_64),) + ifeq ($(i386),) + OPENGLLIB += -lglut + endif + endif + endif +endif + +ifeq ($(USEPARAMGL),1) + PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX) +endif + +ifeq ($(USERENDERCHECKGL),1) + RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX) +endif + +ifeq ($(USECUDPP), 1) + CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)$(LIBSUFFIX) + + ifeq ($(emu), 1) + CUDPPLIB := $(CUDPPLIB)_emu + endif +endif + +ifeq ($(USENVCUVID), 1) + ifneq ($(DARWIN),) + NVCUVIDLIB := -L../../common/lib/darwin -lnvcuvid + endif +endif + +# Libs +ifneq ($(DARWIN),) + LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB) +else + ifeq "$(strip $(HP_64))" "" + ifeq ($(x86_64),1) + LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + else + LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + endif + else + ifeq ($(i386),1) + LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + else + LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + endif + endif +endif + +# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB +ifeq ($(USECUDADYNLIB),1) + LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic +else +# static linking, we will statically link against CUDA and CUDART + ifeq ($(USEDRVAPI),1) + LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} + else + ifeq ($(emu),1) + LIB += -lcudartemu + else + LIB += -lcudart + endif + LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} + endif +endif + +ifeq ($(USECUFFT),1) + ifeq ($(emu),1) + LIB += -lcufftemu + else + LIB += -lcufft + endif +endif + +ifeq ($(USECUBLAS),1) + ifeq ($(emu),1) + LIB += -lcublasemu + else + LIB += -lcublas + endif +endif + +# Lib/exe configuration +ifneq ($(STATIC_LIB),) + TARGETDIR := $(LIBDIR) + TARGET := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB)) + LINKLINE = ar rucv $(TARGET) $(OBJS) +else + ifneq ($(OMIT_CUTIL_LIB),1) + +#TUAN add - lboinc and -lboinc_api below + LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX) -lboinc_api -L$(BOINC_BUILD_DIR) -lboinc -L$(BOINC_BUILD_DIR) + endif + # Device emulation configuration + ifeq ($(emu), 1) + NVCCFLAGS += -deviceemu + CUDACCFLAGS += + BINSUBDIR := emu$(BINSUBDIR) + # consistency, makes developing easier + CXXFLAGS += -D__DEVICE_EMULATION__ + CFLAGS += -D__DEVICE_EMULATION__ + endif + TARGETDIR := $(BINDIR)/$(BINSUBDIR) + TARGET := $(TARGETDIR)/$(EXECUTABLE) + LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB) +endif + +# check if verbose +ifeq ($(verbose), 1) + VERBOSE := +else + VERBOSE := @ +endif + +################################################################################ +# Check for input flags and set compiler flags appropriately +################################################################################ +ifeq ($(fastmath), 1) + NVCCFLAGS += -use_fast_math +endif + +ifeq ($(nvccverbose), 1) + NVCCFLAGS += -v +endif + +ifeq ($(keep), 1) + NVCCFLAGS += -keep + NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx +endif + +ifdef maxregisters + NVCCFLAGS += -maxrregcount $(maxregisters) +endif + +# Add cudacc flags +NVCCFLAGS += $(CUDACCFLAGS) + +# Add common flags +NVCCFLAGS += $(COMMONFLAGS) +CXXFLAGS += $(COMMONFLAGS) +CFLAGS += $(COMMONFLAGS) + +ifeq ($(nvcc_warn_verbose),1) + NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) + NVCCFLAGS += --compiler-options -fno-strict-aliasing +endif + +################################################################################ +# Set up object files +################################################################################ +OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR) +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES))) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES))) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES))) + +################################################################################ +# Set up cubin output files +################################################################################ +CUBINDIR := $(SRCDIR)data +CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES))) + +################################################################################ +# Set up PTX output files +################################################################################ +PTXDIR := $(SRCDIR)data +PTXBINS += $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES))) + +################################################################################ +# Rules +################################################################################ +$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS) + $(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $< + +$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS) + $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< + +# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile +$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS) + $(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $< + +# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile +$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory + $(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $< + +$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory + $(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $< + +# +# The following definition is a template that gets instantiated for each SM +# version (sm_10, sm_13, etc.) stored in SMVERSIONS. It does 2 things: +# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX. +# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding +# .cu file. +# +# The intended use for this is to allow Makefiles that use common.mk to compile +# files to different Compute Capability targets (aka SM arch version). To do +# so, in the Makefile, list files for each SM arch separately, like so: +# This will be used over the default rule abov +# +# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu +# CUFILES_sm_12 := anothercudakernel_sm12.cu +# +define SMVERSION_template +#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1)))) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1)))) +$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS) +# $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) $(1) + # if we have noinline enabled, we only turn this enable this for SM 2.x architectures + ifeq ($(noinline),1) + $(VERBOSE)$(NVCC) $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS) + else + $(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS) + endif +endef + +# This line invokes the above template for each arch version stored in +# SM_VERSIONS. The call funtion invokes the template, and the eval +# function interprets it as make commands. +$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver)))) + +###### TUAN This below line causes mmintrin.h and xmmintrin.h error. + +$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile_mac + $(VERBOSE)$(LINKLINE) + +cubindirectory: + $(VERBOSE)mkdir -p $(CUBINDIR) + +ptxdirectory: + $(VERBOSE)mkdir -p $(PTXDIR) + +makedirectories: + $(VERBOSE)mkdir -p $(LIBDIR) + $(VERBOSE)mkdir -p $(OBJDIR) + $(VERBOSE)mkdir -p $(TARGETDIR) + + +tidy : + $(VERBOSE)find . | egrep "#" | xargs rm -f + $(VERBOSE)find . | egrep "\~" | xargs rm -f + +clean : tidy +#TUAN $(VERBOSE)rm -f $(OBJS) + $(VERBOSE)rm -f $(CUBINS) + $(VERBOSE)rm -f $(PTXBINS) + $(VERBOSE)rm -f $(TARGET) + $(VERBOSE)rm -f $(NVCC_KEEP_CLEAN) + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp + +clobber : clean + $(VERBOSE)rm -rf $(ROOTOBJDIR) \ No newline at end of file diff --git a/samples/nvcuda/cuda_kernel_mac.cu b/samples/nvcuda/cuda_kernel_mac.cu new file mode 100644 index 0000000000..0bad040f64 --- /dev/null +++ b/samples/nvcuda/cuda_kernel_mac.cu @@ -0,0 +1,122 @@ +/* + * Tuan Le + * University of California, Berkeley + * Berkeley Space Sciences Lab + * tuanle86@berkeley.edu + */ + +// When VERIFY is defined, the sum of squared errors is calculated between the +// identity matrix and the product A * incerse(A). For debugging... +//#define VERIFY 1 + +#include +#include +#include +#include "cuda_config.h" + +__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k>i && k < n2 && AI[i*lda2+k]!=0) { + REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i]; + int n = n2 / 2; + for (int j = i+1; j < n; j++) { + AI[j*lda2+k] += multiplyer*AI[j*lda2+i]; + } + } +} + +__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k < n2) { + AI[i*lda2+k] /= diag; + } +} + +__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k > i && k < n2) { + REAL multiplyer = -AI[i*lda2+k]; + for (int j = 0; j < i; j++) { + AI[j*lda2+k] += multiplyer*AI[j*lda2+i]; + } + } +} + +//extern void invert(REAL * A, int n); +void invertge(REAL * AI_d, int lda, int n) { + int lda2 = lda * 2; + // perform elementary row operations till A in AI becomes identity matrix + for (int i = 0; i < n; i++) { + GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2); + CUDACHECK; + cudaThreadSynchronize(); + } + + for (int i = n-1; i >= 0; i--) { + REAL diag = 1.0; + SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost)); + GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2); + CUDACHECK; + + GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2); + CUDACHECK; + cudaThreadSynchronize(); + CUDACHECK; + } +} // invertge + + +/* inverts nxn matrix A and stores result back in A */ +extern void invert(REAL * A, int n) { + fprintf(stderr,"starting inversion n = %d ", n); + volatile clock_t gputime, gputime0; + gputime=clock(); + gputime0 = gputime; + + int lda = ((n+15)&~15|16); + //lda=n; + REAL * AI = (REAL *)malloc(sizeof(REAL)*(n*lda*2)); + memset(AI,0,sizeof(REAL)*n*lda*2); + for (int i = 0; i < n; i++) { + memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n); + AI[lda*i*2+n+i] = 1; + } + + REAL * AI_d; + SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2)); + SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice)); + + invertge(AI_d, lda, n); + SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost)); + cudaFree(AI_d); + + + gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f); + fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime); +#ifdef VERIFY + // let's verify that + REAL error=0.0; + + // multiply inverse*xcopy, should be Identity matrix + for (int k = 0; k < n; k++) { + for (int j = 0; j < n; j++) { + REAL sum = 0; + for (int i = 0; i < n; i++) { + sum += AI[j*lda*2+n+i]*A[i*n+k]; + } + if (j!=k) { + error += sum * sum; + } else { + error += (1.0-sum) * (1.0-sum); + } + } + } + fprintf(stderr, " %6.2f SSE", error); +#endif + + for (int i = 0; i < n; i++) { + memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n); + } + free(AI); + fprintf(stderr," done!\n"); +} // invert diff --git a/samples/nvcuda/cuda_mac.c b/samples/nvcuda/cuda_mac.c new file mode 100644 index 0000000000..540bea680a --- /dev/null +++ b/samples/nvcuda/cuda_mac.c @@ -0,0 +1,440 @@ +/* + * Tuan Le + * University of California, Berkeley + * Berkeley Space Sciences Lab + * tuanle86@berkeley.edu + */ + +#ifdef _WIN32 +#include "boinc_win.h" +#else +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include "cuda_config.h" + +#include "str_util.h" +#include "util.h" +#include "filesys.h" +#include "boinc_api.h" +#include "mfile.h" +#include "graphics2.h" + +struct UC_SHMEM { + double update_time; + double fraction_done; + double cpu_time; + BOINC_STATUS status; + int countdown; + // graphics app sets this to 5 repeatedly, + // main program decrements it once/sec. + // If it's zero, don't bother updating shmem +}; + +#ifdef APP_GRAPHICS +#include "uc2.h" +UC_SHMEM* shmem; +#endif + +using std::string; + +#define CHECKPOINT_FILE "matrix_inversion_state" +#define INPUT_FILENAME "input" +#define OUTPUT_FILENAME "output" +#define MATRIX_SIZE 10 + + +// execute the kernel NUM_ITERATIONS times +#define NUM_ITERATIONS 51 + +bool run_slow = false; +bool early_exit = false; +bool early_crash = false; +bool early_sleep = false; +double cpu_time = 20, comp_result; + +// do a billion floating-point ops +// (note: I needed to add an arg to this; +// otherwise the MS C++ compiler optimizes away +// all but the first call to it!) +// +static double do_a_giga_flop(int foo) { + double x = 3.14159*foo; + int i; + for (i=0; i<500000000; i++) { + x += 5.12313123; + x *= 0.5398394834; + } + return x; +} + +int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) { + int retval; + string resolved_name; + + FILE* f = fopen("temp", "w"); + if (!f) return 1; + fprintf(f, "%d", n); //write inversion number + fprintf(f, " "); + fprintf(f, "%d", dimension); //write dimension + fprintf(f, " "); + for (int i=0;iupdate_time = dtime(); + + // Check whether a graphics app is running, + // and don't bother updating shmem if so. + // This doesn't matter here, + // but may be worth doing if updating shmem is expensive. + // + if (shmem->countdown > 0) { + // the graphics app sets this to 5 every time it renders a frame + shmem->countdown--; + } else { + return; + } + shmem->fraction_done = boinc_get_fraction_done(); + shmem->cpu_time = boinc_worker_thread_cpu_time();; + boinc_get_status(&shmem->status); +} +#endif + +void print(REAL *A, int n); //on commandline +void generateRandomInputFile(int n); +int getMatrixDimension(FILE *infile); +int countElementsInMatrix(FILE *infile); +void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata); +void printToFile(MFILE *out, float *h_odata, int dimension); +extern void invert(REAL * A, int n); + +int main(int argc, char** argv) +{ + int i, retval, lastInversion=0, checkpointExists=0, dimension=0; + double fd; + char input_path[512], output_path[512], chkpt_path[512], buf[256]; + REAL* h_idata; + MFILE out; + FILE* state, *infile; + + generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually + + for (i=0; i30) { + exit(-10); + } + + if (early_crash && i>30) { + boinc_crash(); + } + if (early_sleep && i>30) { + g_sleep = true; + while (1) boinc_sleep(1); + } + + if (boinc_time_to_checkpoint()) { + //if (i==7) { + printf("Perform checkpointing at inversion # %d\n",i); + + //we'll need to write the current matrix to the state file. + retval = do_checkpoint(out, i, h_idata, dimension); + if (retval) { + fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n", + boinc_msg_prefix(buf), retval + ); + exit(retval); + } + boinc_checkpoint_completed(); + } + + fd = i/NUM_ITERATIONS; + if (cpu_time) fd /= 2; + boinc_fraction_done(fd); + + } + + out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS); + printToFile(&out,h_idata,dimension); + + cudaFreeHost( h_idata ); + retval = out.flush(); //force the output file to be closed. + if (retval) { + fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n", + boinc_msg_prefix(buf), retval + ); + exit(1); + } + + // burn up some CPU time if needed + // + if (cpu_time) { + double start = dtime(); + for (int i=0; ; i++) { + double e = dtime()-start; + if (e > cpu_time) break; + fd = .5 + .5*(e/cpu_time); + boinc_fraction_done(fd); + + if (boinc_time_to_checkpoint()) { + retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension); + if (retval) { + fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n", + boinc_msg_prefix(buf), retval + ); + exit(1); + } + boinc_checkpoint_completed(); + } + comp_result = do_a_giga_flop(i); + } + } + + boinc_fraction_done(1); +#ifdef APP_GRAPHICS + update_shmem(); +#endif + + printf("Done!"); + getchar(); + boinc_finish(0); +} + +#ifdef _WIN32 +int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) { + LPSTR command_line; + char* argv[100]; + int argc; + + command_line = GetCommandLine(); + argc = parse_command_line( command_line, argv ); + return main(argc, argv); +} +#endif + +void print(REAL *h_idata, int n) { + int j=0; + for (int i=0;i0) { + out->printf("%15f ",h_odata[move]); + ++count; + ++move; + if (count==dimension) { + out->printf("\n"); + count=0; + } + --num_elements; + } +} diff --git a/samples/nvcuda/readme.txt b/samples/nvcuda/readme.txt index 5334bedc8e..323cfcbde7 100644 --- a/samples/nvcuda/readme.txt +++ b/samples/nvcuda/readme.txt @@ -6,11 +6,21 @@ tuanle86@berkeley.edu ----------------------- Compiler ---------------------------- You will need to install gcc 4.3 and g++ 4.3. It appears that Cuda 3.0 SDK has not yet worked -with gcc 4.4 and g++ 4.4 +with gcc 4.4 and g++ 4.4. Please follow these steps: + +sudo aptitude install gcc-4.3 g++-4.3 +cd ~/NVIDIA_GPU_Computing_SDK/C (go to SDK source dir. You should find the right path on your machine) +mkdir mygcc +cd mygcc +ln -s $(which g++-4.3) g++ +ln -s $(which gcc-4.3) gcc + +Note: mygcc directory must be created since I have made symlinks to the 4.3 compiler to mygcc directory +in my current Makefile. ----------------------- Linux Makefile ---------------------- -Makefile needs to be edited on your machine before running. Please follow these steps: +common.mk file needs to be edited on your machine before running. Please follow these steps: 1) Open "boinc/samples/nvcuda/common.mk" with gedit 2) Ctrl+f and search for "tuanle". You will find the following: diff --git a/samples/nvcuda/readme_mac.txt b/samples/nvcuda/readme_mac.txt new file mode 100644 index 0000000000..905a96e710 --- /dev/null +++ b/samples/nvcuda/readme_mac.txt @@ -0,0 +1,51 @@ +Tuan Le +University of California, Berkeley +Berkeley Space Sciences Lab +tuanle86@berkeley.edu + +----------------------- Mac OS Makefile ---------------------- +By default, the NVIDIA Cuda SDK for MacOS is installed at $ROOT/Developer/ under the directory name "GPU Computing". +If it is installed somewhere else, then common_mac.mk file needs to be edited by the following steps: + +1) Open "boinc/samples/nvcuda/common_mac.mk" +2) Look for "ROOTDIR ?= /DeveloperGPU\ Computing +3) Replace this path by appropriate path on your machine. +4) Done! + +----------------------- Mac Troubleshooting ---------------- + +Error: ./example_app_nvcuda_mac: library not found for -lcutil_i386. +Solution: run make from "CPU Computing" directory to build the cutil_i386 library. + +Note: It appears that nvcc compiler has trouble compiling .cu file that contains both BOINC and + Cuda code. Any attempt to compile such .cu files will result in errors like the following: + + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(55): error: identifier "__builtin_ia32_emms" is undefined + + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(68): error: identifier "__builtin_ia32_vec_init_v2si" is undefined + + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(111): error: identifier "__builtin_ia32_vec_ext_v2si" is undefined + + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(150): error: identifier "__builtin_ia32_packsswb" is undefined + + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(165): error: identifier "__builtin_ia32_packssdw" is undefined + + ........... + + The solution that I came up with is to compile BOINC and Cuda seperately. Put BOINC code in .c file and CUDA code in .cu file. + Then, write an external function in .cu file that will make kernel calls, do computation and return a result that could be used + in .c file. + +----------------------- Run Executable file ------------------ + +"make -f Makefile_mac" command will create an executable file in "boinc/samples/nvcuda/darwin/release/". +If your machine doesn't have CUDA-enabled GPU, then the executable file for this sample app in +the release directory will stop execution after some output statements on the terminal. In this case, +it's best to run in emurelease mode. To generate an executable file in emurelease mode, type +"make -f Makefile_mac emu=1". The executable file is then created in "boinc/samples/nvcuda/darwin/emurelease/". +Note that NVIDIA Cuda SDK version older than version 3.0 will no longer support emulation mode. + +Also, you need to define some environment variables before running the executable file. + +export PATH=/usr/local/cuda/bin:$PATH +export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH \ No newline at end of file