diff --git a/samples/nvcuda/Makefile b/samples/nvcuda/Makefile new file mode 100644 index 0000000000..46065927ae --- /dev/null +++ b/samples/nvcuda/Makefile @@ -0,0 +1,16 @@ +# Tuan Le +# University of California, Berkeley +# Berkeley Space Sciences Lab +# tuanle86@berkeley.edu + +# Add source files here +EXECUTABLE := example_app_nvcuda +# Cuda source files (compiled with cudacc) +CUFILES := cuda.cu +# C/C++ source files (compiled with gcc / c++) +CCFILES := \ + +################################################################################ +# Rules and targets + +include common.mk diff --git a/samples/nvcuda/common.mk b/samples/nvcuda/common.mk new file mode 100644 index 0000000000..54c2808399 --- /dev/null +++ b/samples/nvcuda/common.mk @@ -0,0 +1,464 @@ +# Tuan Le +# University of California, Berkeley +# Berkeley Space Sciences Lab +# tuanle86@berkeley.edu + +.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx + +# Add new SM Versions here as devices with new Compute Capability are released +SM_VERSIONS := 10 11 12 13 20 + +CUDA_INSTALL_PATH ?= /usr/local/cuda + +ifdef cuda-install + CUDA_INSTALL_PATH := $(cuda-install) +endif + +# detect OS +OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) +OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) + +# 'linux' is output for Linux system, 'darwin' for OS X +DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) +ifneq ($(DARWIN),) + SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "10\.6" /System/Library/CoreServices/SystemVersion.plist))) +endif + +# detect 32-bit or 64-bit platform +HP_64 = $(shell uname -m | grep 64) +OSARCH= $(shell uname -m) + +# Basic directory setup for SDK +# (override directories only if they are not already defined) +SRCDIR ?= +ROOTDIR ?= .. 
+ +# ROOTDIR/bin is the directory where executable file will be put in + +ROOTBINDIR ?= ../../samples/nvcuda +BINDIR ?= $(ROOTBINDIR)/$(OSLOWER) +ROOTOBJDIR ?= obj + +# BOINC directory +BOINC_DIR = ../.. +BOINC_API_DIR = $(BOINC_DIR)/api +BOINC_LIB_DIR = $(BOINC_DIR)/lib + +# Directory for cutil_i386 as well as other utils while linking (-lcutil_i386) +LIBDIR := /home/tuanle/NVIDIA_GPU_Computing_SDK/C/lib +COMMONDIR := /home/tuanle/NVIDIA_GPU_Computing_SDK/C/common +SHAREDDIR := /home/tuanle/NVIDIA_GPU_Computing_SDK/shared + +# Compilers +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CXX := g++-4.3 +CC := gcc-4.3 +LINK := g++-4.3 -fPIC + +# Includes +INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc -I $(BOINC_API_DIR) -I $(BOINC_LIB_DIR) -I $(BOINC_DIR) + +# Warning flags +CXXWARN_FLAGS := \ + -W -Wall \ + -Wimplicit \ + -Wswitch \ + -Wformat \ + -Wchar-subscripts \ + -Wparentheses \ + -Wmultichar \ + -Wtrigraphs \ + -Wpointer-arith \ + -Wcast-align \ + -Wreturn-type \ + -Wno-unused-function \ + $(SPACE) + +CWARN_FLAGS := $(CXXWARN_FLAGS) \ + -Wstrict-prototypes \ + -Wmissing-prototypes \ + -Wmissing-declarations \ + -Wnested-externs \ + -Wmain \ + +# architecture flag for nvcc and gcc compilers build +CUBIN_ARCH_FLAG := +CXX_ARCH_FLAGS := +NVCCFLAGS := +LIB_ARCH := $(OSARCH) + +# Determining the necessary Cross-Compilation Flags +# 32-bit OS, but we target 64-bit cross compilation +ifeq ($(x86_64),1) + NVCCFLAGS += -m64 + LIB_ARCH = x86_64 + CUDPPLIB_SUFFIX = x86_64 + + ifneq ($(DARWIN),) + CXX_ARCH_FLAGS += -arch x86_64 + else + CXX_ARCH_FLAGS += -m64 + endif +else +# 64-bit OS, and we target 32-bit cross compilation + ifeq ($(i386),1) + NVCCFLAGS += -m32 + LIB_ARCH = i386 + CUDPPLIB_SUFFIX = i386 + + ifneq ($(DARWIN),) + CXX_ARCH_FLAGS += -arch i386 + else + CXX_ARCH_FLAGS += -m32 + endif + else + ifneq ($(SNOWLEOPARD),) + NVCCFLAGS += -m32 + CXX_ARCH_FLAGS += -m32 -arch i386 + LIB_ARCH = i386 + CUDPPLIB_SUFFIX = i386 + else + ifeq 
"$(strip $(HP_64))" "" + LIB_ARCH = i386 + CUDPPLIB_SUFFIX = i386 + else + LIB_ARCH = x86_64 + CUDPPLIB_SUFFIX = x86_64 + endif + endif + endif +endif + +# Compiler-specific flags (by default, we always use sm_10 and sm_20), unless we use the SMVERSION template +GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\" +GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\" + +CXXFLAGS += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS) +CFLAGS += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS) +LINKFLAGS += +LINK += $(LINKFLAGS) $(CXX_ARCH_FLAGS) + +# This option for Mac allows CUDA applications to work without requiring to set DYLD_LIBRARY_PATH +ifneq ($(DARWIN),) + LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib +endif + +# Common flags +COMMONFLAGS += $(INCLUDES) -DUNIX + +# Debug/release configuration +ifeq ($(dbg),1) + COMMONFLAGS += -g + NVCCFLAGS += -D_DEBUG + CXXFLAGS += -D_DEBUG + CFLAGS += -D_DEBUG + BINSUBDIR := debug + LIBSUFFIX := D +else + COMMONFLAGS += -O2 + BINSUBDIR := release + LIBSUFFIX := + NVCCFLAGS += --compiler-options -fno-strict-aliasing + CXXFLAGS += -fno-strict-aliasing + CFLAGS += -fno-strict-aliasing +endif + +# architecture flag for cubin build +CUBIN_ARCH_FLAG := + +# OpenGL is used or not (if it is used, then it is necessary to include GLEW) +ifeq ($(USEGLLIB),1) + ifneq ($(DARWIN),) + OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries + OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a + else +# this case for linux platforms + OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu +# check if x86_64 flag has been set, otherwise, check HP_64 is i386/x86_64 + ifeq ($(x86_64),1) + OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 + else + ifeq ($(i386),) + ifeq "$(strip $(HP_64))" "" + OPENGLLIB += -lGLEW -L/usr/X11R6/lib + else + OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 + endif + endif + endif +# check if i386 flag has been set, otehrwise check HP_64 is i386/x86_64 + ifeq ($(i386),1) + OPENGLLIB += -lGLEW -L/usr/X11R6/lib 
+ else + ifeq ($(x86_64),) + ifeq "$(strip $(HP_64))" "" + OPENGLLIB += -lGLEW -L/usr/X11R6/lib + else + OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64 + endif + endif + endif + endif +endif + +ifeq ($(USEGLUT),1) + ifneq ($(DARWIN),) + OPENGLLIB += -framework GLUT + else + ifeq ($(x86_64),1) + OPENGLLIB += -lglut -L/usr/lib64 + endif + ifeq ($(i386),1) + OPENGLLIB += -lglut -L/usr/lib + endif + + ifeq ($(x86_64),) + ifeq ($(i386),) + OPENGLLIB += -lglut + endif + endif + endif +endif + +ifeq ($(USEPARAMGL),1) + PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX) +endif + +ifeq ($(USERENDERCHECKGL),1) + RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX) +endif + +ifeq ($(USECUDPP), 1) + CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)$(LIBSUFFIX) + + ifeq ($(emu), 1) + CUDPPLIB := $(CUDPPLIB)_emu + endif +endif + +ifeq ($(USENVCUVID), 1) + ifneq ($(DARWIN),) + NVCUVIDLIB := -L/home/tuanle/NVIDIA_GPU_Computing_SDK/C/common/lib/darwin -lnvcuvid + endif +endif + +# Libs +ifneq ($(DARWIN),) + LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB) +else + ifeq "$(strip $(HP_64))" "" + ifeq ($(x86_64),1) + LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + else + LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + endif + else + ifeq ($(i386),1) + LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + else + LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib + endif + endif +endif + +# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB +ifeq ($(USECUDADYNLIB),1) + LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic +else +# static linking, we will statically link against CUDA and CUDART + ifeq ($(USEDRVAPI),1) + LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) 
$(CUDPPLIB) ${LIB} + else + ifeq ($(emu),1) + LIB += -lcudartemu + else + LIB += -lcudart + endif + LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} + endif +endif + +ifeq ($(USECUFFT),1) + ifeq ($(emu),1) + LIB += -lcufftemu + else + LIB += -lcufft + endif +endif + +ifeq ($(USECUBLAS),1) + ifeq ($(emu),1) + LIB += -lcublasemu + else + LIB += -lcublas + endif +endif + +# Lib/exe configuration +ifneq ($(STATIC_LIB),) + TARGETDIR := $(LIBDIR) + TARGET := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB)) + LINKLINE = ar rucv $(TARGET) $(OBJS) +else + ifneq ($(OMIT_CUTIL_LIB),1) + +#TUAN add -lboinc and -lboinc_api below + LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX) -lboinc_api -L$(BOINC_API_DIR) -lboinc -L$(BOINC_LIB_DIR) + endif + # Device emulation configuration + ifeq ($(emu), 1) + NVCCFLAGS += -deviceemu + CUDACCFLAGS += + BINSUBDIR := emu$(BINSUBDIR) + # consistency, makes developing easier + CXXFLAGS += -D__DEVICE_EMULATION__ + CFLAGS += -D__DEVICE_EMULATION__ + endif + TARGETDIR := $(BINDIR)/$(BINSUBDIR) + TARGET := $(TARGETDIR)/$(EXECUTABLE) + LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB) +endif + +# check if verbose +ifeq ($(verbose), 1) + VERBOSE := +else + VERBOSE := @ +endif + +################################################################################ +# Check for input flags and set compiler flags appropriately +################################################################################ +ifeq ($(fastmath), 1) + NVCCFLAGS += -use_fast_math +endif + +ifeq ($(keep), 1) + NVCCFLAGS += -keep + NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx +endif + +ifdef maxregisters + NVCCFLAGS += -maxrregcount $(maxregisters) +endif + +# Add cudacc flags +NVCCFLAGS += $(CUDACCFLAGS) + +# Add common flags +NVCCFLAGS += $(COMMONFLAGS) +CXXFLAGS += $(COMMONFLAGS) +CFLAGS += $(COMMONFLAGS) + +# use gcc-4.3 +NVCCFLAGS+=--compiler-bindir=${HOME}/NVIDIA_GPU_Computing_SDK/C/mygcc + 
+ifeq ($(nvcc_warn_verbose),1) + NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) + NVCCFLAGS += --compiler-options -fno-strict-aliasing +endif + +################################################################################ +# Set up object files +################################################################################ +OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR) +OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES))) +OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES))) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES))) + +################################################################################ +# Set up cubin output files +################################################################################ +CUBINDIR := $(SRCDIR)data +CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES))) + +################################################################################ +# Set up PTX output files +################################################################################ +PTXDIR := $(SRCDIR)data +PTXBINS += $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES))) + +################################################################################ +# Rules +################################################################################ +$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS) + $(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $< -pthread \ + $(BOINC_API_DIR)/libboinc_api.a \ + $(BOINC_LIB_DIR)/libboinc.a + +$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS) + $(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $< -pthread \ + $(BOINC_API_DIR)/libboinc_api.a \ + $(BOINC_LIB_DIR)/libboinc.a + +# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile +$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS) + $(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $< + +# Default arch includes gencode for sm_10, sm_20, and other 
archs from GENCODE_ARCH declared in the makefile +$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory + $(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $< + +$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory + $(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $< + +# +# The following definition is a template that gets instantiated for each SM +# version (sm_10, sm_13, etc.) stored in SM_VERSIONS. It does 2 things: +# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX. +# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding +# .cu file. +# +# The intended use for this is to allow Makefiles that use common.mk to compile +# files to different Compute Capability targets (aka SM arch version). To do +# so, in the Makefile, list files for each SM arch separately, like so: +# This will be used over the default rule above +# +# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu +# CUFILES_sm_12 := anothercudakernel_sm12.cu +# +define SMVERSION_template +#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1)))) +OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1)))) +$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS) +# $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) $(1) + $(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS) +endef + +# This line invokes the above template for each arch version stored in +# SM_VERSIONS. The call function invokes the template, and the eval +# function interprets it as make commands. 
+$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver)))) + +$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile + $(VERBOSE)$(LINKLINE) + +cubindirectory: + $(VERBOSE)mkdir -p $(CUBINDIR) + +ptxdirectory: + $(VERBOSE)mkdir -p $(PTXDIR) + +makedirectories: + $(VERBOSE)mkdir -p $(LIBDIR) + $(VERBOSE)mkdir -p $(OBJDIR) + $(VERBOSE)mkdir -p $(TARGETDIR) + + +tidy : + $(VERBOSE)find . | egrep "#" | xargs rm -f + $(VERBOSE)find . | egrep "\~" | xargs rm -f + +clean : tidy + $(VERBOSE)rm -f $(OBJS) + $(VERBOSE)rm -f $(CUBINS) + $(VERBOSE)rm -f $(PTXBINS) + $(VERBOSE)rm -f $(TARGET) + $(VERBOSE)rm -f $(NVCC_KEEP_CLEAN) + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin + $(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp + +clobber : clean + $(VERBOSE)rm -rf $(ROOTOBJDIR) diff --git a/samples/nvcuda/cuda.cu b/samples/nvcuda/cuda.cu index 2ed6683838..a639fdafbc 100644 --- a/samples/nvcuda/cuda.cu +++ b/samples/nvcuda/cuda.cu @@ -1,22 +1,7 @@ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -/* - * cuda.cu - * Copyright (C) 2010 Tuan Le + * Tuan Le + * University of California, Berkeley + * Berkeley Space Sciences Lab * tuanle86@berkeley.edu */ @@ -45,6 +30,17 @@ #include "mfile.h" #include "graphics2.h" +struct UC_SHMEM { + double update_time; + double fraction_done; + double cpu_time; + BOINC_STATUS status; + int countdown; + // graphics app sets this to 5 repeatedly, + // main program decrements it once/sec. + // If it's zero, don't bother updating shmem +}; + #ifdef APP_GRAPHICS #include "uc2.h" UC_SHMEM* shmem; @@ -59,7 +55,7 @@ using std::string; // execute the kernel NUM_ITERATIONS times -#define NUM_ITERATIONS 19 +#define NUM_ITERATIONS 51 bool run_slow = false; bool early_exit = false; @@ -143,13 +139,13 @@ int main(int argc, char** argv) { int i, retval, lastInversion=0, checkpointExists=0, dimension=0; double fd; - char input_path[512], output_path[512], chkpt_path[512]; + char input_path[512], output_path[512], chkpt_path[512], buf[256]; REAL* h_idata; unsigned int mem_size; MFILE out; FILE* state, *infile; - //generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually + generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually for (i=0; i