mirror of https://github.com/BOINC/boinc.git
Makefile_mac completed
cuda_mac.c and cuda_kernel_mac.cu are targeted on Mac svn path=/trunk/boinc/; revision=21862
This commit is contained in:
parent
d59af26596
commit
fc87ee85d4
|
@ -0,0 +1,16 @@
|
|||
# Tuan Le
|
||||
# University of California, Berkeley
|
||||
# Berkeley Space Sciences Lab
|
||||
# tuanle86@berkeley.ed
|
||||
|
||||
# Add source files here
|
||||
EXECUTABLE := example_app_nvcuda_mac
|
||||
# Cuda source files (compiled with cudacc)
|
||||
CUFILES := cuda_kernel_mac.cu
|
||||
# C/C++ source files (compiled with gcc / c++)
|
||||
CCFILES := cuda_mac.c \
|
||||
|
||||
################################################################################
|
||||
# Rules and targets
|
||||
|
||||
include common_mac.mk
|
|
@ -0,0 +1,479 @@
|
|||
# Tuan Le
|
||||
# University of California, Berkeley
|
||||
# Berkeley Space Sciences Lab
|
||||
# tuanle86@berkeley.edu
|
||||
|
||||
.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx
|
||||
|
||||
# Add new SM Versions here as devices with new Compute Capability are released
|
||||
SM_VERSIONS := 10 11 12 13 20
|
||||
|
||||
CUDA_INSTALL_PATH ?= /usr/local/cuda
|
||||
|
||||
ifdef cuda-install
|
||||
CUDA_INSTALL_PATH := $(cuda-install)
|
||||
endif
|
||||
|
||||
# detect OS
|
||||
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
|
||||
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
|
||||
|
||||
# 'linux' is output for Linux system, 'darwin' for OS X
|
||||
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
|
||||
ifneq ($(DARWIN),)
|
||||
SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "<string>10\.6" /System/Library/CoreServices/SystemVersion.plist)))
|
||||
endif
|
||||
|
||||
# detect 32-bit or 64-bit platform
|
||||
HP_64 = $(shell uname -m | grep 64)
|
||||
OSARCH= $(shell uname -m)
|
||||
|
||||
# Basic directory setup for SDK
|
||||
# (override directories only if they are not already defined)
|
||||
SRCDIR ?=
|
||||
ROOTDIR ?= /Developer/GPU\ Computing
|
||||
ROOTBINDIR ?= ../../samples/nvcuda
|
||||
BINDIR ?= $(ROOTBINDIR)/$(OSLOWER)
|
||||
ROOTOBJDIR ?= obj
|
||||
|
||||
# BOINC directory
|
||||
BOINC_DIR = ../..
|
||||
BOINC_API_DIR = $(BOINC_DIR)/api
|
||||
BOINC_LIB_DIR = $(BOINC_DIR)/lib
|
||||
BOINC_BUILD_DIR = $(BOINC_DIR)/mac_build/build/Deployment
|
||||
BOINC_CONFIG_DIR = $(BOINC_DIR)/clientgui/mac
|
||||
FRAMEWORKS_DIR = /System/Library/Frameworks
|
||||
|
||||
LIBDIR := $(ROOTDIR)/C/lib
|
||||
COMMONDIR := $(ROOTDIR)/C/common
|
||||
SHAREDDIR := $(ROOTDIR)/shared/
|
||||
|
||||
# Includes
|
||||
INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc -I$(BOINC_CONFIG_DIR) -I$(BOINC_DIR) -I$(BOINC_LIB_DIR) -I$(BOINC_API_DIR)
|
||||
|
||||
# Compilers
|
||||
NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
|
||||
CXX := g++
|
||||
CC := gcc
|
||||
#Tuan edits LINK to have #(INCLUDES)
|
||||
LINK := g++ -fPIC $(INCLUDES)
|
||||
|
||||
# Warning flags
|
||||
CXXWARN_FLAGS := \
|
||||
-W -Wall \
|
||||
-Wimplicit \
|
||||
-Wswitch \
|
||||
-Wformat \
|
||||
-Wchar-subscripts \
|
||||
-Wparentheses \
|
||||
-Wmultichar \
|
||||
-Wtrigraphs \
|
||||
-Wpointer-arith \
|
||||
-Wcast-align \
|
||||
-Wreturn-type \
|
||||
-Wno-unused-function \
|
||||
$(SPACE)
|
||||
|
||||
CWARN_FLAGS := $(CXXWARN_FLAGS) \
|
||||
-Wstrict-prototypes \
|
||||
-Wmissing-prototypes \
|
||||
-Wmissing-declarations \
|
||||
-Wnested-externs \
|
||||
-Wmain \
|
||||
|
||||
# architecture flag for nvcc and gcc compilers build
|
||||
CUBIN_ARCH_FLAG :=
|
||||
CXX_ARCH_FLAGS :=
|
||||
NVCCFLAGS :=
|
||||
LIB_ARCH := $(OSARCH)
|
||||
|
||||
# Determining the necessary Cross-Compilation Flags
|
||||
# 32-bit OS, but we target 64-bit cross compilation
|
||||
ifeq ($(x86_64),1)
|
||||
NVCCFLAGS += -m64
|
||||
LIB_ARCH = x86_64
|
||||
CUDPPLIB_SUFFIX = x86_64
|
||||
ifneq ($(DARWIN),)
|
||||
CXX_ARCH_FLAGS += -arch x86_64
|
||||
else
|
||||
CXX_ARCH_FLAGS += -m64
|
||||
endif
|
||||
else
|
||||
# 64-bit OS, and we target 32-bit cross compilation
|
||||
ifeq ($(i386),1)
|
||||
NVCCFLAGS += -m32
|
||||
LIB_ARCH = i386
|
||||
CUDPPLIB_SUFFIX = i386
|
||||
ifneq ($(DARWIN),)
|
||||
CXX_ARCH_FLAGS += -arch i386
|
||||
else
|
||||
CXX_ARCH_FLAGS += -m32
|
||||
endif
|
||||
else
|
||||
ifeq "$(strip $(HP_64))" ""
|
||||
LIB_ARCH = i386
|
||||
CUDPPLIB_SUFFIX = i386
|
||||
NVCCFLAGS += -m32
|
||||
ifneq ($(DARWIN),)
|
||||
CXX_ARCH_FLAGS += -arch i386
|
||||
else
|
||||
CXX_ARCH_FLAGS += -m32
|
||||
endif
|
||||
else
|
||||
LIB_ARCH = x86_64
|
||||
CUDPPLIB_SUFFIX = x86_64
|
||||
NVCCFLAGS += -m64
|
||||
ifneq ($(DARWIN),)
|
||||
CXX_ARCH_FLAGS += -arch x86_64
|
||||
else
|
||||
CXX_ARCH_FLAGS += -m64
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(noinline),1)
|
||||
NVCCFLAGS += -Xopencc -noinline
|
||||
# Compiler-specific flags, when using noinline, we don't build for SM1x
|
||||
GENCODE_SM10 :=
|
||||
GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
|
||||
else
|
||||
# Compiler-specific flags (by default, we always use sm_10 and sm_20), unless we use the SMVERSION template
|
||||
GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\"
|
||||
GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
|
||||
endif
|
||||
|
||||
CXXFLAGS += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS)
|
||||
CFLAGS += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS)
|
||||
LINKFLAGS +=
|
||||
LINK += $(LINKFLAGS) $(CXX_ARCH_FLAGS)
|
||||
|
||||
# This option for Mac allows CUDA applications to work without requiring to set DYLD_LIBRARY_PATH
|
||||
ifneq ($(DARWIN),)
|
||||
LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib
|
||||
endif
|
||||
|
||||
# Common flags
|
||||
COMMONFLAGS += $(INCLUDES) -DUNIX
|
||||
|
||||
# Debug/release configuration
|
||||
ifeq ($(dbg),1)
|
||||
COMMONFLAGS += -g
|
||||
NVCCFLAGS += -D_DEBUG
|
||||
CXXFLAGS += -D_DEBUG
|
||||
CFLAGS += -D_DEBUG
|
||||
BINSUBDIR := debug
|
||||
LIBSUFFIX := D
|
||||
else
|
||||
COMMONFLAGS += -O2
|
||||
BINSUBDIR := release
|
||||
LIBSUFFIX :=
|
||||
NVCCFLAGS += --compiler-options -fno-strict-aliasing
|
||||
CXXFLAGS += -fno-strict-aliasing
|
||||
CFLAGS += -fno-strict-aliasing
|
||||
endif
|
||||
|
||||
# architecture flag for cubin build
|
||||
CUBIN_ARCH_FLAG :=
|
||||
|
||||
# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
|
||||
ifeq ($(USEGLLIB),1)
|
||||
ifneq ($(DARWIN),)
|
||||
OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries
|
||||
OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
|
||||
else
|
||||
# this case for linux platforms
|
||||
OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu
|
||||
# check if x86_64 flag has been set, otherwise, check HP_64 is i386/x86_64
|
||||
ifeq ($(x86_64),1)
|
||||
OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
|
||||
else
|
||||
ifeq ($(i386),)
|
||||
ifeq "$(strip $(HP_64))" ""
|
||||
OPENGLLIB += -lGLEW -L/usr/X11R6/lib
|
||||
else
|
||||
OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
# check if i386 flag has been set, otehrwise check HP_64 is i386/x86_64
|
||||
ifeq ($(i386),1)
|
||||
OPENGLLIB += -lGLEW -L/usr/X11R6/lib
|
||||
else
|
||||
ifeq ($(x86_64),)
|
||||
ifeq "$(strip $(HP_64))" ""
|
||||
OPENGLLIB += -lGLEW -L/usr/X11R6/lib
|
||||
else
|
||||
OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USEGLUT),1)
|
||||
ifneq ($(DARWIN),)
|
||||
OPENGLLIB += -framework GLUT
|
||||
else
|
||||
ifeq ($(x86_64),1)
|
||||
OPENGLLIB += -lglut -L/usr/lib64
|
||||
endif
|
||||
ifeq ($(i386),1)
|
||||
OPENGLLIB += -lglut -L/usr/lib
|
||||
endif
|
||||
|
||||
ifeq ($(x86_64),)
|
||||
ifeq ($(i386),)
|
||||
OPENGLLIB += -lglut
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USEPARAMGL),1)
|
||||
PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(USERENDERCHECKGL),1)
|
||||
RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(USECUDPP), 1)
|
||||
CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)$(LIBSUFFIX)
|
||||
|
||||
ifeq ($(emu), 1)
|
||||
CUDPPLIB := $(CUDPPLIB)_emu
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USENVCUVID), 1)
|
||||
ifneq ($(DARWIN),)
|
||||
NVCUVIDLIB := -L../../common/lib/darwin -lnvcuvid
|
||||
endif
|
||||
endif
|
||||
|
||||
# Libs
|
||||
ifneq ($(DARWIN),)
|
||||
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB)
|
||||
else
|
||||
ifeq "$(strip $(HP_64))" ""
|
||||
ifeq ($(x86_64),1)
|
||||
LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
|
||||
else
|
||||
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
|
||||
endif
|
||||
else
|
||||
ifeq ($(i386),1)
|
||||
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
|
||||
else
|
||||
LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB
|
||||
ifeq ($(USECUDADYNLIB),1)
|
||||
LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic
|
||||
else
|
||||
# static linking, we will statically link against CUDA and CUDART
|
||||
ifeq ($(USEDRVAPI),1)
|
||||
LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
|
||||
else
|
||||
ifeq ($(emu),1)
|
||||
LIB += -lcudartemu
|
||||
else
|
||||
LIB += -lcudart
|
||||
endif
|
||||
LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USECUFFT),1)
|
||||
ifeq ($(emu),1)
|
||||
LIB += -lcufftemu
|
||||
else
|
||||
LIB += -lcufft
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USECUBLAS),1)
|
||||
ifeq ($(emu),1)
|
||||
LIB += -lcublasemu
|
||||
else
|
||||
LIB += -lcublas
|
||||
endif
|
||||
endif
|
||||
|
||||
# Lib/exe configuration
|
||||
ifneq ($(STATIC_LIB),)
|
||||
TARGETDIR := $(LIBDIR)
|
||||
TARGET := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
|
||||
LINKLINE = ar rucv $(TARGET) $(OBJS)
|
||||
else
|
||||
ifneq ($(OMIT_CUTIL_LIB),1)
|
||||
|
||||
#TUAN add - lboinc and -lboinc_api below
|
||||
LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX) -lboinc_api -L$(BOINC_BUILD_DIR) -lboinc -L$(BOINC_BUILD_DIR)
|
||||
endif
|
||||
# Device emulation configuration
|
||||
ifeq ($(emu), 1)
|
||||
NVCCFLAGS += -deviceemu
|
||||
CUDACCFLAGS +=
|
||||
BINSUBDIR := emu$(BINSUBDIR)
|
||||
# consistency, makes developing easier
|
||||
CXXFLAGS += -D__DEVICE_EMULATION__
|
||||
CFLAGS += -D__DEVICE_EMULATION__
|
||||
endif
|
||||
TARGETDIR := $(BINDIR)/$(BINSUBDIR)
|
||||
TARGET := $(TARGETDIR)/$(EXECUTABLE)
|
||||
LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
|
||||
endif
|
||||
|
||||
# check if verbose
|
||||
ifeq ($(verbose), 1)
|
||||
VERBOSE :=
|
||||
else
|
||||
VERBOSE := @
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
# Check for input flags and set compiler flags appropriately
|
||||
################################################################################
|
||||
ifeq ($(fastmath), 1)
|
||||
NVCCFLAGS += -use_fast_math
|
||||
endif
|
||||
|
||||
ifeq ($(nvccverbose), 1)
|
||||
NVCCFLAGS += -v
|
||||
endif
|
||||
|
||||
ifeq ($(keep), 1)
|
||||
NVCCFLAGS += -keep
|
||||
NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
|
||||
endif
|
||||
|
||||
ifdef maxregisters
|
||||
NVCCFLAGS += -maxrregcount $(maxregisters)
|
||||
endif
|
||||
|
||||
# Add cudacc flags
|
||||
NVCCFLAGS += $(CUDACCFLAGS)
|
||||
|
||||
# Add common flags
|
||||
NVCCFLAGS += $(COMMONFLAGS)
|
||||
CXXFLAGS += $(COMMONFLAGS)
|
||||
CFLAGS += $(COMMONFLAGS)
|
||||
|
||||
ifeq ($(nvcc_warn_verbose),1)
|
||||
NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS))
|
||||
NVCCFLAGS += --compiler-options -fno-strict-aliasing
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
# Set up object files
|
||||
################################################################################
|
||||
OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR)
|
||||
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))
|
||||
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES)))
|
||||
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES)))
|
||||
|
||||
################################################################################
|
||||
# Set up cubin output files
|
||||
################################################################################
|
||||
CUBINDIR := $(SRCDIR)data
|
||||
CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))
|
||||
|
||||
################################################################################
|
||||
# Set up PTX output files
|
||||
################################################################################
|
||||
PTXDIR := $(SRCDIR)data
|
||||
PTXBINS += $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES)))
|
||||
|
||||
################################################################################
|
||||
# Rules
|
||||
################################################################################
|
||||
$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)
|
||||
$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<
|
||||
|
||||
$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)
|
||||
$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile
|
||||
$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)
|
||||
$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
|
||||
|
||||
# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile
|
||||
$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
|
||||
$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<
|
||||
|
||||
$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory
|
||||
$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $<
|
||||
|
||||
#
|
||||
# The following definition is a template that gets instantiated for each SM
|
||||
# version (sm_10, sm_13, etc.) stored in SMVERSIONS. It does 2 things:
|
||||
# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.
|
||||
# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding
|
||||
# .cu file.
|
||||
#
|
||||
# The intended use for this is to allow Makefiles that use common.mk to compile
|
||||
# files to different Compute Capability targets (aka SM arch version). To do
|
||||
# so, in the Makefile, list files for each SM arch separately, like so:
|
||||
# This will be used over the default rule abov
|
||||
#
|
||||
# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
|
||||
# CUFILES_sm_12 := anothercudakernel_sm12.cu
|
||||
#
|
||||
define SMVERSION_template
|
||||
#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))
|
||||
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1))))
|
||||
$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)
|
||||
# $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) $(1)
|
||||
# if we have noinline enabled, we only turn this enable this for SM 2.x architectures
|
||||
ifeq ($(noinline),1)
|
||||
$(VERBOSE)$(NVCC) $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)
|
||||
else
|
||||
$(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)
|
||||
endif
|
||||
endef
|
||||
|
||||
# This line invokes the above template for each arch version stored in
|
||||
# SM_VERSIONS. The call funtion invokes the template, and the eval
|
||||
# function interprets it as make commands.
|
||||
$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))
|
||||
|
||||
###### TUAN This below line causes mmintrin.h and xmmintrin.h error.
|
||||
|
||||
$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile_mac
|
||||
$(VERBOSE)$(LINKLINE)
|
||||
|
||||
cubindirectory:
|
||||
$(VERBOSE)mkdir -p $(CUBINDIR)
|
||||
|
||||
ptxdirectory:
|
||||
$(VERBOSE)mkdir -p $(PTXDIR)
|
||||
|
||||
makedirectories:
|
||||
$(VERBOSE)mkdir -p $(LIBDIR)
|
||||
$(VERBOSE)mkdir -p $(OBJDIR)
|
||||
$(VERBOSE)mkdir -p $(TARGETDIR)
|
||||
|
||||
|
||||
tidy :
|
||||
$(VERBOSE)find . | egrep "#" | xargs rm -f
|
||||
$(VERBOSE)find . | egrep "\~" | xargs rm -f
|
||||
|
||||
clean : tidy
|
||||
#TUAN $(VERBOSE)rm -f $(OBJS)
|
||||
$(VERBOSE)rm -f $(CUBINS)
|
||||
$(VERBOSE)rm -f $(PTXBINS)
|
||||
$(VERBOSE)rm -f $(TARGET)
|
||||
$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
|
||||
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm
|
||||
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm
|
||||
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin
|
||||
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp
|
||||
|
||||
clobber : clean
|
||||
$(VERBOSE)rm -rf $(ROOTOBJDIR)
|
|
@ -0,0 +1,122 @@
|
|||
/*
|
||||
* Tuan Le
|
||||
* University of California, Berkeley
|
||||
* Berkeley Space Sciences Lab
|
||||
* tuanle86@berkeley.edu
|
||||
*/
|
||||
|
||||
// When VERIFY is defined, the sum of squared errors is calculated between the
|
||||
// identity matrix and the product A * incerse(A). For debugging...
|
||||
//#define VERIFY 1
|
||||
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include "cuda_config.h"
|
||||
|
||||
__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {
|
||||
int k = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (k>i && k < n2 && AI[i*lda2+k]!=0) {
|
||||
REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];
|
||||
int n = n2 / 2;
|
||||
for (int j = i+1; j < n; j++) {
|
||||
AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {
|
||||
int k = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (k < n2) {
|
||||
AI[i*lda2+k] /= diag;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {
|
||||
int k = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (k > i && k < n2) {
|
||||
REAL multiplyer = -AI[i*lda2+k];
|
||||
for (int j = 0; j < i; j++) {
|
||||
AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//extern void invert(REAL * A, int n);
|
||||
void invertge(REAL * AI_d, int lda, int n) {
|
||||
int lda2 = lda * 2;
|
||||
// perform elementary row operations till A in AI becomes identity matrix
|
||||
for (int i = 0; i < n; i++) {
|
||||
GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2);
|
||||
CUDACHECK;
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
|
||||
for (int i = n-1; i >= 0; i--) {
|
||||
REAL diag = 1.0;
|
||||
SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));
|
||||
GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2);
|
||||
CUDACHECK;
|
||||
|
||||
GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2);
|
||||
CUDACHECK;
|
||||
cudaThreadSynchronize();
|
||||
CUDACHECK;
|
||||
}
|
||||
} // invertge
|
||||
|
||||
|
||||
/* inverts nxn matrix A and stores result back in A */
|
||||
extern void invert(REAL * A, int n) {
|
||||
fprintf(stderr,"starting inversion n = %d ", n);
|
||||
volatile clock_t gputime, gputime0;
|
||||
gputime=clock();
|
||||
gputime0 = gputime;
|
||||
|
||||
int lda = ((n+15)&~15|16);
|
||||
//lda=n;
|
||||
REAL * AI = (REAL *)malloc(sizeof(REAL)*(n*lda*2));
|
||||
memset(AI,0,sizeof(REAL)*n*lda*2);
|
||||
for (int i = 0; i < n; i++) {
|
||||
memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n);
|
||||
AI[lda*i*2+n+i] = 1;
|
||||
}
|
||||
|
||||
REAL * AI_d;
|
||||
SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2));
|
||||
SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice));
|
||||
|
||||
invertge(AI_d, lda, n);
|
||||
SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost));
|
||||
cudaFree(AI_d);
|
||||
|
||||
|
||||
gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
|
||||
fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
|
||||
#ifdef VERIFY
|
||||
// let's verify that
|
||||
REAL error=0.0;
|
||||
|
||||
// multiply inverse*xcopy, should be Identity matrix
|
||||
for (int k = 0; k < n; k++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
REAL sum = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
sum += AI[j*lda*2+n+i]*A[i*n+k];
|
||||
}
|
||||
if (j!=k) {
|
||||
error += sum * sum;
|
||||
} else {
|
||||
error += (1.0-sum) * (1.0-sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, " %6.2f SSE", error);
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n);
|
||||
}
|
||||
free(AI);
|
||||
fprintf(stderr," done!\n");
|
||||
} // invert
|
|
@ -0,0 +1,440 @@
|
|||
/*
|
||||
* Tuan Le
|
||||
* University of California, Berkeley
|
||||
* Berkeley Space Sciences Lab
|
||||
* tuanle86@berkeley.edu
|
||||
*/
|
||||
|
||||
#ifdef _WIN32
|
||||
#include "boinc_win.h"
|
||||
#else
|
||||
#include "config.h"
|
||||
#include <cstdio>
|
||||
#include <cctype>
|
||||
#include <ctime>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <csignal>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas.h>
|
||||
#include "cuda_config.h"
|
||||
|
||||
#include "str_util.h"
|
||||
#include "util.h"
|
||||
#include "filesys.h"
|
||||
#include "boinc_api.h"
|
||||
#include "mfile.h"
|
||||
#include "graphics2.h"
|
||||
|
||||
struct UC_SHMEM {
|
||||
double update_time;
|
||||
double fraction_done;
|
||||
double cpu_time;
|
||||
BOINC_STATUS status;
|
||||
int countdown;
|
||||
// graphics app sets this to 5 repeatedly,
|
||||
// main program decrements it once/sec.
|
||||
// If it's zero, don't bother updating shmem
|
||||
};
|
||||
|
||||
#ifdef APP_GRAPHICS
|
||||
#include "uc2.h"
|
||||
UC_SHMEM* shmem;
|
||||
#endif
|
||||
|
||||
using std::string;
|
||||
|
||||
#define CHECKPOINT_FILE "matrix_inversion_state"
|
||||
#define INPUT_FILENAME "input"
|
||||
#define OUTPUT_FILENAME "output"
|
||||
#define MATRIX_SIZE 10
|
||||
|
||||
|
||||
// execute the kernel NUM_ITERATIONS times
|
||||
#define NUM_ITERATIONS 51
|
||||
|
||||
bool run_slow = false;
|
||||
bool early_exit = false;
|
||||
bool early_crash = false;
|
||||
bool early_sleep = false;
|
||||
double cpu_time = 20, comp_result;
|
||||
|
||||
// do a billion floating-point ops
|
||||
// (note: I needed to add an arg to this;
|
||||
// otherwise the MS C++ compiler optimizes away
|
||||
// all but the first call to it!)
|
||||
//
|
||||
static double do_a_giga_flop(int foo) {
|
||||
double x = 3.14159*foo;
|
||||
int i;
|
||||
for (i=0; i<500000000; i++) {
|
||||
x += 5.12313123;
|
||||
x *= 0.5398394834;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) {
|
||||
int retval;
|
||||
string resolved_name;
|
||||
|
||||
FILE* f = fopen("temp", "w");
|
||||
if (!f) return 1;
|
||||
fprintf(f, "%d", n); //write inversion number
|
||||
fprintf(f, " ");
|
||||
fprintf(f, "%d", dimension); //write dimension
|
||||
fprintf(f, " ");
|
||||
for (int i=0;i<dimension*dimension;++i) {
|
||||
fprintf(f, " ");
|
||||
fprintf(f, "%f", h_idata[i]);
|
||||
}
|
||||
fclose(f);
|
||||
retval = mf.flush(); //not really necessary since we have not yet written anything to output file when doing checkpointing
|
||||
if (retval) return retval;
|
||||
boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name); //resolved_name is a string object
|
||||
retval = boinc_rename("temp", resolved_name.c_str()); //c_str() will convert a string object to a char string with null terminator equivalent.
|
||||
// because we do rename. Thus temp does not appear, but the CHECKPOINT_FILE appears instead on the disk.
|
||||
|
||||
if (retval) return retval;
|
||||
return 0; //return 0 to indicate success.
|
||||
}
|
||||
|
||||
#ifdef APP_GRAPHICS
|
||||
void update_shmem() {
|
||||
if (!shmem) return;
|
||||
|
||||
// always do this; otherwise a graphics app will immediately
|
||||
// assume we're not alive
|
||||
shmem->update_time = dtime();
|
||||
|
||||
// Check whether a graphics app is running,
|
||||
// and don't bother updating shmem if so.
|
||||
// This doesn't matter here,
|
||||
// but may be worth doing if updating shmem is expensive.
|
||||
//
|
||||
if (shmem->countdown > 0) {
|
||||
// the graphics app sets this to 5 every time it renders a frame
|
||||
shmem->countdown--;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
shmem->fraction_done = boinc_get_fraction_done();
|
||||
shmem->cpu_time = boinc_worker_thread_cpu_time();;
|
||||
boinc_get_status(&shmem->status);
|
||||
}
|
||||
#endif
|
||||
|
||||
void print(REAL *A, int n); //on commandline
|
||||
void generateRandomInputFile(int n);
|
||||
int getMatrixDimension(FILE *infile);
|
||||
int countElementsInMatrix(FILE *infile);
|
||||
void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata);
|
||||
void printToFile(MFILE *out, float *h_odata, int dimension);
|
||||
extern void invert(REAL * A, int n);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int i, retval, lastInversion=0, checkpointExists=0, dimension=0;
|
||||
double fd;
|
||||
char input_path[512], output_path[512], chkpt_path[512], buf[256];
|
||||
REAL* h_idata;
|
||||
MFILE out;
|
||||
FILE* state, *infile;
|
||||
|
||||
generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually
|
||||
|
||||
for (i=0; i<argc; i++) {
|
||||
if (!strcmp(argv[i], "-early_exit")) early_exit = true;
|
||||
if (!strcmp(argv[i], "-early_crash")) early_crash = true;
|
||||
if (!strcmp(argv[i], "-early_sleep")) early_sleep = true;
|
||||
if (!strcmp(argv[i], "-run_slow")) run_slow = true;
|
||||
if (!strcmp(argv[i], "-cpu_time")) {
|
||||
cpu_time = atof(argv[++i]);
|
||||
}
|
||||
}
|
||||
|
||||
retval = boinc_init();
|
||||
if (retval) {
|
||||
fprintf(stderr, "%s boinc_init returned %d\n",
|
||||
boinc_msg_prefix(buf), retval
|
||||
);
|
||||
exit(retval);
|
||||
}
|
||||
|
||||
// open the input file (resolve logical name first)
|
||||
//
|
||||
boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
|
||||
infile = boinc_fopen(input_path, "r");
|
||||
if (!infile) {
|
||||
fprintf(stderr,
|
||||
"%s Couldn't find input file, resolved name %s.\n",
|
||||
boinc_msg_prefix(buf), input_path
|
||||
);
|
||||
getchar();
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
|
||||
|
||||
// See if there's a valid checkpoint file.
|
||||
// If so retrieve the current matrix and inversion number
|
||||
//
|
||||
boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
|
||||
state = boinc_fopen(chkpt_path, "r");
|
||||
if (state) {
|
||||
printf("Checkpoint file is detected. Read from checkpoint file ... \n");
|
||||
checkpointExists=fscanf(state, "%d", &lastInversion);
|
||||
if (checkpointExists == 1) {
|
||||
printf("Last inversion # is : %d\n",lastInversion);
|
||||
fscanf(state,"%d",&dimension);
|
||||
cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
|
||||
for (int i=0;i<dimension*dimension;++i) {
|
||||
fscanf(state, "%f", &h_idata[i]);
|
||||
//printf("--%f\n",h_idata[i]);
|
||||
}
|
||||
}
|
||||
fclose(state);
|
||||
} else {
|
||||
printf("There's no valid checkpoint file!\n");
|
||||
}
|
||||
|
||||
retval = out.open(output_path, "wb");
|
||||
|
||||
if (retval) {
|
||||
fprintf(stderr, "%s APP: matrix_inversion output open failed:\n",
|
||||
boinc_msg_prefix(buf)
|
||||
);
|
||||
fprintf(stderr, "%s resolved name %s, retval %d\n",
|
||||
boinc_msg_prefix(buf), output_path, retval
|
||||
);
|
||||
perror("open");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#ifdef APP_GRAPHICS
|
||||
// create shared mem segment for graphics, and arrange to update it
|
||||
//
|
||||
shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
|
||||
if (!shmem) {
|
||||
fprintf(stderr, "%s failed to create shared mem segment\n",
|
||||
boinc_msg_prefix(buf)
|
||||
);
|
||||
}
|
||||
update_shmem();
|
||||
boinc_register_timer_callback(update_shmem);
|
||||
#endif
|
||||
|
||||
if (checkpointExists != 1) {
|
||||
dimension=getMatrixDimension(infile);
|
||||
printf("Matrix dimension: %d\n",dimension);
|
||||
cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
|
||||
fetchElementsIntoHostMemory(infile,h_idata);
|
||||
out.printf("\n----------------- Before being inversed ----------------\n\n");
|
||||
printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",NUM_ITERATIONS);
|
||||
} else {
|
||||
out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",lastInversion);
|
||||
printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",NUM_ITERATIONS-lastInversion,lastInversion+1);
|
||||
}
|
||||
printToFile(&out,h_idata,dimension);
|
||||
|
||||
|
||||
|
||||
for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
|
||||
invert(h_idata,dimension);
|
||||
|
||||
printf("Finish inversion #%d\n",i);
|
||||
|
||||
if (run_slow) {
|
||||
boinc_sleep(1.);
|
||||
}
|
||||
|
||||
if (early_exit && i>30) {
|
||||
exit(-10);
|
||||
}
|
||||
|
||||
if (early_crash && i>30) {
|
||||
boinc_crash();
|
||||
}
|
||||
if (early_sleep && i>30) {
|
||||
g_sleep = true;
|
||||
while (1) boinc_sleep(1);
|
||||
}
|
||||
|
||||
if (boinc_time_to_checkpoint()) {
|
||||
//if (i==7) {
|
||||
printf("Perform checkpointing at inversion # %d\n",i);
|
||||
|
||||
//we'll need to write the current matrix to the state file.
|
||||
retval = do_checkpoint(out, i, h_idata, dimension);
|
||||
if (retval) {
|
||||
fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n",
|
||||
boinc_msg_prefix(buf), retval
|
||||
);
|
||||
exit(retval);
|
||||
}
|
||||
boinc_checkpoint_completed();
|
||||
}
|
||||
|
||||
fd = i/NUM_ITERATIONS;
|
||||
if (cpu_time) fd /= 2;
|
||||
boinc_fraction_done(fd);
|
||||
|
||||
}
|
||||
|
||||
out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS);
|
||||
printToFile(&out,h_idata,dimension);
|
||||
|
||||
cudaFreeHost( h_idata );
|
||||
retval = out.flush(); //force the output file to be closed.
|
||||
if (retval) {
|
||||
fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n",
|
||||
boinc_msg_prefix(buf), retval
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// burn up some CPU time if needed
|
||||
//
|
||||
if (cpu_time) {
|
||||
double start = dtime();
|
||||
for (int i=0; ; i++) {
|
||||
double e = dtime()-start;
|
||||
if (e > cpu_time) break;
|
||||
fd = .5 + .5*(e/cpu_time);
|
||||
boinc_fraction_done(fd);
|
||||
|
||||
if (boinc_time_to_checkpoint()) {
|
||||
retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension);
|
||||
if (retval) {
|
||||
fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n",
|
||||
boinc_msg_prefix(buf), retval
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
boinc_checkpoint_completed();
|
||||
}
|
||||
comp_result = do_a_giga_flop(i);
|
||||
}
|
||||
}
|
||||
|
||||
boinc_fraction_done(1);
|
||||
#ifdef APP_GRAPHICS
|
||||
update_shmem();
|
||||
#endif
|
||||
|
||||
printf("Done!");
|
||||
getchar();
|
||||
boinc_finish(0);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
|
||||
LPSTR command_line;
|
||||
char* argv[100];
|
||||
int argc;
|
||||
|
||||
command_line = GetCommandLine();
|
||||
argc = parse_command_line( command_line, argv );
|
||||
return main(argc, argv);
|
||||
}
|
||||
#endif
|
||||
|
||||
void print(REAL *h_idata, int n) {
|
||||
int j=0;
|
||||
for (int i=0;i<n*n;++i) {
|
||||
printf("%f ",h_idata[i]);
|
||||
if (j+1==n) {
|
||||
printf("\n");
|
||||
j=0;
|
||||
} else {
|
||||
++j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nxn matrix
|
||||
void generateRandomInputFile(int n) {
|
||||
FILE *infile;
|
||||
|
||||
infile=fopen(INPUT_FILENAME,"w");
|
||||
REAL *h_idata = new REAL[n*n];
|
||||
srand(n);
|
||||
for( int i = 0; i < n; i++ ) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
h_idata[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
|
||||
}
|
||||
h_idata[i*n+i] += sqrt((float)n);
|
||||
}
|
||||
int j=0;
|
||||
for (int i=0;i<n*n;++i) {
|
||||
fprintf(infile,"%15f",h_idata[i]);
|
||||
if (j+1==n) {
|
||||
fprintf(infile,"\n");
|
||||
j=0;
|
||||
} else {
|
||||
++j;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
}
|
||||
|
||||
int getMatrixDimension(FILE *infile) {
|
||||
int w=0;
|
||||
char c;
|
||||
|
||||
fseek(infile,0,SEEK_SET);
|
||||
while (true) {
|
||||
do {
|
||||
c=fgetc(infile);
|
||||
if (c == EOF || c == '\n') {
|
||||
goto exitLoop;
|
||||
}
|
||||
} while (isspace(c));
|
||||
|
||||
if (isdigit(c) || c=='.' || c=='-') {
|
||||
++w;
|
||||
}
|
||||
|
||||
do {
|
||||
c=fgetc(infile);
|
||||
if (c == EOF || c == '\n') {
|
||||
goto exitLoop;
|
||||
}
|
||||
} while (isdigit(c) || c=='.' || c=='-');
|
||||
|
||||
if (c==EOF || c == '\n') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
exitLoop:
|
||||
return w;
|
||||
}
|
||||
|
||||
void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata) {
|
||||
float num=0;
|
||||
int i=0;
|
||||
fseek(infile,0,SEEK_SET);
|
||||
while (fscanf(infile,"%f",&num)==1) {
|
||||
h_idata[i]=num;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
void printToFile(MFILE *out, float *h_odata, int dimension) {
|
||||
int count=0;
|
||||
int move=0;
|
||||
int num_elements=dimension*dimension;
|
||||
while (num_elements>0) {
|
||||
out->printf("%15f ",h_odata[move]);
|
||||
++count;
|
||||
++move;
|
||||
if (count==dimension) {
|
||||
out->printf("\n");
|
||||
count=0;
|
||||
}
|
||||
--num_elements;
|
||||
}
|
||||
}
|
|
@ -6,11 +6,21 @@ tuanle86@berkeley.edu
|
|||
----------------------- Compiler ----------------------------
|
||||
|
||||
You will need to install gcc 4.3 and g++ 4.3. It appears that Cuda 3.0 SDK has not yet worked
|
||||
with gcc 4.4 and g++ 4.4
|
||||
with gcc 4.4 and g++ 4.4. Please follow these steps:
|
||||
|
||||
sudo aptitude install gcc-4.3 g++-4.3
|
||||
cd ~/NVIDIA_GPU_Computing_SDK/C (go to SDK source dir. You should find the right path on your machine)
|
||||
mkdir mygcc
|
||||
cd mygcc
|
||||
ln -s $(which g++-4.3) g++
|
||||
ln -s $(which gcc-4.3) gcc
|
||||
|
||||
Note: mygcc directory must be created since I have made symlinks to the 4.3 compiler to mygcc directory
|
||||
in my current Makefile.
|
||||
|
||||
----------------------- Linux Makefile ----------------------
|
||||
|
||||
Makefile needs to be edited on your machine before running. Please follow these steps:
|
||||
common.mk file needs to be edited on your machine before running. Please follow these steps:
|
||||
|
||||
1) Open "boinc/samples/nvcuda/common.mk" with gedit
|
||||
2) Ctrl+f and search for "tuanle". You will find the following:
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
Tuan Le
|
||||
University of California, Berkeley
|
||||
Berkeley Space Sciences Lab
|
||||
tuanle86@berkeley.edu
|
||||
|
||||
----------------------- Mac OS Makefile ----------------------
|
||||
By default, the NVIDIA Cuda SDK for MacOS is installed at $ROOT/Developer/ under the directory name "GPU Computing".
|
||||
If it is installed somewhere else, then common_mac.mk file needs to be edited by the following steps:
|
||||
|
||||
1) Open "boinc/samples/nvcuda/common_mac.mk"
|
||||
2) Look for "ROOTDIR ?= /DeveloperGPU\ Computing
|
||||
3) Replace this path by appropriate path on your machine.
|
||||
4) Done!
|
||||
|
||||
----------------------- Mac Troubleshooting ----------------
|
||||
|
||||
Error: ./example_app_nvcuda_mac: library not found for -lcutil_i386.
|
||||
Solution: run make from "CPU Computing" directory to build the cutil_i386 library.
|
||||
|
||||
Note: It appears that nvcc compiler has trouble compiling .cu file that contains both BOINC and
|
||||
Cuda code. Any attempt to compile such .cu files will result in errors like the following:
|
||||
|
||||
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(55): error: identifier "__builtin_ia32_emms" is undefined
|
||||
|
||||
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(68): error: identifier "__builtin_ia32_vec_init_v2si" is undefined
|
||||
|
||||
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(111): error: identifier "__builtin_ia32_vec_ext_v2si" is undefined
|
||||
|
||||
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(150): error: identifier "__builtin_ia32_packsswb" is undefined
|
||||
|
||||
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(165): error: identifier "__builtin_ia32_packssdw" is undefined
|
||||
|
||||
...........
|
||||
|
||||
The solution that I came up with is to compile BOINC and Cuda seperately. Put BOINC code in .c file and CUDA code in .cu file.
|
||||
Then, write an external function in .cu file that will make kernel calls, do computation and return a result that could be used
|
||||
in .c file.
|
||||
|
||||
----------------------- Run Executable file ------------------
|
||||
|
||||
"make -f Makefile_mac" command will create an executable file in "boinc/samples/nvcuda/darwin/release/".
|
||||
If your machine doesn't have CUDA-enabled GPU, then the executable file for this sample app in
|
||||
the release directory will stop execution after some output statements on the terminal. In this case,
|
||||
it's best to run in emurelease mode. To generate an executable file in emurelease mode, type
|
||||
"make -f Makefile_mac emu=1". The executable file is then created in "boinc/samples/nvcuda/darwin/emurelease/".
|
||||
Note that NVIDIA Cuda SDK version older than version 3.0 will no longer support emulation mode.
|
||||
|
||||
Also, you need to define some environment variables before running the executable file.
|
||||
|
||||
export PATH=/usr/local/cuda/bin:$PATH
|
||||
export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
|
Loading…
Reference in New Issue