Makefile_mac completed

cuda_mac.c and cuda_kernel_mac.cu are targeted on Mac

svn path=/trunk/boinc/; revision=21862
This commit is contained in:
Tuan Le 2010-07-01 17:53:11 +00:00
parent d59af26596
commit fc87ee85d4
6 changed files with 1120 additions and 2 deletions

View File

@ -0,0 +1,16 @@
# Tuan Le
# University of California, Berkeley
# Berkeley Space Sciences Lab
# tuanle86@berkeley.ed
# Add source files here
EXECUTABLE := example_app_nvcuda_mac
# Cuda source files (compiled with cudacc)
CUFILES := cuda_kernel_mac.cu
# C/C++ source files (compiled with gcc / c++)
CCFILES := cuda_mac.c \
################################################################################
# Rules and targets
include common_mac.mk

View File

@ -0,0 +1,479 @@
# Tuan Le
# University of California, Berkeley
# Berkeley Space Sciences Lab
# tuanle86@berkeley.edu
.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx
# Add new SM Versions here as devices with new Compute Capability are released
SM_VERSIONS := 10 11 12 13 20
CUDA_INSTALL_PATH ?= /usr/local/cuda
ifdef cuda-install
CUDA_INSTALL_PATH := $(cuda-install)
endif
# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
# 'linux' is output for Linux system, 'darwin' for OS X
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
ifneq ($(DARWIN),)
SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "<string>10\.6" /System/Library/CoreServices/SystemVersion.plist)))
endif
# detect 32-bit or 64-bit platform
HP_64 = $(shell uname -m | grep 64)
OSARCH= $(shell uname -m)
# Basic directory setup for SDK
# (override directories only if they are not already defined)
SRCDIR ?=
ROOTDIR ?= /Developer/GPU\ Computing
ROOTBINDIR ?= ../../samples/nvcuda
BINDIR ?= $(ROOTBINDIR)/$(OSLOWER)
ROOTOBJDIR ?= obj
# BOINC directory
BOINC_DIR = ../..
BOINC_API_DIR = $(BOINC_DIR)/api
BOINC_LIB_DIR = $(BOINC_DIR)/lib
BOINC_BUILD_DIR = $(BOINC_DIR)/mac_build/build/Deployment
BOINC_CONFIG_DIR = $(BOINC_DIR)/clientgui/mac
FRAMEWORKS_DIR = /System/Library/Frameworks
LIBDIR := $(ROOTDIR)/C/lib
COMMONDIR := $(ROOTDIR)/C/common
SHAREDDIR := $(ROOTDIR)/shared/
# Includes
INCLUDES += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc -I$(BOINC_CONFIG_DIR) -I$(BOINC_DIR) -I$(BOINC_LIB_DIR) -I$(BOINC_API_DIR)
# Compilers
NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
CXX := g++
CC := gcc
#Tuan edits LINK to have #(INCLUDES)
LINK := g++ -fPIC $(INCLUDES)
# Warning flags
CXXWARN_FLAGS := \
-W -Wall \
-Wimplicit \
-Wswitch \
-Wformat \
-Wchar-subscripts \
-Wparentheses \
-Wmultichar \
-Wtrigraphs \
-Wpointer-arith \
-Wcast-align \
-Wreturn-type \
-Wno-unused-function \
$(SPACE)
CWARN_FLAGS := $(CXXWARN_FLAGS) \
-Wstrict-prototypes \
-Wmissing-prototypes \
-Wmissing-declarations \
-Wnested-externs \
-Wmain \
# architecture flag for nvcc and gcc compilers build
CUBIN_ARCH_FLAG :=
CXX_ARCH_FLAGS :=
NVCCFLAGS :=
LIB_ARCH := $(OSARCH)
# Determining the necessary Cross-Compilation Flags
# 32-bit OS, but we target 64-bit cross compilation
ifeq ($(x86_64),1)
NVCCFLAGS += -m64
LIB_ARCH = x86_64
CUDPPLIB_SUFFIX = x86_64
ifneq ($(DARWIN),)
CXX_ARCH_FLAGS += -arch x86_64
else
CXX_ARCH_FLAGS += -m64
endif
else
# 64-bit OS, and we target 32-bit cross compilation
ifeq ($(i386),1)
NVCCFLAGS += -m32
LIB_ARCH = i386
CUDPPLIB_SUFFIX = i386
ifneq ($(DARWIN),)
CXX_ARCH_FLAGS += -arch i386
else
CXX_ARCH_FLAGS += -m32
endif
else
ifeq "$(strip $(HP_64))" ""
LIB_ARCH = i386
CUDPPLIB_SUFFIX = i386
NVCCFLAGS += -m32
ifneq ($(DARWIN),)
CXX_ARCH_FLAGS += -arch i386
else
CXX_ARCH_FLAGS += -m32
endif
else
LIB_ARCH = x86_64
CUDPPLIB_SUFFIX = x86_64
NVCCFLAGS += -m64
ifneq ($(DARWIN),)
CXX_ARCH_FLAGS += -arch x86_64
else
CXX_ARCH_FLAGS += -m64
endif
endif
endif
endif
ifeq ($(noinline),1)
NVCCFLAGS += -Xopencc -noinline
# Compiler-specific flags, when using noinline, we don't build for SM1x
GENCODE_SM10 :=
GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
else
# Compiler-specific flags (by default, we always use sm_10 and sm_20), unless we use the SMVERSION template
GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\"
GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
endif
CXXFLAGS += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS)
CFLAGS += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS)
LINKFLAGS +=
LINK += $(LINKFLAGS) $(CXX_ARCH_FLAGS)
# This option for Mac allows CUDA applications to work without requiring to set DYLD_LIBRARY_PATH
ifneq ($(DARWIN),)
LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib
endif
# Common flags
COMMONFLAGS += $(INCLUDES) -DUNIX
# Debug/release configuration
ifeq ($(dbg),1)
COMMONFLAGS += -g
NVCCFLAGS += -D_DEBUG
CXXFLAGS += -D_DEBUG
CFLAGS += -D_DEBUG
BINSUBDIR := debug
LIBSUFFIX := D
else
COMMONFLAGS += -O2
BINSUBDIR := release
LIBSUFFIX :=
NVCCFLAGS += --compiler-options -fno-strict-aliasing
CXXFLAGS += -fno-strict-aliasing
CFLAGS += -fno-strict-aliasing
endif
# architecture flag for cubin build
CUBIN_ARCH_FLAG :=
# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
ifeq ($(USEGLLIB),1)
ifneq ($(DARWIN),)
OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries
OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
else
# this case for linux platforms
OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu
# check if x86_64 flag has been set, otherwise, check HP_64 is i386/x86_64
ifeq ($(x86_64),1)
OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
else
ifeq ($(i386),)
ifeq "$(strip $(HP_64))" ""
OPENGLLIB += -lGLEW -L/usr/X11R6/lib
else
OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
endif
endif
endif
# check if i386 flag has been set, otehrwise check HP_64 is i386/x86_64
ifeq ($(i386),1)
OPENGLLIB += -lGLEW -L/usr/X11R6/lib
else
ifeq ($(x86_64),)
ifeq "$(strip $(HP_64))" ""
OPENGLLIB += -lGLEW -L/usr/X11R6/lib
else
OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
endif
endif
endif
endif
endif
ifeq ($(USEGLUT),1)
ifneq ($(DARWIN),)
OPENGLLIB += -framework GLUT
else
ifeq ($(x86_64),1)
OPENGLLIB += -lglut -L/usr/lib64
endif
ifeq ($(i386),1)
OPENGLLIB += -lglut -L/usr/lib
endif
ifeq ($(x86_64),)
ifeq ($(i386),)
OPENGLLIB += -lglut
endif
endif
endif
endif
ifeq ($(USEPARAMGL),1)
PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX)
endif
ifeq ($(USERENDERCHECKGL),1)
RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX)
endif
ifeq ($(USECUDPP), 1)
CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)$(LIBSUFFIX)
ifeq ($(emu), 1)
CUDPPLIB := $(CUDPPLIB)_emu
endif
endif
ifeq ($(USENVCUVID), 1)
ifneq ($(DARWIN),)
NVCUVIDLIB := -L../../common/lib/darwin -lnvcuvid
endif
endif
# Libs
ifneq ($(DARWIN),)
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB)
else
ifeq "$(strip $(HP_64))" ""
ifeq ($(x86_64),1)
LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
else
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
endif
else
ifeq ($(i386),1)
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
else
LIB := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
endif
endif
endif
# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB
ifeq ($(USECUDADYNLIB),1)
LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic
else
# static linking, we will statically link against CUDA and CUDART
ifeq ($(USEDRVAPI),1)
LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
else
ifeq ($(emu),1)
LIB += -lcudartemu
else
LIB += -lcudart
endif
LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
endif
endif
ifeq ($(USECUFFT),1)
ifeq ($(emu),1)
LIB += -lcufftemu
else
LIB += -lcufft
endif
endif
ifeq ($(USECUBLAS),1)
ifeq ($(emu),1)
LIB += -lcublasemu
else
LIB += -lcublas
endif
endif
# Lib/exe configuration
ifneq ($(STATIC_LIB),)
TARGETDIR := $(LIBDIR)
TARGET := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
LINKLINE = ar rucv $(TARGET) $(OBJS)
else
ifneq ($(OMIT_CUTIL_LIB),1)
#TUAN add - lboinc and -lboinc_api below
LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX) -lboinc_api -L$(BOINC_BUILD_DIR) -lboinc -L$(BOINC_BUILD_DIR)
endif
# Device emulation configuration
ifeq ($(emu), 1)
NVCCFLAGS += -deviceemu
CUDACCFLAGS +=
BINSUBDIR := emu$(BINSUBDIR)
# consistency, makes developing easier
CXXFLAGS += -D__DEVICE_EMULATION__
CFLAGS += -D__DEVICE_EMULATION__
endif
TARGETDIR := $(BINDIR)/$(BINSUBDIR)
TARGET := $(TARGETDIR)/$(EXECUTABLE)
LINKLINE = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
endif
# check if verbose
ifeq ($(verbose), 1)
VERBOSE :=
else
VERBOSE := @
endif
################################################################################
# Check for input flags and set compiler flags appropriately
################################################################################
ifeq ($(fastmath), 1)
NVCCFLAGS += -use_fast_math
endif
ifeq ($(nvccverbose), 1)
NVCCFLAGS += -v
endif
ifeq ($(keep), 1)
NVCCFLAGS += -keep
NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
endif
ifdef maxregisters
NVCCFLAGS += -maxrregcount $(maxregisters)
endif
# Add cudacc flags
NVCCFLAGS += $(CUDACCFLAGS)
# Add common flags
NVCCFLAGS += $(COMMONFLAGS)
CXXFLAGS += $(COMMONFLAGS)
CFLAGS += $(COMMONFLAGS)
ifeq ($(nvcc_warn_verbose),1)
NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS))
NVCCFLAGS += --compiler-options -fno-strict-aliasing
endif
################################################################################
# Set up object files
################################################################################
OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR)
OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))
OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES)))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES)))
################################################################################
# Set up cubin output files
################################################################################
CUBINDIR := $(SRCDIR)data
CUBINS += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))
################################################################################
# Set up PTX output files
################################################################################
PTXDIR := $(SRCDIR)data
PTXBINS += $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES)))
################################################################################
# Rules
################################################################################
$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)
$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<
$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)
$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<
# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile
$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)
$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<
# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile
$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<
$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory
$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $<
#
# The following definition is a template that gets instantiated for each SM
# version (sm_10, sm_13, etc.) stored in SMVERSIONS. It does 2 things:
# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.
# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding
# .cu file.
#
# The intended use for this is to allow Makefiles that use common.mk to compile
# files to different Compute Capability targets (aka SM arch version). To do
# so, in the Makefile, list files for each SM arch separately, like so:
# This will be used over the default rule abov
#
# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
# CUFILES_sm_12 := anothercudakernel_sm12.cu
#
define SMVERSION_template
#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1))))
$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)
# $(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) $(1)
# if we have noinline enabled, we only turn this enable this for SM 2.x architectures
ifeq ($(noinline),1)
$(VERBOSE)$(NVCC) $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)
else
$(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)
endif
endef
# This line invokes the above template for each arch version stored in
# SM_VERSIONS. The call funtion invokes the template, and the eval
# function interprets it as make commands.
$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))
###### TUAN This below line causes mmintrin.h and xmmintrin.h error.
$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile_mac
$(VERBOSE)$(LINKLINE)
cubindirectory:
$(VERBOSE)mkdir -p $(CUBINDIR)
ptxdirectory:
$(VERBOSE)mkdir -p $(PTXDIR)
makedirectories:
$(VERBOSE)mkdir -p $(LIBDIR)
$(VERBOSE)mkdir -p $(OBJDIR)
$(VERBOSE)mkdir -p $(TARGETDIR)
tidy :
$(VERBOSE)find . | egrep "#" | xargs rm -f
$(VERBOSE)find . | egrep "\~" | xargs rm -f
clean : tidy
#TUAN $(VERBOSE)rm -f $(OBJS)
$(VERBOSE)rm -f $(CUBINS)
$(VERBOSE)rm -f $(PTXBINS)
$(VERBOSE)rm -f $(TARGET)
$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin
$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp
clobber : clean
$(VERBOSE)rm -rf $(ROOTOBJDIR)

View File

@ -0,0 +1,122 @@
/*
* Tuan Le
* University of California, Berkeley
* Berkeley Space Sciences Lab
* tuanle86@berkeley.edu
*/
// When VERIFY is defined, the sum of squared errors is calculated between the
// identity matrix and the product A * incerse(A). For debugging...
//#define VERIFY 1
#include <stdio.h>
#include <math.h>
#include <time.h>
#include "cuda_config.h"
__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k>i && k < n2 && AI[i*lda2+k]!=0) {
REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];
int n = n2 / 2;
for (int j = i+1; j < n; j++) {
AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
}
}
}
__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k < n2) {
AI[i*lda2+k] /= diag;
}
}
__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k > i && k < n2) {
REAL multiplyer = -AI[i*lda2+k];
for (int j = 0; j < i; j++) {
AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
}
}
}
//extern void invert(REAL * A, int n);
void invertge(REAL * AI_d, int lda, int n) {
int lda2 = lda * 2;
// perform elementary row operations till A in AI becomes identity matrix
for (int i = 0; i < n; i++) {
GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2);
CUDACHECK;
cudaThreadSynchronize();
}
for (int i = n-1; i >= 0; i--) {
REAL diag = 1.0;
SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));
GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2);
CUDACHECK;
GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2);
CUDACHECK;
cudaThreadSynchronize();
CUDACHECK;
}
} // invertge
/* inverts nxn matrix A and stores result back in A */
extern void invert(REAL * A, int n) {
fprintf(stderr,"starting inversion n = %d ", n);
volatile clock_t gputime, gputime0;
gputime=clock();
gputime0 = gputime;
int lda = ((n+15)&~15|16);
//lda=n;
REAL * AI = (REAL *)malloc(sizeof(REAL)*(n*lda*2));
memset(AI,0,sizeof(REAL)*n*lda*2);
for (int i = 0; i < n; i++) {
memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n);
AI[lda*i*2+n+i] = 1;
}
REAL * AI_d;
SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2));
SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice));
invertge(AI_d, lda, n);
SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost));
cudaFree(AI_d);
gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
#ifdef VERIFY
// let's verify that
REAL error=0.0;
// multiply inverse*xcopy, should be Identity matrix
for (int k = 0; k < n; k++) {
for (int j = 0; j < n; j++) {
REAL sum = 0;
for (int i = 0; i < n; i++) {
sum += AI[j*lda*2+n+i]*A[i*n+k];
}
if (j!=k) {
error += sum * sum;
} else {
error += (1.0-sum) * (1.0-sum);
}
}
}
fprintf(stderr, " %6.2f SSE", error);
#endif
for (int i = 0; i < n; i++) {
memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n);
}
free(AI);
fprintf(stderr," done!\n");
} // invert

440
samples/nvcuda/cuda_mac.c Normal file
View File

@ -0,0 +1,440 @@
/*
* Tuan Le
* University of California, Berkeley
* Berkeley Space Sciences Lab
* tuanle86@berkeley.edu
*/
#ifdef _WIN32
#include "boinc_win.h"
#else
#include "config.h"
#include <cstdio>
#include <cctype>
#include <ctime>
#include <cstring>
#include <cstdlib>
#include <csignal>
#include <unistd.h>
#endif
#include <cuda_runtime.h>
#include <cublas.h>
#include "cuda_config.h"
#include "str_util.h"
#include "util.h"
#include "filesys.h"
#include "boinc_api.h"
#include "mfile.h"
#include "graphics2.h"
struct UC_SHMEM {
double update_time;
double fraction_done;
double cpu_time;
BOINC_STATUS status;
int countdown;
// graphics app sets this to 5 repeatedly,
// main program decrements it once/sec.
// If it's zero, don't bother updating shmem
};
#ifdef APP_GRAPHICS
#include "uc2.h"
UC_SHMEM* shmem;
#endif
using std::string;
#define CHECKPOINT_FILE "matrix_inversion_state"
#define INPUT_FILENAME "input"
#define OUTPUT_FILENAME "output"
#define MATRIX_SIZE 10
// execute the kernel NUM_ITERATIONS times
#define NUM_ITERATIONS 51
bool run_slow = false;
bool early_exit = false;
bool early_crash = false;
bool early_sleep = false;
double cpu_time = 20, comp_result;
// do a billion floating-point ops
// (note: I needed to add an arg to this;
// otherwise the MS C++ compiler optimizes away
// all but the first call to it!)
//
static double do_a_giga_flop(int foo) {
double x = 3.14159*foo;
int i;
for (i=0; i<500000000; i++) {
x += 5.12313123;
x *= 0.5398394834;
}
return x;
}
int do_checkpoint(MFILE& mf, int n, REAL *h_idata, int dimension) {
int retval;
string resolved_name;
FILE* f = fopen("temp", "w");
if (!f) return 1;
fprintf(f, "%d", n); //write inversion number
fprintf(f, " ");
fprintf(f, "%d", dimension); //write dimension
fprintf(f, " ");
for (int i=0;i<dimension*dimension;++i) {
fprintf(f, " ");
fprintf(f, "%f", h_idata[i]);
}
fclose(f);
retval = mf.flush(); //not really necessary since we have not yet written anything to output file when doing checkpointing
if (retval) return retval;
boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name); //resolved_name is a string object
retval = boinc_rename("temp", resolved_name.c_str()); //c_str() will convert a string object to a char string with null terminator equivalent.
// because we do rename. Thus temp does not appear, but the CHECKPOINT_FILE appears instead on the disk.
if (retval) return retval;
return 0; //return 0 to indicate success.
}
#ifdef APP_GRAPHICS
void update_shmem() {
if (!shmem) return;
// always do this; otherwise a graphics app will immediately
// assume we're not alive
shmem->update_time = dtime();
// Check whether a graphics app is running,
// and don't bother updating shmem if so.
// This doesn't matter here,
// but may be worth doing if updating shmem is expensive.
//
if (shmem->countdown > 0) {
// the graphics app sets this to 5 every time it renders a frame
shmem->countdown--;
} else {
return;
}
shmem->fraction_done = boinc_get_fraction_done();
shmem->cpu_time = boinc_worker_thread_cpu_time();;
boinc_get_status(&shmem->status);
}
#endif
void print(REAL *A, int n); //on commandline
void generateRandomInputFile(int n);
int getMatrixDimension(FILE *infile);
int countElementsInMatrix(FILE *infile);
void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata);
void printToFile(MFILE *out, float *h_odata, int dimension);
extern void invert(REAL * A, int n);
int main(int argc, char** argv)
{
int i, retval, lastInversion=0, checkpointExists=0, dimension=0;
double fd;
char input_path[512], output_path[512], chkpt_path[512], buf[256];
REAL* h_idata;
MFILE out;
FILE* state, *infile;
generateRandomInputFile(MATRIX_SIZE); //call this if you don't want to construct the input file manually
for (i=0; i<argc; i++) {
if (!strcmp(argv[i], "-early_exit")) early_exit = true;
if (!strcmp(argv[i], "-early_crash")) early_crash = true;
if (!strcmp(argv[i], "-early_sleep")) early_sleep = true;
if (!strcmp(argv[i], "-run_slow")) run_slow = true;
if (!strcmp(argv[i], "-cpu_time")) {
cpu_time = atof(argv[++i]);
}
}
retval = boinc_init();
if (retval) {
fprintf(stderr, "%s boinc_init returned %d\n",
boinc_msg_prefix(buf), retval
);
exit(retval);
}
// open the input file (resolve logical name first)
//
boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
infile = boinc_fopen(input_path, "r");
if (!infile) {
fprintf(stderr,
"%s Couldn't find input file, resolved name %s.\n",
boinc_msg_prefix(buf), input_path
);
getchar();
exit(-1);
}
boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
// See if there's a valid checkpoint file.
// If so retrieve the current matrix and inversion number
//
boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
state = boinc_fopen(chkpt_path, "r");
if (state) {
printf("Checkpoint file is detected. Read from checkpoint file ... \n");
checkpointExists=fscanf(state, "%d", &lastInversion);
if (checkpointExists == 1) {
printf("Last inversion # is : %d\n",lastInversion);
fscanf(state,"%d",&dimension);
cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
for (int i=0;i<dimension*dimension;++i) {
fscanf(state, "%f", &h_idata[i]);
//printf("--%f\n",h_idata[i]);
}
}
fclose(state);
} else {
printf("There's no valid checkpoint file!\n");
}
retval = out.open(output_path, "wb");
if (retval) {
fprintf(stderr, "%s APP: matrix_inversion output open failed:\n",
boinc_msg_prefix(buf)
);
fprintf(stderr, "%s resolved name %s, retval %d\n",
boinc_msg_prefix(buf), output_path, retval
);
perror("open");
exit(1);
}
#ifdef APP_GRAPHICS
// create shared mem segment for graphics, and arrange to update it
//
shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
if (!shmem) {
fprintf(stderr, "%s failed to create shared mem segment\n",
boinc_msg_prefix(buf)
);
}
update_shmem();
boinc_register_timer_callback(update_shmem);
#endif
if (checkpointExists != 1) {
dimension=getMatrixDimension(infile);
printf("Matrix dimension: %d\n",dimension);
cudaMallocHost((void **)&h_idata,dimension*dimension*sizeof(REAL));
fetchElementsIntoHostMemory(infile,h_idata);
out.printf("\n----------------- Before being inversed ----------------\n\n");
printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",NUM_ITERATIONS);
} else {
out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",lastInversion);
printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",NUM_ITERATIONS-lastInversion,lastInversion+1);
}
printToFile(&out,h_idata,dimension);
for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
invert(h_idata,dimension);
printf("Finish inversion #%d\n",i);
if (run_slow) {
boinc_sleep(1.);
}
if (early_exit && i>30) {
exit(-10);
}
if (early_crash && i>30) {
boinc_crash();
}
if (early_sleep && i>30) {
g_sleep = true;
while (1) boinc_sleep(1);
}
if (boinc_time_to_checkpoint()) {
//if (i==7) {
printf("Perform checkpointing at inversion # %d\n",i);
//we'll need to write the current matrix to the state file.
retval = do_checkpoint(out, i, h_idata, dimension);
if (retval) {
fprintf(stderr, "%s APP: matrix_inversion checkpoint failed %d\n",
boinc_msg_prefix(buf), retval
);
exit(retval);
}
boinc_checkpoint_completed();
}
fd = i/NUM_ITERATIONS;
if (cpu_time) fd /= 2;
boinc_fraction_done(fd);
}
out.printf("\n\n----------------- Final inversion #%d----------------\n\n",NUM_ITERATIONS);
printToFile(&out,h_idata,dimension);
cudaFreeHost( h_idata );
retval = out.flush(); //force the output file to be closed.
if (retval) {
fprintf(stderr, "%s APP: matrix_inversion flush failed %d\n",
boinc_msg_prefix(buf), retval
);
exit(1);
}
// burn up some CPU time if needed
//
if (cpu_time) {
double start = dtime();
for (int i=0; ; i++) {
double e = dtime()-start;
if (e > cpu_time) break;
fd = .5 + .5*(e/cpu_time);
boinc_fraction_done(fd);
if (boinc_time_to_checkpoint()) {
retval = do_checkpoint(out, NUM_ITERATIONS, h_idata, dimension);
if (retval) {
fprintf(stderr, "%s APP: maxtrix_inversion checkpoint failed %d\n",
boinc_msg_prefix(buf), retval
);
exit(1);
}
boinc_checkpoint_completed();
}
comp_result = do_a_giga_flop(i);
}
}
boinc_fraction_done(1);
#ifdef APP_GRAPHICS
update_shmem();
#endif
printf("Done!");
getchar();
boinc_finish(0);
}
#ifdef _WIN32
int WINAPI WinMain(HINSTANCE hInst, HINSTANCE hPrevInst, LPSTR Args, int WinMode) {
LPSTR command_line;
char* argv[100];
int argc;
command_line = GetCommandLine();
argc = parse_command_line( command_line, argv );
return main(argc, argv);
}
#endif
void print(REAL *h_idata, int n) {
int j=0;
for (int i=0;i<n*n;++i) {
printf("%f ",h_idata[i]);
if (j+1==n) {
printf("\n");
j=0;
} else {
++j;
}
}
}
// nxn matrix
void generateRandomInputFile(int n) {
FILE *infile;
infile=fopen(INPUT_FILENAME,"w");
REAL *h_idata = new REAL[n*n];
srand(n);
for( int i = 0; i < n; i++ ) {
for (int j = 0; j < n; j++) {
h_idata[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
}
h_idata[i*n+i] += sqrt((float)n);
}
int j=0;
for (int i=0;i<n*n;++i) {
fprintf(infile,"%15f",h_idata[i]);
if (j+1==n) {
fprintf(infile,"\n");
j=0;
} else {
++j;
}
}
fclose(infile);
}
int getMatrixDimension(FILE *infile) {
int w=0;
char c;
fseek(infile,0,SEEK_SET);
while (true) {
do {
c=fgetc(infile);
if (c == EOF || c == '\n') {
goto exitLoop;
}
} while (isspace(c));
if (isdigit(c) || c=='.' || c=='-') {
++w;
}
do {
c=fgetc(infile);
if (c == EOF || c == '\n') {
goto exitLoop;
}
} while (isdigit(c) || c=='.' || c=='-');
if (c==EOF || c == '\n') {
break;
}
}
exitLoop:
return w;
}
void fetchElementsIntoHostMemory(FILE *infile, REAL *h_idata) {
float num=0;
int i=0;
fseek(infile,0,SEEK_SET);
while (fscanf(infile,"%f",&num)==1) {
h_idata[i]=num;
++i;
}
}
void printToFile(MFILE *out, float *h_odata, int dimension) {
int count=0;
int move=0;
int num_elements=dimension*dimension;
while (num_elements>0) {
out->printf("%15f ",h_odata[move]);
++count;
++move;
if (count==dimension) {
out->printf("\n");
count=0;
}
--num_elements;
}
}

View File

@ -6,11 +6,21 @@ tuanle86@berkeley.edu
----------------------- Compiler ----------------------------
You will need to install gcc 4.3 and g++ 4.3. It appears that Cuda 3.0 SDK has not yet worked
with gcc 4.4 and g++ 4.4
with gcc 4.4 and g++ 4.4. Please follow these steps:
sudo aptitude install gcc-4.3 g++-4.3
cd ~/NVIDIA_GPU_Computing_SDK/C (go to SDK source dir. You should find the right path on your machine)
mkdir mygcc
cd mygcc
ln -s $(which g++-4.3) g++
ln -s $(which gcc-4.3) gcc
Note: mygcc directory must be created since I have made symlinks to the 4.3 compiler to mygcc directory
in my current Makefile.
----------------------- Linux Makefile ----------------------
Makefile needs to be edited on your machine before running. Please follow these steps:
common.mk file needs to be edited on your machine before running. Please follow these steps:
1) Open "boinc/samples/nvcuda/common.mk" with gedit
2) Ctrl+f and search for "tuanle". You will find the following:

View File

@ -0,0 +1,51 @@
Tuan Le
University of California, Berkeley
Berkeley Space Sciences Lab
tuanle86@berkeley.edu
----------------------- Mac OS Makefile ----------------------
By default, the NVIDIA Cuda SDK for MacOS is installed at $ROOT/Developer/ under the directory name "GPU Computing".
If it is installed somewhere else, then common_mac.mk file needs to be edited by the following steps:
1) Open "boinc/samples/nvcuda/common_mac.mk"
2) Look for "ROOTDIR ?= /DeveloperGPU\ Computing
3) Replace this path by appropriate path on your machine.
4) Done!
----------------------- Mac Troubleshooting ----------------
Error: ./example_app_nvcuda_mac: library not found for -lcutil_i386.
Solution: run make from "CPU Computing" directory to build the cutil_i386 library.
Note: It appears that nvcc compiler has trouble compiling .cu file that contains both BOINC and
Cuda code. Any attempt to compile such .cu files will result in errors like the following:
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(55): error: identifier "__builtin_ia32_emms" is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(68): error: identifier "__builtin_ia32_vec_init_v2si" is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(111): error: identifier "__builtin_ia32_vec_ext_v2si" is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(150): error: identifier "__builtin_ia32_packsswb" is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(165): error: identifier "__builtin_ia32_packssdw" is undefined
...........
The solution that I came up with is to compile BOINC and Cuda seperately. Put BOINC code in .c file and CUDA code in .cu file.
Then, write an external function in .cu file that will make kernel calls, do computation and return a result that could be used
in .c file.
----------------------- Run Executable file ------------------
"make -f Makefile_mac" command will create an executable file in "boinc/samples/nvcuda/darwin/release/".
If your machine doesn't have CUDA-enabled GPU, then the executable file for this sample app in
the release directory will stop execution after some output statements on the terminal. In this case,
it's best to run in emurelease mode. To generate an executable file in emurelease mode, type
"make -f Makefile_mac emu=1". The executable file is then created in "boinc/samples/nvcuda/darwin/emurelease/".
Note that NVIDIA Cuda SDK version older than version 3.0 will no longer support emulation mode.
Also, you need to define some environment variables before running the executable file.
export PATH=/usr/local/cuda/bin:$PATH
export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH