boinc/samples/nvcuda/cuda_kernel.cu

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
//
// This file contains kernel definition for matrix inversion. The external function
// "invert" serves as an interface between cuda_kernel.cu and cuda.cpp
//
// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues
// Contributor: Tuan Le (tuanle86@berkeley.edu)

// When VERIFY is defined, the sum of squared errors is calculated between the
// identity matrix and the product A * incerse(A). For debugging...
//#define VERIFY 1
#include <stdio.h>
#include <math.h>
#include <time.h>
#include "cuda_config.h"

__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k>i && k < n2 && AI[i*lda2+k]!=0) {
        REAL multiplyer = -AI[i*lda2+k]/AI[i*lda2+i];
        int n = n2 / 2;
        for (int j = i+1; j < n; j++) {
            AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
        }
    }
}

__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < n2) {
        AI[i*lda2+k] /= diag;
    }
}

__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k > i && k < n2) {
        REAL multiplyer = -AI[i*lda2+k];
        for (int j = 0; j < i; j++) {
            AI[j*lda2+k] += multiplyer*AI[j*lda2+i];
        }
    }
}

/* Helper function for invert. Kernel calls are made in this function */
void invertge(REAL * AI_d, int lda, int n) {
    int lda2 = lda * 2;
    // perform elementary row operations till A in AI becomes identity matrix
    for (int i = 0; i < n; i++) {
        GEStep1A<<<(int)ceil((float)(1+(2*n-1)/32)),32>>>(AI_d,i,n*2, lda2);
        CUDACHECK;
        cudaThreadSynchronize();
    }

    for (int i = n-1; i >= 0; i--) {
        REAL diag = 1.0;
        SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));
        GEStep2<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,diag,i,n*2, lda2);
        CUDACHECK;

        GEStep3<<<(int)ceil((float)(1+(n*2-1)/32)),32>>>(AI_d,i,n*2, lda2);
        CUDACHECK;
        cudaThreadSynchronize();
        CUDACHECK;
    }
}

/* inverts nxn matrix A and stores result back in A */
extern void invert(REAL * A, int n) {
    fprintf(stderr,"starting inversion n = %d ", n);
    volatile clock_t gputime;
    gputime=clock();

    int lda = ((n+15)&~15|16);
    REAL * AI = (REAL *)malloc(sizeof(REAL)*(n*lda*2));
    memset(AI,0,sizeof(REAL)*n*lda*2);
    for (int i = 0; i < n; i++) {
        memcpy(&AI[lda*i*2], &A[n*i], sizeof(REAL)*n);
        AI[lda*i*2+n+i] = 1;
    }

    REAL * AI_d;
    SAFECALL(cudaMalloc((void **) &AI_d, sizeof(REAL)*n*lda*2));
    SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)*n*lda*2, cudaMemcpyHostToDevice));

    invertge(AI_d, lda, n);
    SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)*n*lda*2, cudaMemcpyDeviceToHost));
    cudaFree(AI_d);
    gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
    fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);

#ifdef VERIFY	
	// let's verify that
    REAL error=0.0;
    // multiply inverse*xcopy, should be Identity matrix
    for (int k = 0; k < n; k++) {
        for (int j = 0; j < n; j++) {
            REAL sum = 0;
            for (int i = 0; i < n; i++) {
                sum += AI[j*lda*2+n+i]*A[i*n+k];
            }
            if (j!=k) {
                error += sum * sum;
            } else {
                error += (1.0-sum) * (1.0-sum);
            }
        }
    }
    fprintf(stderr, " %6.2f SSE", error);
#endif	

    for (int i = 0; i < n; i++) {
        memcpy(&A[n*i], &AI[lda*i*2+n], sizeof(REAL)*n);
    }
    free(AI);
    fprintf(stderr," done!\n");
}
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`// This file is part of BOINC.`
			`// http://boinc.berkeley.edu`
			`// Copyright (C) 2008 University of California`
			`//`
			`// BOINC is free software; you can redistribute it and/or modify it`
			`// under the terms of the GNU Lesser General Public License`
			`// as published by the Free Software Foundation,`
			`// either version 3 of the License, or (at your option) any later version.`
			`//`
			`// BOINC is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.`
			`// See the GNU Lesser General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU Lesser General Public License`
			`// along with BOINC. If not, see <http://www.gnu.org/licenses/>.`
			`//`
			`// This file contains kernel definition for matrix inversion. The external function`
			`// "invert" serves as an interface between cuda_kernel.cu and cuda.cpp`
			`//`
			`// See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues`
			`// Contributor: Tuan Le (tuanle86@berkeley.edu)`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
			`// When VERIFY is defined, the sum of squared errors is calculated between the`
			`// identity matrix and the product A * incerse(A). For debugging...`
			`//#define VERIFY 1`
- samples: plug in a skeleton CUDA sample with nothing in it. Setup Windows project environment so the thing will at least compile. The nvcc compiler is a bit touchy. samples/cuda cuda.cu cuda_kernel.cu win_build/ boinc.sln cuda.rules cuda.vcproj svn path=/trunk/boinc/; revision=21655 2010-05-28 21:25:27 +00:00			`#include <stdio.h>`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`#include <math.h>`
			`#include <time.h>`
svn path=/trunk/boinc/; revision=21920 2010-07-13 21:36:29 +00:00			`#include "cuda_config.h"`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
			`__global__ void GEStep1A(REAL * AI, int i, int n2, int lda2) {`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`int k = blockIdx.x * blockDim.x + threadIdx.x;`
			`if (k>i && k < n2 && AI[i*lda2+k]!=0) {`
			`REAL multiplyer = -AI[ilda2+k]/AI[ilda2+i];`
			`int n = n2 / 2;`
			`for (int j = i+1; j < n; j++) {`
			`AI[jlda2+k] += multiplyerAI[j*lda2+i];`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`}`
- samples: plug in a skeleton CUDA sample with nothing in it. Setup Windows project environment so the thing will at least compile. The nvcc compiler is a bit touchy. samples/cuda cuda.cu cuda_kernel.cu win_build/ boinc.sln cuda.rules cuda.vcproj svn path=/trunk/boinc/; revision=21655 2010-05-28 21:25:27 +00:00			`}`

svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`__global__ void GEStep2(REAL * AI,REAL diag,int i, int n2, int lda2) {`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`int k = blockIdx.x * blockDim.x + threadIdx.x;`
			`if (k < n2) {`
			`AI[i*lda2+k] /= diag;`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`}`

			`__global__ void GEStep3(REAL * AI,int i, int n2, int lda2) {`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`int k = blockIdx.x * blockDim.x + threadIdx.x;`
			`if (k > i && k < n2) {`
			`REAL multiplyer = -AI[i*lda2+k];`
			`for (int j = 0; j < i; j++) {`
			`AI[jlda2+k] += multiplyerAI[j*lda2+i];`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`}`
			`}`

BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`/* Helper function for invert. Kernel calls are made in this function */`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`void invertge(REAL * AI_d, int lda, int n) {`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`int lda2 = lda * 2;`
			`// perform elementary row operations till A in AI becomes identity matrix`
			`for (int i = 0; i < n; i++) {`
			`GEStep1A<<<(int)ceil((float)(1+(2n-1)/32)),32>>>(AI_d,i,n2, lda2);`
			`CUDACHECK;`
			`cudaThreadSynchronize();`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`for (int i = n-1; i >= 0; i--) {`
			`REAL diag = 1.0;`
			`SAFECALL(cudaMemcpy(&diag, &AI_d[i*lda2+i], sizeof(REAL), cudaMemcpyDeviceToHost));`
			`GEStep2<<<(int)ceil((float)(1+(n2-1)/32)),32>>>(AI_d,diag,i,n2, lda2);`
			`CUDACHECK;`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`GEStep3<<<(int)ceil((float)(1+(n2-1)/32)),32>>>(AI_d,i,n2, lda2);`
			`CUDACHECK;`
			`cudaThreadSynchronize();`
			`CUDACHECK;`
			`}`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
			`/* inverts nxn matrix A and stores result back in A */`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`extern void invert(REAL * A, int n) {`
			`fprintf(stderr,"starting inversion n = %d ", n);`
remove some unused variables svn path=/trunk/boinc/; revision=21960 2010-07-15 20:42:13 +00:00			`volatile clock_t gputime;`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`gputime=clock();`

			`int lda = ((n+15)&~15\|16);`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`REAL * AI = (REAL )malloc(sizeof(REAL)(nlda2));`
			`memset(AI,0,sizeof(REAL)nlda*2);`
			`for (int i = 0; i < n; i++) {`
			`memcpy(&AI[ldai2], &A[ni], sizeof(REAL)n);`
			`AI[ldai2+n+i] = 1;`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`REAL * AI_d;`
			`SAFECALL(cudaMalloc((void *) &AI_d, sizeof(REAL)nlda2));`
			`SAFECALL(cudaMemcpy(AI_d, AI, sizeof(REAL)nlda*2, cudaMemcpyHostToDevice));`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`invertge(AI_d, lda, n);`
			`SAFECALL(cudaMemcpy(AI, AI_d, sizeof(REAL)nlda*2, cudaMemcpyDeviceToHost));`
			`cudaFree(AI_d);`
			`gputime=clock()-gputime;fprintf(stderr, " %7.1f ms ",gputime/1.e3f);`
			`fprintf(stderr, " %7.2f Gflops", 1e-3(3.0)nnn/3.0/gputime);`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00
			`#ifdef VERIFY`
			`// let's verify that`
BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`REAL error=0.0;`
			`// multiply inverse*xcopy, should be Identity matrix`
			`for (int k = 0; k < n; k++) {`
			`for (int j = 0; j < n; j++) {`
			`REAL sum = 0;`
			`for (int i = 0; i < n; i++) {`
			`sum += AI[jlda2+n+i]A[in+k];`
			`}`
			`if (j!=k) {`
			`error += sum * sum;`
			`} else {`
			`error += (1.0-sum) * (1.0-sum);`
			`}`
			`}`
			`}`
svn path=/trunk/boinc/; revision=21726 2010-06-10 17:44:32 +00:00			`fprintf(stderr, " %6.2f SSE", error);`
			`#endif`

BOINC's LGPL license added code formated functions renamed comments added svn path=/trunk/boinc/; revision=21899 2010-07-13 18:03:00 +00:00			`for (int i = 0; i < n; i++) {`
			`memcpy(&A[ni], &AI[ldai2+n], sizeof(REAL)n);`
			`}`
			`free(AI);`
			`fprintf(stderr," done!\n");`
			`}`