#include <iostream> 
#include <assert.h> 
#include <sys/time.h> 
#include <cstdlib> // srand, rand 
#include <ctime> // time 

#define BLOCK_SIZE 32 // CUDA block size 

__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) { 
    if (row<matSize && col<matSize) {return matrix[row*matSize + col];} 
    return 0; 
} 

__device__ inline int getValFromVector(int* vector, int row, int matSize) { 
    if (row<matSize) {return vector[row];} 
    return 0; 
} 

__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) { 
    __shared__ int aRowShared[BLOCK_SIZE]; 
    __shared__ int bShared[BLOCK_SIZE]; 
    __shared__ int myRow; 
    __shared__ double rowSum; 

    int myIndexInBlock = threadIdx.x; 
    myRow = blockIdx.x; 
    rowSum = 0; 

    for (int m = 0; m < (matSize/BLOCK_SIZE + 1);m++) { 
     aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize); 
     bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize); 

     __syncthreads(); // Sync threads to make sure all threads in the block have finished writing to aRowShared and bShared 

     if (myIndexInBlock==0) { 
      for (int k=0;k<BLOCK_SIZE;k++) { 
       rowSum += aRowShared[k] * bShared[k]; 
      } 
     } 
    } 

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;} 
} 

static inline void cudaCheckReturn(cudaError_t result) { 
    if (result != cudaSuccess) { 
     std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl; 
     assert(result == cudaSuccess); 
    } 
} 

static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) { 
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row 

    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost)); 
} 


static void matVecMult(int** A,int* b, int* c, int matSize) { 
    // Sequential implementation: 
    for (int i=0;i<matSize;i++) { 
     c[i]=0; 
     for (int j=0;j<matSize;j++) { 
      c[i]+=(A[i][j] * b[j]); 
     } 
    } 
} 

int main() { 
    int matSize = 1000; 

    int** A,* b,* c; 
    int* aOnGPU,* bOnGPU,* cOnGPU; 

    A = new int*[matSize]; 
    for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();} 
    b = new int[matSize](); 
    c = new int[matSize](); 

    int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int); 

    cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch? 
    cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU)); 
    cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU)); 

    srand(time(NULL)); 

    for (int i=0;i<matSize;i++) { 
     b[i] = rand()%100; 
     for (int j=0;j<matSize;j++) { 
      A[i][j] = rand()%100; 
     } 
    } 

    for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));} 
    cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice)); 

    int iters=1; 
    timeval start,end; 

    // Sequential run: 
    gettimeofday(&start,NULL); 
    for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);} 
    gettimeofday(&end,NULL); 
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl; 

    // CUDA run: 
    gettimeofday(&start,NULL); 
    for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);} 
    gettimeofday(&end,NULL); 
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl; 

    cudaCheckReturn(cudaFree(aOnGPU)); 
    cudaCheckReturn(cudaFree(bOnGPU)); 
    cudaCheckReturn(cudaFree(cOnGPU)); 


    for (int i = 0; i < matSize; ++i) { 
     delete[] A[i]; 
    } 
    delete[] A; 
    delete[] b; 
    delete[] c; 
} 

Can't get matrix*vector multiplication to run faster with CUDA than on the CPU. The code above gives:

267171 
580253 

I followed the guide at http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do a matrix multiplication. I use shared memory for both the matrix (A) and the vector (B), but no matter which matrix size (100*100 up to 20000*20000) or block size (32 to 1024) I choose, the sequential implementation always beats the CUDA implementation in speed, by roughly a factor of two.

Since I'm doing a matrix*vector multiplication, the shared arrays and blocks are handled a bit differently: I use one block per row of the matrix instead of a 2D block over a portion of the matrix.

Is my implementation wrong, or is CUDA simply not faster than the CPU here?

Answer


First: in your CUDA implementation you perform bounds checks that are not present in the CPU version. Branching is very expensive on a GPU.
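To illustrate that first point, here is a minimal sketch of the same kernel with the bounds checks removed. It assumes the host pads the matrix and vector to a multiple of BLOCK_SIZE (the paddedSize parameter is hypothetical), keeps the partial sum in a per-thread register instead of shared memory, and adds a second __syncthreads() so a tile is not overwritten while it is still being read:

__global__ void matVecMultNoBoundsKernel(const int* aOnGPU, const int* bOnGPU, int* cOnGPU, int paddedSize) { 
    __shared__ int aRowShared[BLOCK_SIZE]; 
    __shared__ int bShared[BLOCK_SIZE]; 

    int myIndexInBlock = threadIdx.x; 
    int myRow = blockIdx.x; // one block per matrix row 
    int rowSum = 0; // per-thread register, no shared-memory race 

    for (int m = 0; m < paddedSize/BLOCK_SIZE; m++) { 
     // No bounds checks: paddedSize is assumed to be a multiple of BLOCK_SIZE 
     aRowShared[myIndexInBlock] = aOnGPU[myRow*paddedSize + m*BLOCK_SIZE + myIndexInBlock]; 
     bShared[myIndexInBlock] = bOnGPU[m*BLOCK_SIZE + myIndexInBlock]; 

     __syncthreads(); // both tiles fully loaded 

     if (myIndexInBlock==0) { 
      for (int k=0;k<BLOCK_SIZE;k++) { 
       rowSum += aRowShared[k] * bShared[k]; 
      } 
     } 

     __syncthreads(); // don't overwrite the tiles while thread 0 is still reading them 
    } 

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;} 
} 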

Second: you are including the cudaMemcpy in the CUDA timing. It is very rare to perform only a single multiplication before returning the result to the CPU. Usually (in conjugate gradient, for example) you have to perform several hundred multiplications on the GPU before copying back.
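A minimal sketch of what that kind of measurement could look like in main(), using the variables already defined there: the kernel is launched iters times back-to-back, the launches are timed with CUDA events, and the result is copied back to the host only once at the end:

cudaEvent_t kernelStart, kernelStop; 
cudaCheckReturn(cudaEventCreate(&kernelStart)); 
cudaCheckReturn(cudaEventCreate(&kernelStop)); 

cudaCheckReturn(cudaEventRecord(kernelStart)); 
for (int i=0;i<iters;i++) { 
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); 
} 
cudaCheckReturn(cudaEventRecord(kernelStop)); 
cudaCheckReturn(cudaEventSynchronize(kernelStop)); // wait for all launches to finish 

float kernelMs = 0.0f; 
cudaCheckReturn(cudaEventElapsedTime(&kernelMs,kernelStart,kernelStop)); 
std::cout << "kernel time (ms): " << kernelMs << std::endl; 

// One copy back after all multiplications, as in an iterative solver: 
cudaCheckReturn(cudaMemcpy(c,cOnGPU,bcSizeOnGPU,cudaMemcpyDeviceToHost)); 

cudaCheckReturn(cudaEventDestroy(kernelStart)); 
cudaCheckReturn(cudaEventDestroy(kernelStop)); 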

Third: don't implement this yourself (except for educational purposes); use the vendor libraries (such as cuBLAS, which ships with every CUDA release), which are extremely hard to beat.
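For reference, a minimal sketch of the same operation with cuBLAS. cuBLAS works on floating-point data, so this assumes float device buffers (dA, dX, dY are hypothetical, already allocated and filled) rather than the int arrays above, and it passes CUBLAS_OP_T because cuBLAS treats the matrix as column-major while the matrix here is stored row-major:

#include <cublas_v2.h> 

// y = A * x, with A a matSize x matSize row-major matrix on the device. 
// The handle is created once elsewhere with cublasCreate(&handle). 
void matVecMultCublas(cublasHandle_t handle, const float* dA, const float* dX, float* dY, int matSize) { 
    const float alpha = 1.0f, beta = 0.0f; 
    // Column-major cuBLAS sees the row-major A as A^T; CUBLAS_OP_T transposes it back. 
    cublasSgemv(handle,CUBLAS_OP_T,matSize,matSize,&alpha,dA,matSize,dX,1,&beta,dY,1); 
} 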