
CUDA dot product of two vectors does not work for N >= 369

I have implemented a vector dot product as follows (16 threads per block). It is compiled with CUDA 7.5 for compute_20, sm_20.

This happens with both float and double.

It works up to n=368, but beyond that the result is incorrect. I would like to know whether the problem is in my implementation code or in the values I am using (see the second code block, the initialization). For example, maybe the additions beyond n=368 introduce floating point error (which would be strange, since the error appears for both float and double at the same time).

const int THREADS_PER_BLOCK = 16;

int divUp(int total, int grain) { return (total+grain-1)/grain; } 

__device__ __forceinline__ double atomicAdd(double* address, double val) 
{ 
    unsigned long long int* address_as_ull = (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do 
    { 
     assumed = old; 
     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); 
    } 
    while(assumed!=old); 
    return __longlong_as_double(old); 
} 

__device__ __forceinline__ float atomicAdd(float* address, float val) 
{ 
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
     old = ret; 
     newint = __float_as_int(__int_as_float(old)+val); 
    } while((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 

template<typename T> 
__global__ void vecdotk(const T* a, const T* b, const int n, T* c) 
{ 
    __shared__ T temp[THREADS_PER_BLOCK]; 
    int x = threadIdx.x+blockIdx.x*blockDim.x; 
    if(x==0) c[0] = 0.0; 
    if(x<n) {temp[threadIdx.x] = a[x]*b[x]; 
    } 
    else temp[threadIdx.x] = 0.0; 
    __syncthreads(); 

    if(0==threadIdx.x) 
    { 
     T sum = 0.0; 
     for(int j=0; j<THREADS_PER_BLOCK; ++j) 
     { 
      sum += temp[j]; 
     } 
     atomicAdd(c, sum); 
    } 
} 

template<typename T> 
void dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaSafeCall(cudaGetLastError()); 
}; 

I use the following two host vectors to fill the input device arrays (which I am not showing here, since they are part of a larger library). Basically I want to compute the sum of a series of squares, i.e.

\sum_{i=1}^{n} i^2

// fill host vectors a and b 
for(int i=0; i<n; ++i) 
{ 
    h_vec_a[i] = i+1;//__mat_rand(); 
    h_vec_b[i] = i+1;//__mat_rand(); 
} 
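
Since both vectors contain the values 1..n, the expected exact result is the sum of squares, which can be checked on the host against the closed form n(n+1)(2n+1)/6. A minimal sketch of such a check (not part of the original code):

// exact reference value for 1^2 + 2^2 + ... + n^2, computed in double
// to avoid integer overflow for larger n
double expected = (double)n * (n + 1) * (2.0 * n + 1) / 6.0;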

Answer


This will not work:

if(x==0) c[0] = 0.0; 

There is no guarantee (in CUDA) that thread 0 runs first, or that that line will execute before other threads reach any point in the code. You need to initialize c[0] before launching the kernel. Otherwise, some threads may perform their atomic add into c, and then later thread 0 may initialize c[0] to zero.

Furthermore, CUDA already provides a float version of atomicAdd, so there is no reason to supply your own. Also, running thread blocks of 16 threads will not give you good performance (I would suggest simply using the CUBLAS dot product function). With the c[0] issue fixed (that line of code removed, and c[0] initialized before the kernel), your code runs correctly for me:

$ cat t372.cu 
#include <stdio.h> 

const int n = 2048; 
#ifdef USE_DOUBLE 
typedef double mytype; 
#else 
typedef float mytype; 
#endif 
const int THREADS_PER_BLOCK = 16; 

int divUp(int total, int grain) { return (total+grain-1)/grain; } 
#if 0 
__device__ __forceinline__ double atomicAdd(double* address, double val) 
{ 
    unsigned long long int* address_as_ull = (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do 
    { 
     assumed = old; 
     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); 
    } 
    while(assumed!=old); 
    return __longlong_as_double(old); 
} 

__device__ __forceinline__ float atomicAdd(float* address, float val) 
{ 
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
     old = ret; 
     newint = __float_as_int(__int_as_float(old)+val); 
    } while((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 
#endif 
template<typename T> 
__global__ void vecdotk(const T* a, const T* b, const int n, T* c) 
{ 
    __shared__ T temp[THREADS_PER_BLOCK]; 
    int x = threadIdx.x+blockIdx.x*blockDim.x; 
    //if(x==0) c[0] = 0.0; 
    if(x<n) {temp[threadIdx.x] = a[x]*b[x]; 
    } 
    else temp[threadIdx.x] = 0.0; 
    __syncthreads(); 

    if(0==threadIdx.x) 
    { 
     T sum = 0.0; 
     for(int j=0; j<THREADS_PER_BLOCK; ++j) 
     { 
      sum += temp[j]; 
     } 
     atomicAdd(c, sum); 
    } 
} 

template<typename T> 
cudaError_t dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaDeviceSynchronize(); 
    return cudaGetLastError(); 
}; 

int main(){ 

    mytype *h_vec_a, *h_vec_b, *d_vec_a, *d_vec_b, *h_c, *d_c; 
    int bs = n*sizeof(mytype); 
    h_vec_a = (mytype *)malloc(bs); 
    h_vec_b = (mytype *)malloc(bs); 
    h_c = (mytype *)malloc(sizeof(mytype)); 
    cudaMalloc(&d_vec_b, bs); 
    cudaMalloc(&d_vec_a, bs); 
    cudaMalloc(&d_c, sizeof(mytype)); 
// fill host vectors a and b 
    for(int i=0; i<n; ++i) 
    { 
    h_vec_a[i] = i+1;//__mat_rand(); 
    h_vec_b[i] = i+1;//__mat_rand(); 
    } 
    h_c[0] = 0; 
    cudaMemcpy(d_vec_a, h_vec_a, bs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_vec_b, h_vec_b, bs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_c, h_c, sizeof(mytype), cudaMemcpyHostToDevice); 
    dot(d_vec_a, d_vec_b, n, d_c); 
    cudaMemcpy(h_c, d_c, sizeof(mytype), cudaMemcpyDeviceToHost); 
    mytype test_val = 0; 
    for (int i=0; i < n; i++) 
    test_val += h_vec_a[i] * h_vec_b[i]; 
    printf("GPU result: %f, CPU result: %f\n", h_c[0], test_val); 

} 
$ nvcc -arch=sm_20 -o t372 t372.cu 
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). 
$ cuda-memcheck ./t372 
========= CUDA-MEMCHECK 
GPU result: 2865411584.000000, CPU result: 2865411072.000000 
========= ERROR SUMMARY: 0 errors 
$ 

The numerical difference in the last 3 digits is due to float limits, not due to any error in the code. If, for example, you change the initialization so that each vector is all 1s, you will get an exact result in that case.
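
For instance, a minimal variation of the initialization shown above (an illustrative sketch, not from the original answer) for which the exact dot product is simply n, well within float's exact integer range:

// fill host vectors a and b with all 1s; the exact dot product is then n
for(int i=0; i<n; ++i)
{
    h_vec_a[i] = 1;
    h_vec_b[i] = 1;
}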

Once again, there could be a number of criticisms made of your code from a performance standpoint. If you want a fast dot product, I would suggest using CUBLAS.
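
For reference, a minimal CUBLAS sketch for the float case (assuming d_vec_a and d_vec_b already hold n elements in device memory; compile and link with -lcublas):

#include <cublas_v2.h>

float gpu_dot(const float *d_a, const float *d_b, int n)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    float result = 0.0f;
    // result = sum over i of d_a[i] * d_b[i]; with the default (host)
    // pointer mode this call blocks until the result is available
    cublasSdot(handle, n, d_a, 1, d_b, 1, &result);
    cublasDestroy(handle);
    return result;
}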


I am learning the basics of CUDA and programming GPUs for general-purpose computation. But what if I use '__threadfence()' right after the line 'if(x==0) c[0] = 0.0;'? – user62039


Thanks Robert. Also, I think it would be better to do the initialization to 0.0 in the void dot(...) function. Right? – user62039


threadfence does not enforce the order in which threads execute. And yes, initializing c is fine as long as it happens before the kernel call. –
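
A minimal sketch of what that could look like in the host-side dot(...) wrapper (an assumption for illustration, not from the original thread; cudaMemset with zero bytes yields 0.0 for both float and double):

template<typename T>
cudaError_t dot(const T* a, const T* b, const int n, T* c)
{
    dim3 block(THREADS_PER_BLOCK);
    dim3 grid(divUp(n, block.x), 1);
    cudaMemset(c, 0, sizeof(T));   // initialize the accumulator before the launch
    vecdotk<T><<<grid, block>>>(a, b, n, c);
    cudaDeviceSynchronize();
    return cudaGetLastError();
}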