
CUDA dot product of two vectors does not work for N >= 369

I have implemented a vector dot product as follows (16 threads per block). It is compiled with CUDA 7.5 for compute_20, sm_20.

This happens with both float and double.

It works up to n=368, but beyond that the result is incorrect. I would like to know whether the problem is in my implementation code or in the values I am using (see the second code block, the initialization). For example, maybe the additions beyond n=368 introduce floating point error (which would be strange, since the error appears for both float and double at the same time).

const int THREADS_PER_BLOCK = 16;

int divUp(int total, int grain) { return (total+grain-1)/grain; } 

__device__ __forceinline__ double atomicAdd(double* address, double val) 
{ 
    unsigned long long int* address_as_ull = (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do 
    { 
     assumed = old; 
     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); 
    } 
    while(assumed!=old); 
    return __longlong_as_double(old); 
} 

__device__ __forceinline__ float atomicAdd(float* address, float val) 
{ 
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
     old = ret; 
     newint = __float_as_int(__int_as_float(old)+val); 
    } while((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 

template<typename T> 
__global__ void vecdotk(const T* a, const T* b, const int n, T* c) 
{ 
    __shared__ T temp[THREADS_PER_BLOCK]; 
    int x = threadIdx.x+blockIdx.x*blockDim.x; 
    if(x==0) c[0] = 0.0; 
    if(x<n) {temp[threadIdx.x] = a[x]*b[x]; 
    } 
    else temp[threadIdx.x] = 0.0; 
    __syncthreads(); 

    if(0==threadIdx.x) 
    { 
     T sum = 0.0; 
     for(int j=0; j<THREADS_PER_BLOCK; ++j) 
     { 
      sum += temp[j]; 
     } 
     atomicAdd(c, sum); 
    } 
} 

template<typename T> 
void dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaSafeCall(cudaGetLastError()); 
}; 

I use the following two host vectors to fill the input device arrays (which I am not showing here, since they are part of a larger library). Basically I want to compute the sum of a series of squares, i.e.

\sum_{i=1}^{n} i^2

// fill host vectors a and b 
for(int i=0; i<n; ++i) 
{ 
    h_vec_a[i] = i+1;//__mat_rand(); 
    h_vec_b[i] = i+1;//__mat_rand(); 
} 
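
Since both vectors contain the values 1..n, the expected exact result is the sum of squares, which can be checked on the host against the closed form n(n+1)(2n+1)/6. A minimal sketch of such a check (not part of the original code):

// exact reference value for 1^2 + 2^2 + ... + n^2, computed in double
// to avoid integer overflow for larger n
double expected = (double)n * (n + 1) * (2.0 * n + 1) / 6.0;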

Answer


This will not work:

if(x==0) c[0] = 0.0; 

There is no guarantee (in CUDA) that thread 0 runs first, or that that line will execute before other threads reach any point in the code. You need to initialize c[0] before launching the kernel. Otherwise, some threads may perform their atomic add into c, and then later thread 0 may initialize c[0] to zero.

Furthermore, CUDA already provides a float version of atomicAdd, so there is no reason to supply your own. Also, running thread blocks of 16 threads will not give you good performance (I would suggest simply using the CUBLAS dot product function). With the c[0] issue fixed (that line of code removed, and c[0] initialized before the kernel), your code runs correctly for me:

$ cat t372.cu 
#include <stdio.h> 

const int n = 2048; 
#ifdef USE_DOUBLE 
typedef double mytype; 
#else 
typedef float mytype; 
#endif 
const int THREADS_PER_BLOCK = 16; 

int divUp(int total, int grain) { return (total+grain-1)/grain; } 
#if 0 
__device__ __forceinline__ double atomicAdd(double* address, double val) 
{ 
    unsigned long long int* address_as_ull = (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do 
    { 
     assumed = old; 
     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); 
    } 
    while(assumed!=old); 
    return __longlong_as_double(old); 
} 

__device__ __forceinline__ float atomicAdd(float* address, float val) 
{ 
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
     old = ret; 
     newint = __float_as_int(__int_as_float(old)+val); 
    } while((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 
#endif 
template<typename T> 
__global__ void vecdotk(const T* a, const T* b, const int n, T* c) 
{ 
    __shared__ T temp[THREADS_PER_BLOCK]; 
    int x = threadIdx.x+blockIdx.x*blockDim.x; 
    //if(x==0) c[0] = 0.0; 
    if(x<n) {temp[threadIdx.x] = a[x]*b[x]; 
    } 
    else temp[threadIdx.x] = 0.0; 
    __syncthreads(); 

    if(0==threadIdx.x) 
    { 
     T sum = 0.0; 
     for(int j=0; j<THREADS_PER_BLOCK; ++j) 
     { 
      sum += temp[j]; 
     } 
     atomicAdd(c, sum); 
    } 
} 

template<typename T> 
cudaError_t dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaDeviceSynchronize(); 
    return cudaGetLastError(); 
}; 

int main(){ 

    mytype *h_vec_a, *h_vec_b, *d_vec_a, *d_vec_b, *h_c, *d_c; 
    int bs = n*sizeof(mytype); 
    h_vec_a = (mytype *)malloc(bs); 
    h_vec_b = (mytype *)malloc(bs); 
    h_c = (mytype *)malloc(sizeof(mytype)); 
    cudaMalloc(&d_vec_b, bs); 
    cudaMalloc(&d_vec_a, bs); 
    cudaMalloc(&d_c, sizeof(mytype)); 
// fill host vectors a and b 
    for(int i=0; i<n; ++i) 
    { 
    h_vec_a[i] = i+1;//__mat_rand(); 
    h_vec_b[i] = i+1;//__mat_rand(); 
    } 
    h_c[0] = 0; 
    cudaMemcpy(d_vec_a, h_vec_a, bs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_vec_b, h_vec_b, bs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_c, h_c, sizeof(mytype), cudaMemcpyHostToDevice); 
    dot(d_vec_a, d_vec_b, n, d_c); 
    cudaMemcpy(h_c, d_c, sizeof(mytype), cudaMemcpyDeviceToHost); 
    mytype test_val = 0; 
    for (int i=0; i < n; i++) 
    test_val += h_vec_a[i] * h_vec_b[i]; 
    printf("GPU result: %f, CPU result: %f\n", h_c[0], test_val); 

} 
$ nvcc -arch=sm_20 -o t372 t372.cu 
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). 
$ cuda-memcheck ./t372 
========= CUDA-MEMCHECK 
GPU result: 2865411584.000000, CPU result: 2865411072.000000 
========= ERROR SUMMARY: 0 errors 
$ 

The numerical difference in the last 3 digits is due to float limits, not due to any error in the code. If, for example, you change the initialization so that each vector is all 1s, you will get an exact result in that case.
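
For instance, a minimal variation of the initialization shown above (an illustrative sketch, not from the original answer) for which the exact dot product is simply n, well within float's exact integer range:

// fill host vectors a and b with all 1s; the exact dot product is then n
for(int i=0; i<n; ++i)
{
    h_vec_a[i] = 1;
    h_vec_b[i] = 1;
}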

Once again, there could be a number of criticisms made of your code from a performance standpoint. If you want a fast dot product, I would suggest using CUBLAS.
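
For reference, a minimal CUBLAS sketch for the float case (assuming d_vec_a and d_vec_b already hold n elements in device memory; compile and link with -lcublas):

#include <cublas_v2.h>

float gpu_dot(const float *d_a, const float *d_b, int n)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    float result = 0.0f;
    // result = sum over i of d_a[i] * d_b[i]; with the default (host)
    // pointer mode this call blocks until the result is available
    cublasSdot(handle, n, d_a, 1, d_b, 1, &result);
    cublasDestroy(handle);
    return result;
}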


I am learning the basics of CUDA and programming GPUs for general-purpose computation. But what if I use '__threadfence()' right after the line 'if(x==0) c[0] = 0.0;'? – user62039


Thanks Robert. Also, I think it would be better to do the initialization to 0.0 in the void dot(...) function. Right? – user62039


threadfence does not enforce the order in which threads execute. And yes, initializing c is fine as long as it happens before the kernel call. –
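
A minimal sketch of what that could look like in the host-side dot(...) wrapper (an assumption for illustration, not from the original thread; cudaMemset with zero bytes yields 0.0 for both float and double):

template<typename T>
cudaError_t dot(const T* a, const T* b, const int n, T* c)
{
    dim3 block(THREADS_PER_BLOCK);
    dim3 grid(divUp(n, block.x), 1);
    cudaMemset(c, 0, sizeof(T));   // initialize the accumulator before the launch
    vecdotk<T><<<grid, block>>>(a, b, n, c);
    cudaDeviceSynchronize();
    return cudaGetLastError();
}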