GPU Gems 3 中前綴掃描（prefix scan）的 CUDA 示例代碼是否正確？

我寫了一段代碼來調用 GPU Gems 3 中的前綴掃描內核。

但我得到的結果是一堆負數，而不是前綴掃描的結果。我想知道是我的內核調用有誤，還是 GPU Gems 3 的代碼本身有問題？

這裏是我的代碼：

#include <stdio.h> 
#include <sys/time.h> 
#include <cuda.h> 



// Single-block tree-based prefix scan, transcribed by the asker from the
// GPU Gems 3 scan chapter: an up-sweep that builds partial sums in place,
// followed by a down-sweep after the last element is cleared to zero.
//
// Parameters:
//   g_odata - output array; each thread writes elements 2*thid and 2*thid+1.
//   g_idata - input array; each thread reads elements 2*thid and 2*thid+1.
//   n       - element count that drives both tree loops.
//             NOTE(review): the halving/doubling loops presumably require n to
//             be a power of two and the block to have n/2 threads - confirm.
//   dim     - not referenced anywhere in the body.
//
// Only threadIdx.x is used for indexing (no blockIdx), and there are no bounds
// checks on the loads/stores. The shared array `temp` is dynamically sized, so
// the launch must pass its byte size as the third execution-configuration
// argument.
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim) 
{ 
    extern __shared__ int temp[];// allocated on invocation 
    int thid = threadIdx.x; 
    int offset = 1; 

    temp[2*thid] = g_idata[2*thid]; // load input into shared memory 
    temp[2*thid+1] = g_idata[2*thid+1]; 
    for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree 
    { 
    // Barrier at the top of every level so the previous level's writes to
    // temp are visible before they are read.
    __syncthreads(); 
    if (thid < d) 
    { 
    int ai = offset*(2*thid+1)-1; 
    int bi = offset*(2*thid+2)-1; 
    // NOTE(review): this adds the ORIGINAL input value g_idata[ai] from global
    // memory, not the partial sum accumulated in temp[ai]. The up-sweep is
    // meant to combine partial sums held in temp, so this looks like a
    // transcription error (expected: temp[bi] += temp[ai]).
    temp[bi] += g_idata[ai]; 
    } 
    offset *= 2; 
    } 
    if (thid == 0) { temp[n - 1] = 0; } // clear the last element 
    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan 
    { 
    offset >>= 1; 
    __syncthreads(); 
    if (thid < d) 
    { 
    int ai = offset*(2*thid+1)-1; 
    int bi = offset*(2*thid+2)-1; 
    // Swap-and-add: the left child takes the parent's value, the parent slot
    // becomes parent + old left child.
    int t = temp[ai]; 
    temp[ai] = temp[bi]; 
    temp[bi] += t; 
    } 
    } 
    // Make the final down-sweep writes visible before copying out.
    __syncthreads(); 
    g_odata[2*thid] = temp[2*thid]; // write results to device memory 
    g_odata[2*thid+1] = temp[2*thid+1]; 

} 


// Fills h_in[0 .. num_items) with the ascending values 0, 1, 2, ... and then
// prints a short banner (" input: " followed by two newlines) exactly once.
//
//   h_in      - host array to fill; must hold at least num_items ints.
//   num_items - number of elements to write; nothing is written when <= 0.
void Initialize(int *h_in,int num_items)
{
    int *cursor = h_in;
    for (int value = 0; value < num_items; ++value) {
        *cursor++ = value;
    }

    // The banner is printed after the fill loop, not once per element.
    printf(" input: ");
    printf("\n\n");
}


// Host driver for the scan kernel: fills a 512-element input with 0..511,
// copies it to the device, launches `kernel` as one block of 256 threads with
// 512 ints of dynamic shared memory, copies the result back and prints the
// first 512 output values.
//
// The device/host output buffers hold one element more than the input; that
// extra element is copied back but never printed. Only the two cudaMemcpy
// calls have their return codes inspected.
int main(int argc, char** argv)
{
    const int num_items = 512;
    const size_t in_bytes = sizeof(int) * num_items;
    const size_t out_bytes = sizeof(int) * (num_items + 1);

    // Host input, filled with ascending values.
    int* h_in = new int[num_items];
    Initialize(h_in, num_items);

    // Device input.
    int* d_in = NULL;
    cudaMalloc((void**)&d_in, in_bytes);
    cudaError_t upload_status = cudaMemcpy(d_in, h_in, in_bytes, cudaMemcpyHostToDevice);
    if (upload_status != cudaSuccess) {
        fprintf(stderr,"could not copy to gpu");
    }

    // Device output (one spare element).
    int* d_out = NULL;
    cudaMalloc((void**)&d_out, out_bytes);

    // One block, 256 threads, num_items ints of dynamic shared memory.
    kernel<<<1, 256, in_bytes>>>(d_out, d_in, num_items, 2);

    // The blocking copy back also waits for the kernel to finish.
    int* h_out = new int[num_items + 1];
    cudaError_t download_status = cudaMemcpy(h_out, d_out, out_bytes, cudaMemcpyDeviceToHost);
    if (download_status != cudaSuccess) {
        fprintf(stderr,"could not copy back");
    }

    printf(" \n");
    for (int idx = 0; idx < num_items; ++idx) {
        printf(" ,%d ", h_out[idx]);
    }

    // Release host and device buffers.
    if (h_in != NULL) {
        delete[] h_in;
    }
    if (h_out != NULL) {
        delete[] h_out;
    }
    if (d_in != NULL) {
        cudaFree(d_in);
    }
    if (d_out != NULL) {
        cudaFree(d_out);
    }

    printf("\n\n");

    return 0;
}

來源

2015-06-14 dibid

你的內核使用了動態共享內存，甚至還帶着「allocated on invocation」（在調用時分配）的註釋，但你並沒有在內核調用時分配共享內存——它應該作爲內核啓動配置的第 3 個參數傳入，參見 http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration –

所以我通過調用 kernel<<<1, 256, 256*sizeof(int)>>>(g_odata, g_idata, n) 在內核調用時分配了共享內存，但結果中仍然出現一些負數。有誰知道我做錯了什麼？ – dibid

請把您*實際*運行的代碼發佈到您的問題中。問題裏目前發佈的版本存在很多嚴重錯誤，但那顯然不是您現在正在運行的代碼。 – talonmies

看來你在把 GPU Gems 3 該章節的代碼轉錄到你的內核時，至少犯了一個錯誤。下面這一行是不正確的：

temp[bi] += g_idata[ai];

它應該是：

temp[bi] += temp[ai];

當我對你現在發佈的代碼只做這一處修改後，它在我這裏似乎就能打印出正確的（exclusive scan，即排他掃描）前綴和。另外還有幾點我想提一下：

即使不做這處修改，我也能得到接近正確的結果。因此，如果你得到的是差異很大的結果（例如負數），那麼你的機器設置或 CUDA 安裝可能有問題。我建議你使用比現在更嚴格的 CUDA 錯誤檢查（cuda error checking）（不過機器設置方面的問題本應已在你現有的某一項檢查中顯示出來）。
按現有寫法，這個例程會有一些限制。它只能在單個線程塊中使用，訪問共享內存時會發生 bank 衝突，並且數據集大小受限於單個線程塊所能處理的範圍（此例程每個線程產生兩個輸出元素，所以數據集大小應等於線程數的兩倍）。如前所述，動態共享內存的分配需要與數據集大小一樣大（即以元素個數計，爲線程數的兩倍）。
這對學習可能有用，但如果你想要一個健壯、快速的前綴掃描，建議你使用 thrust 或 cub 中的例程，而不是你自己的代碼——即使你的代碼源自這篇（較舊的）文章。

下面的代碼與你的類似，但已修正上述問題，並且我把內核模板化了，以便用於各種數據類型：

#include <stdio.h>
#include <stdlib.h>
// Number of elements scanned by this example.
#define DSIZE 512

// Reads (and clears) the CUDA runtime's last-error state via
// cudaGetLastError(). If an error is pending, prints `msg`, the runtime's
// error string and the source location to stderr, then terminates the process
// with exit status 1. Otherwise does nothing.
//
// Wrapped in do { ... } while (0) so it behaves as a single statement, e.g.
// inside an unbraced if/else. Each continuation backslash must be the very
// last character on its line: a backslash followed by whitespace is not a
// standard line splice.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// Element type used by the host driver in this example.
typedef int mytype;

// Single-block, work-efficient exclusive prefix sum (up-sweep / down-sweep),
// after the GPU Gems 3 scan chapter.
//
// Parameters:
//   g_odata - device output, n elements; g_odata[i] = sum of g_idata[0 .. i-1],
//             with g_odata[0] = 0.
//   g_idata - device input, n elements.
//   n       - number of elements. Must be a power of two; the tree index
//             arithmetic is not valid for other sizes.
//
// Launch requirements:
//   - exactly one block (only threadIdx.x is used for indexing);
//   - at least n/2 threads (each thread handles two elements); surplus
//     threads are masked out by the bounds checks below;
//   - n * sizeof(T) bytes of dynamic shared memory as the third
//     execution-configuration argument.
//
// Shared-memory accesses are not padded, so bank conflicts are expected.
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
    // Declared as raw bytes and cast: `extern __shared__ T temp[]` would give
    // the same extern array a different type in every template instantiation,
    // which fails to compile once the kernel is instantiated for a second T.
    extern __shared__ __align__(16) unsigned char prescan_smem[];
    T *temp = reinterpret_cast<T *>(prescan_smem);

    // n is uniform across the block, so every thread leaves together and no
    // barrier below is skipped by only part of the block.
    if (n <= 0) return;

    const int thid = threadIdx.x;
    const int lo = 2 * thid;
    const int hi = lo + 1;
    int offset = 1;

    // Load input into shared memory; guarded so surplus threads (or n == 1)
    // never touch memory past the end of the arrays.
    if (lo < n) temp[lo] = g_idata[lo];
    if (hi < n) temp[hi] = g_idata[hi];

    // Up-sweep: build partial sums in place up the tree.
    for (int d = n >> 1; d > 0; d >>= 1)
    {
        __syncthreads();  // previous level's writes must be visible
        if (thid < d)
        {
            const int ai = offset * (2 * thid + 1) - 1;
            const int bi = offset * (2 * thid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }

    // Clear the root so the down-sweep yields an exclusive scan.
    if (thid == 0) { temp[n - 1] = 0; }

    // Down-sweep: push prefix values back down the tree.
    for (int d = 1; d < n; d *= 2)
    {
        offset >>= 1;
        __syncthreads();
        if (thid < d)
        {
            const int ai = offset * (2 * thid + 1) - 1;
            const int bi = offset * (2 * thid + 2) - 1;
            const T t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    // Write results to device memory.
    if (lo < n) g_odata[lo] = temp[lo];
    if (hi < n) g_odata[hi] = temp[hi];
}

// Host driver: scans the sequence 0, 1, ..., DSIZE-1 on the device with
// prescan (one block of DSIZE/2 threads, DSIZE elements of dynamic shared
// memory) and verifies every output element against a running sum computed on
// the host.
//
// Returns 0 when all DSIZE elements match the expected exclusive prefix sum,
// 1 on host allocation failure or on the first mismatch. CUDA errors abort
// the process through cudaCheckErrors.
int main(){

    mytype *h_i = NULL, *d_i = NULL, *h_o = NULL, *d_o = NULL;
    const size_t dszp = (DSIZE)*sizeof(mytype);  // buffer size in bytes

    h_i = (mytype *)malloc(dszp);
    h_o = (mytype *)malloc(dszp);
    if ((h_i == NULL) || (h_o == NULL)) {
        printf("malloc fail\n");
        free(h_i);  // free(NULL) is a no-op
        free(h_o);
        return 1;
    }
    cudaMalloc(&d_i, dszp);
    cudaMalloc(&d_o, dszp);
    cudaCheckErrors("cudaMalloc fail");
    for (int i = 0; i < DSIZE; i++){
        h_i[i] = i;
        h_o[i] = 0;
    }
    cudaMemset(d_o, 0, dszp);
    cudaCheckErrors("cudaMemset fail");
    cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    // Each thread handles two elements; shared memory holds the whole data set.
    prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
    cudaCheckErrors("kernel launch fail");  // bad launch configuration
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");         // faults during execution
    cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Verify the exclusive scan: element i must equal the sum of inputs
    // 0 .. i-1, which includes element 0 being exactly 0.
    int status = 0;
    mytype psum = 0;
    for (int i = 0; i < DSIZE; i++){
        if (psum != h_o[i]) {
            printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum);
            status = 1;
            break;
        }
        psum += h_i[i];
    }

    // Release device and host buffers on both the success and mismatch paths.
    cudaFree(d_i);
    cudaFree(d_o);
    free(h_i);
    free(h_o);
    return status;
}

來源

2015-06-14 22:05:42

你*遠比*我更有耐心……+1 – talonmies

「invalid device function」（無效的設備函數）通常意味着你編譯時針對的體系結構不正確。你使用的是哪種 GPU、哪種操作系統及版本，以及你使用的確切編譯命令是什麼？（這也許能解釋我答案中的第 1 點，因爲我的代碼做了適當的 CUDA 錯誤檢查。） –

對不起，「特斯拉」不是一個足夠的描述。我需要知道您使用的確切GPU。如果您不確定，請運行deviceQuery示例代碼。 –

前綴掃描gpugems3中的CUDA示例代碼是否正確？

回答

相關問題