如何解決 CUDA Thrust 庫的 for_each 同步錯誤？

我正試圖使用 CUDA 的 Thrust 庫來修改一個簡單的動態向量，但屏幕上顯示了與「launch_closure_by_value」有關的錯誤，指出問題出在某個同步過程。如何解決 CUDA Thrust 庫的 for_each 同步錯誤？

由於此錯誤，不可能進行簡單的一維動態數組修改。

導致錯誤的我的代碼段如下。

我在 cpp 文件中調用 setIndexedGridInfo，它定義在 System.cu 中：

// Caller side: allocate two 8-float buffers with malloc and fill `a` with 0..7.
float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7; 
// `b` is allocated but not written before the call.
float* b = (float*)(malloc(8*sizeof(float))); 
// Both malloc'd pointers are handed to setIndexedGridInfo unchanged.
setIndexedGridInfo(a,b);

System.cu 中定義的代碼段如下：

// Wraps the raw pointers `a` and `b` in thrust::device_ptr and applies
// grid_functor to each (a[i], b[i]) pair for i in [0, 8).
// NOTE(review): device_ptr only wraps the address it is given; the caller shown
// with this code passes pointers obtained from malloc — confirm the pointers
// really refer to device memory.
void 
setIndexedGridInfo(float* a, float*b) 
{ 

    // The incoming pointers are wrapped as-is; nothing is allocated or copied here.
    thrust::device_ptr<float> d_oldData(a); 
    thrust::device_ptr<float> d_newData(b); 

    // Constructor argument for grid_functor.
    float c = 0.0; 

    // Zip the two ranges so the functor receives one tuple per index; the end
    // iterators hard-code a length of 8 elements.
    thrust::for_each(
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData,d_newData)), 
     thrust::make_zip_iterator(thrust::make_tuple(d_oldData+8,d_newData+8)), 
     grid_functor(c)); 
}

grid_functor 定義在 _kernel.cu 中：

// Functor applied by thrust::for_each to each zipped tuple: it reads element 0
// of the tuple, adds 0.1, and assigns the result to element 1.
struct grid_functor 
{ 
    // Value passed at construction; operator() below does not read it.
    float a; 

    // Callable from host and device; stores grid_Info in `a`.
    __host__ __device__ 
    grid_functor(float grid_Info) : a(grid_Info) {} 

    // Device-only call operator; Tuple is deduced from whatever the iterator yields.
    template <typename Tuple> 
    __device__ 
    void operator()(Tuple t) 
    { 
     // NOTE(review): `volatile` on a local looks unnecessary — confirm before removing.
     volatile float data = thrust::get<0>(t); 
     // 0.1 is a double literal, so the sum is written in double and converted
     // back to float by the initialisation.
     float pos = data + 0.1; 
     thrust::get<1>(t) = pos; 
    } 

};

我也得到這些在輸出窗口（我使用Visual Studio）：

0123在Particles.exe在0x000007fefdc7cacd

第一次機會異常：Microsoft C++ 異常：cudaError_enum，位於內存位置 0x0029eb60.. smokeParticles.exe 中 0x000007fefdc7cacd 處的第一次機會異常：Microsoft C++ 異常：thrust::system::system_error，位於內存位置 0x0029ecf0.. Particles.exe 中 0x000007fefdc7cacd 處未處理的異常：Microsoft C++ 異常：thrust::system::system_error，位於內存位置 0x0029ecf0..

是什麼原因造成問題？

來源

2012-03-10 Emre Turkoz

您正試圖把主機內存指針傳給一個期望設備內存指針的函數。有問題的代碼是：

// Both buffers come from malloc, i.e. they are host allocations.
float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7; 
float* b = (float*)(malloc(8*sizeof(float))); 
setIndexedGridInfo(a,b); 

..... 

// The malloc'd host pointers are wrapped directly as if they were device pointers.
thrust::device_ptr<float> d_oldData(a); 
thrust::device_ptr<float> d_newData(b);

thrust::device_ptr 旨在「包裝」由 CUDA API 分配的設備內存指針，使 Thrust 可以使用它。您正嘗試直接將主機指針視爲設備指針，這是非法的。你可以把 setIndexedGridInfo 函數修改成這樣：

// Applies grid_functor on the device to the n floats of host array `a` and
// stores the results in host array `b`.
//   a: host pointer to n readable floats (input)
//   b: host pointer to n writable floats (output)
//   n: element count; nothing is done when n <= 0
void setIndexedGridInfo(float* a, float*b, const int n) 
{ 
    // A non-positive count would make [a, a+n) an invalid range.
    if (n <= 0) return;

    // Copy the input to the device. The output vector is only sized, not
    // initialised from `b`: the functor overwrites every element, so the
    // previous contents of `b` are never needed.
    thrust::device_vector<float> d_oldData(a, a + n);
    thrust::device_vector<float> d_newData(n);

    // Constructor argument for grid_functor.
    const float c = 0.0f;

    // Zip input and output so the functor sees one (in, out) tuple per index.
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(), d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(), d_newData.end())),
        grid_functor(c));

    // Copy the results back to the caller's buffer; without this they would be
    // discarded when d_newData goes out of scope.
    thrust::copy(d_newData.begin(), d_newData.end(), b);
}

device_vector 的構造函數會分配設備內存，然後把主機內存中的內容複製到設備上。這應該可以解決你所看到的錯誤，儘管我不確定你想用這些迭代器做什麼，也不確定你所用的函子是否正確。

編輯：

這是一個完整的，可編譯，可運行代碼的版本：

#include <cstdlib> 
#include <cstdio> 
#include <thrust/device_vector.h> 
#include <thrust/for_each.h> 
#include <thrust/copy.h> 

struct grid_functor // Per-tuple functor for thrust::for_each: element 1 = element 0 + 0.1f.
{ 
    float a; // Value passed at construction; operator() does not read it.
    // Callable from host and device; stores grid_Info in `a`.
    __host__ __device__ 
    grid_functor(float grid_Info) : a(grid_Info) {} 
    // Device-only call operator; Tuple is deduced from whatever the iterator yields.
    template <typename Tuple> 
    __device__ 
    void operator()(Tuple t) 
    { 
     volatile float data = thrust::get<0>(t); // NOTE(review): volatile looks unnecessary — confirm before removing
     float pos = data + 0.1f; 
     thrust::get<1>(t) = pos; // presumably writes through to the zipped output element — TODO confirm
    } 

}; 

// Applies grid_functor on the device to the n floats of host array `a` and
// writes the results into host array `b`.
//   a: host pointer to n readable floats (input)
//   b: host pointer to n writable floats (output)
//   n: element count; nothing is done when n <= 0
void setIndexedGridInfo(float* a, float*b, const int n) 
{ 
    // A non-positive count would make [a, a+n) an invalid range.
    if (n <= 0) return;

    // Copy the input to the device. The output vector is only sized, not
    // initialised from `b`: the functor overwrites every element, so uploading
    // the previous contents of `b` would be a wasted transfer.
    thrust::device_vector<float> d_oldData(a, a + n);
    thrust::device_vector<float> d_newData(n);

    // Constructor argument for grid_functor.
    const float c = 0.0f;

    // Zip input and output so the functor sees one (in, out) tuple per index.
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(), d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(), d_newData.end())),
        grid_functor(c));

    // Copy the device results back into the caller's host buffer.
    thrust::copy(d_newData.begin(), d_newData.end(), b);
}

// Driver: fills a host array with 0..n-1, runs setIndexedGridInfo on it, and
// prints each (input, output) pair. Returns 0 on success and EXIT_FAILURE if a
// host allocation fails.
int main(void) 
{ 
    const int n = 8;

    float* a = static_cast<float*>(malloc(n * sizeof(float)));
    float* b = static_cast<float*>(malloc(n * sizeof(float)));
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a); // free(NULL) is a no-op, so both calls are safe
        free(b);
        return EXIT_FAILURE;
    }

    // Fill by loop so the initialisation follows n instead of eight fixed indices.
    for (int i = 0; i < n; i++) {
        a[i] = static_cast<float>(i);
    }

    setIndexedGridInfo(a, b, n);

    for (int i = 0; i < n; i++) {
        fprintf(stdout, "%d (%f,%f)\n", i, a[i], b[i]);
    }

    // Release the host buffers that were previously leaked.
    free(a);
    free(b);

    return 0;
}

我可以在裝有 CUDA 4.1 的 OS X 10.6.8 主機上編譯並運行此代碼，像這樣：

$ nvcc -Xptxas="-v" -arch=sm_12 -g -G thrustforeach.cu 
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space 
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space 
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space 
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space 
ptxas info : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEi12grid_functorEEEEvT_' for 'sm_12' 
ptxas info : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1] 
ptxas info : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEj12grid_functorEEEEvT_' for 'sm_12' 
ptxas info : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1] 

$ ./a.out 
0 (0.000000,0.100000) 
1 (1.000000,1.100000) 
2 (2.000000,2.100000) 
3 (3.000000,3.100000) 
4 (4.000000,4.100000) 
5 (5.000000,5.100000) 
6 (6.000000,6.100000) 
7 (7.000000,7.100000)

來源

2012-03-10 09:05:33 talonmies

我完全誤解了 Thrust 的概念。我以爲它也可以直接接受主機數組。我只是想把每個元素增加 0.1，只是爲了練習。感謝您的幫助。 – 2012-03-10 09:12:04

但這個 device_vector 初始化不起作用。device_vector<float> 是不夠的。device_vector 還需要一個類型名稱 Alloc：device_vector<float, Alloc> – 2012-03-10 10:05:19

相信我，它確實可行。我在修改 'for_each' 調用時犯了一個小的語法錯誤。請看看新版本。我現在已經用編譯器檢查過了，它可以在計算能力 1.2 的設備上用 CUDA 4.1 運行。 – talonmies 2012-03-10 10:13:45

如何解決 CUDA Thrust 庫的 for_each 同步錯誤？

回答

相關問題