從設備複製回主機時，cudaMemcpy 返回 cudaErrorInvalidArgument，不清楚爲什麼

這是我第一次在這裏發帖。我目前正在開發一個項目，需要把一個大型二維數組（大約 1,000,000x7）寫入 GPU，進行一些計算，再把結果返回給主機。由於數組很大而我又希望速度快，我嘗試把數組展平（flatten），以便直接傳遞給 GPU。數組寫入成功（或者至少在寫入設備時 cudaMalloc 和 cudaMemcpy 都返回 cudaSuccess），但是當我嘗試把它讀回時，cudaMemcpy 返回無效參數錯誤。

我一直沒弄清楚原因：我認爲我寫入設備並讀回的應該是一個有效的一維（展平後的）數組，而且我認爲傳給這些調用的參數也是正確的。我在網上找到的關於這個錯誤的唯一結果是 cudaMemcpy 的 dst 和 src 參數被弄反了，但我認爲我這裏的順序是對的。

這是能重現該問題的簡化版代碼：

#include <iostream> 

using namespace std; 

// Allocates a rows x cols float matrix as one contiguous buffer plus a table
// of row pointers, and stores the row-pointer table in `arr`.
// arr[0] points at the start of the contiguous buffer (rows * cols floats) and
// each later arr[i] points cols elements past arr[i - 1], so arr[i][j]
// addresses element i * cols + j of the flat buffer.
// Nothing in this function releases the two allocations.
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){ 

    // Table of row pointers.
    arr = new float*[rows]; 

    // Single flat buffer that holds every element of the matrix.
    arr[0] = new float[rows * cols]; 

    // Point each remaining row at its slice of the flat buffer.
    for(unsigned long int i = 1; i < rows; i++) arr[i] = arr[i - 1] + cols; 
} 

// Allocates rows * cols floats with cudaMalloc and copies the flat host buffer
// arr[0] into that allocation (host -> device).
// A failure of either call is only reported on cerr; execution continues.
// NOTE: devPtr is received by value, so cudaMalloc stores the new address in
// this function's local copy only. The pointer variable the caller passed in
// is not modified, and the allocation is never handed back to the caller.
void write2dArrayToGPU(float ** arr, float * devPtr, unsigned long int rows, unsigned long int cols){ 

    // &devPtr is the address of the local parameter, not of the caller's pointer.
    if(cudaSuccess != cudaMalloc((void**)&devPtr, sizeof(float) * rows * cols)) cerr << "cudaMalloc Failed"; 

    // arr[0] is the start of the contiguous host buffer.
    if(cudaSuccess != cudaMemcpy(devPtr, arr[0], sizeof(float) * rows * cols, cudaMemcpyHostToDevice)) cerr << "cudaMemcpy Write Failed"; 
} 

// Copies rows * cols floats from devPtr into the flat host buffer arr[0]
// (device -> host) and reports a failed copy on cerr.
void read2dArrayFromGPU(float ** arr, float * devPtr, unsigned long int rows, unsigned long int cols){ 

    if(cudaSuccess != cudaMemcpy(arr[0], devPtr, sizeof(float) * rows * cols, cudaMemcpyDeviceToHost)) cerr << "cudaMemcpy Read Failed" << endl; 
} 

// Reproduction driver: fills a 100x7 host array, sends it to the GPU, then
// tries to read it back into a second host array.
int main(){ 

int R = 100; 
int C = 7; 

cout << "Allocating an " << R << "x" << C << " array ..."; 
float ** arrA; 
alloc2dArray(arrA, R, C); 


cout << "Assigning some values ..."; 
// Element (i, j) receives its own flat index i*C + j.
for(int i = 0; i < R; i++){ 
    for(int j = 0; j < C; j++){ 
     arrA[i][j] = i*C + j; 
    } 
} 
cout << "Done!" << endl; 


cout << "Writing to the GPU ..."; 
// Darr starts out null and is handed to write2dArrayToGPU by value, so it is
// still null after the call returns.
float * Darr = 0; 
write2dArrayToGPU(arrA, Darr, R, C); 
cout << " Done!" << endl; 

cout << "Allocating second " << R << "x" << C << " array ..."; 
float ** arrB; 
alloc2dArray(arrB, R, C); 
cout << "Done!" << endl; 

cout << "Reading from the GPU into the new array ..."; 
// Darr is still 0 here; this is the cudaMemcpy call that is rejected with
// "invalid argument".
read2dArrayFromGPU(arrB, Darr, R, C); 


}

我在筆記本電腦上用以下命令編譯並運行：

$nvcc -arch=sm_30 test.cu -o test 
$optirun cuda-memcheck ./test

運行後得到如下結果：

========= CUDA-MEMCHECK 
Allocating an 100x7 array ...Assigning some values ...Done! 
Writing to the GPU ... Done! 
Allocating second 100x7 array ...Done! 
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy. 
=========  Saved host backtrace up to driver entry point at error 
Reading from the GPU into the new array ...=========  Host Frame:/usr/lib64/nvidia-bumblebee/libcuda.so.1 [0x2ef343] 
cudaMemcpy Read Failed=========  Host Frame:./test [0x38c6f] 
=========  Host Frame:./test [0x2f08] 
=========  Host Frame:./test [0x3135] 
=========  Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf1) [0x20401] 
=========  Host Frame:./test [0x2c6a] 

========= 
========= ERROR SUMMARY: 1 error

我對 CUDA 還比較陌生，仍在學習中，所以非常感謝任何幫助，謝謝！

來源

2017-07-11 tgorororo

CUDA 與 C 無關。 – Olaf

你不能把指針 `devPtr` 按值傳遞給一個函數，在函數內對該指針執行 `cudaMalloc`，然後期望分配得到的指針值出現在調用方的環境中。這是按值傳遞的常見錯誤，當然還有其他類似的問題，比如[這一個](https://stackoverflow.com/questions/22826380/cuda-allocation-and-return-array-from-gpu-to-cpu)。你可能想研究一下那裏的答案；你的問題可以說是那個問題的重複。 –

感謝 Robert Crovella（羅伯特·克羅維拉）在上面的評論中爲我指明了正確的方向，並給出了類似問題的鏈接。

要點是：由於我把 devPtr 按值（而不是通過指針或引用）傳遞給 GPU 的寫入和讀取函數，cudaMalloc 和 cudaMemcpy 只作用於函數作用域內的那份指針副本。

兩種解決方案（我都運行過，均未報錯）：

第一種：把 devPtr 按引用傳遞給 write2dArrayToGPU 和 read2dArrayFromGPU，解決方案如下：

#include <iostream> 

using namespace std; 


// Builds a rows x cols float matrix backed by a single contiguous allocation.
// On return `arr` is a table of `rows` row pointers; arr[0] is the start of the
// flat buffer and each following row starts `cols` floats after the previous
// one. The caller owns both allocations (arr[0] and arr).
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){
    arr = new float*[rows];

    // One flat buffer for all elements; the first row starts at its beginning.
    float *rowStart = new float[rows * cols];
    arr[0] = rowStart;

    // Walk the buffer one row at a time, recording where each row begins.
    unsigned long int row = 1;
    while(row < rows){
        rowStart += cols;
        arr[row] = rowStart;
        ++row;
    }
}

// Allocates a device buffer of rows * cols floats and uploads the flat host
// buffer arr[0] into it.
//
// devPtr is taken by reference (float * &devPtr instead of float * devPtr) so
// that the address produced by cudaMalloc is stored in the caller's pointer.
//
// Parameters:
//   arr    - host row-pointer table; arr[0] must be the start of a contiguous
//            buffer of rows * cols floats.
//   devPtr - receives the device address. It is set to 0 when the allocation
//            or the upload fails, so the caller can detect the failure.
//   rows, cols - matrix dimensions.
//
// Failures are reported on std::cerr together with the CUDA error string.
void write2dArrayToGPU(float ** arr, float * &devPtr, unsigned long int rows, unsigned long int cols){
    const size_t numBytes = sizeof(float) * rows * cols;

    cudaError_t status = cudaMalloc((void**)&devPtr, numBytes);
    if(status != cudaSuccess){
        std::cerr << "cudaMalloc Failed: " << cudaGetErrorString(status) << std::endl;
        // Do not attempt the copy into a buffer that was never allocated.
        devPtr = 0;
        return;
    }

    status = cudaMemcpy(devPtr, arr[0], numBytes, cudaMemcpyHostToDevice);
    if(status != cudaSuccess){
        std::cerr << "cudaMemcpy Write Failed: " << cudaGetErrorString(status) << std::endl;
        // Release the buffer so a failed upload does not leak device memory.
        cudaFree(devPtr);
        devPtr = 0;
    }
}

// Downloads rows * cols floats from the device buffer devPtr into the flat
// host buffer that starts at arr[0]. A failed copy is reported on std::cerr.
// devPtr is declared as float * & (instead of float *) to match the signature
// of write2dArrayToGPU; this function does not modify it.
void read2dArrayFromGPU(float ** arr, float * &devPtr, unsigned long int rows, unsigned long int cols){
    const size_t numBytes = sizeof(float) * rows * cols;
    const cudaError_t status = cudaMemcpy(arr[0], devPtr, numBytes, cudaMemcpyDeviceToHost);

    if(status != cudaSuccess){
        std::cerr << "cudaMemcpy Read Failed" << std::endl;
    }
}

// Round-trips a 100x7 float matrix through the GPU: fills a host matrix,
// uploads it, allocates a second host matrix and downloads the device data
// into it. Darr is passed by reference, so it holds the device address after
// write2dArrayToGPU returns. All device and host allocations are released
// before returning.
int main(){
    int R = 100;
    int C = 7;

    std::cout << "Allocating an " << R << "x" << C << " array ...";
    float ** arrA;
    alloc2dArray(arrA, R, C);

    std::cout << "Assigning some values ...";
    // Element (i, j) receives its own flat index i*C + j.
    for(int i = 0; i < R; i++){
        for(int j = 0; j < C; j++){
            arrA[i][j] = i*C + j;
        }
    }
    std::cout << "Done!" << std::endl;

    std::cout << "Writing to the GPU ...";
    float * Darr = 0;
    write2dArrayToGPU(arrA, Darr, R, C);
    std::cout << " Done!" << std::endl;

    std::cout << "Allocating second " << R << "x" << C << " array ...";
    float ** arrB;
    alloc2dArray(arrB, R, C);
    std::cout << "Done!" << std::endl;

    std::cout << "Reading from the GPU into the new array ...";
    read2dArrayFromGPU(arrB, Darr, R, C);
    std::cout << " Done!" << std::endl;

    // Release the device buffer; cudaFree accepts a null pointer.
    if(cudaSuccess != cudaFree(Darr)) std::cerr << "cudaFree Failed" << std::endl;

    // Each host matrix is two allocations: the flat buffer (arr[0]) and the
    // row-pointer table (arr).
    delete[] arrA[0];
    delete[] arrA;
    delete[] arrB[0];
    delete[] arrB;

    return 0;
}

第二種：通過指針傳遞 devPtr，解決方案如下：

#include <iostream> 

using namespace std; 

// Creates a rows x cols float matrix stored in one contiguous block.
// `arr` receives a table of `rows` row pointers; arr[0] is the base of the
// block and arr[i] points `i * cols` floats past it. Both allocations are left
// for the caller to release.
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){
    arr = new float*[rows];
    arr[0] = new float[rows * cols];

    // Every later row is addressed relative to the base of the flat block.
    float * const base = arr[0];
    for(unsigned long int row = 1; row < rows; ++row){
        arr[row] = base + row * cols;
    }
}

// Allocates a device buffer of rows * cols floats and uploads the flat host
// buffer arr[0] into it.
//
// devPtr is a pointer to the caller's device pointer (float ** instead of
// float *), so the address produced by cudaMalloc is written into the
// caller's variable.
//
// Parameters:
//   arr    - host row-pointer table; arr[0] must be the start of a contiguous
//            buffer of rows * cols floats.
//   devPtr - address of the pointer that receives the device address. *devPtr
//            is set to 0 when the allocation or the upload fails.
//   rows, cols - matrix dimensions.
//
// Failures are reported on std::cerr together with the CUDA error string.
void write2dArrayToGPU(float ** arr, float ** devPtr, unsigned long int rows, unsigned long int cols){
    // Without somewhere to store the address there is nothing useful to do.
    if(devPtr == 0){
        std::cerr << "cudaMalloc Failed: devPtr is null" << std::endl;
        return;
    }

    const size_t numBytes = sizeof(float) * rows * cols;

    cudaError_t status = cudaMalloc((void**)devPtr, numBytes);
    if(status != cudaSuccess){
        std::cerr << "cudaMalloc Failed: " << cudaGetErrorString(status) << std::endl;
        // Do not attempt the copy into a buffer that was never allocated.
        *devPtr = 0;
        return;
    }

    status = cudaMemcpy(*devPtr, arr[0], numBytes, cudaMemcpyHostToDevice);
    if(status != cudaSuccess){
        std::cerr << "cudaMemcpy Write Failed: " << cudaGetErrorString(status) << std::endl;
        // Release the buffer so a failed upload does not leak device memory.
        cudaFree(*devPtr);
        *devPtr = 0;
    }
}

// Downloads rows * cols floats from the device buffer *devPtr into the flat
// host buffer that starts at arr[0]. A failed copy is reported on std::cerr.
// devPtr is a float ** (instead of float *) to match write2dArrayToGPU; only
// the pointer it refers to is read.
void read2dArrayFromGPU(float ** arr, float ** devPtr, unsigned long int rows, unsigned long int cols){
    float *hostDst = arr[0];
    const float *deviceSrc = *devPtr;
    const size_t numBytes = sizeof(float) * rows * cols;

    if(cudaMemcpy(hostDst, deviceSrc, numBytes, cudaMemcpyDeviceToHost) == cudaSuccess) return;

    std::cerr << "cudaMemcpy Read Failed" << std::endl;
}

// Round-trips a 100x7 float matrix through the GPU: fills a host matrix,
// uploads it, allocates a second host matrix and downloads the device data
// into it. The address of Darr is passed to both helpers, so Darr holds the
// device address after write2dArrayToGPU returns. All device and host
// allocations are released before returning.
int main(){
    int R = 100;
    int C = 7;

    std::cout << "Allocating an " << R << "x" << C << " array ...";
    float ** arrA;
    alloc2dArray(arrA, R, C);

    std::cout << "Assigning some values ...";
    // Element (i, j) receives its own flat index i*C + j.
    for(int i = 0; i < R; i++){
        for(int j = 0; j < C; j++){
            arrA[i][j] = i*C + j;
        }
    }
    std::cout << "Done!" << std::endl;

    std::cout << "Writing to the GPU ...";
    float * Darr = 0;
    write2dArrayToGPU(arrA, &Darr, R, C); // changed Darr to &Darr
    std::cout << " Done!" << std::endl;

    std::cout << "Allocating second " << R << "x" << C << " array ...";
    float ** arrB;
    alloc2dArray(arrB, R, C);
    std::cout << "Done!" << std::endl;

    std::cout << "Reading from the GPU into the new array ...";
    read2dArrayFromGPU(arrB, &Darr, R, C); // changed Darr to &Darr
    std::cout << " Done!" << std::endl;

    // Release the device buffer; cudaFree accepts a null pointer.
    if(cudaSuccess != cudaFree(Darr)) std::cerr << "cudaFree Failed" << std::endl;

    // Each host matrix is two allocations: the flat buffer (arr[0]) and the
    // row-pointer table (arr).
    delete[] arrA[0];
    delete[] arrA;
    delete[] arrB[0];
    delete[] arrB;

    return 0;
}

來源

2017-07-12 00:41:50 tgorororo

從設備複製回主機時，cudaMemcpy 返回 cudaErrorInvalidArgument，不清楚爲什麼

回答

相關問題