如何正確地將內存從設備複製到CUDA中的主機？

-1

我想簡單地在CUDA中並行增加一些矩陣值並嘗試將它們複製回主內存。但是，當我一旦線程函數返回打印出來，值是相同的。我甚至嘗試用1個線程運行程序，但沒有運氣。任何幫助將不勝感激。如何正確地將內存從設備複製到CUDA中的主機？

我的代碼：

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <sys/time.h> 
#include <cuda.h> 

#define BLOCK_SIZE 1024 
#define MAX_N  100000000 
#define MAX_THREADS  1024 

int num_threads; 
int count;    // Count of threads that have updated their partition 
int size; 
//int increment; // VS 
int * inc2; 
//int my_start; 


//Host data 
int * thread_ids; 

//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation) 

__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X, 
        float * a2, float * b2, float * c2, float * D2, 
        int * inc2_dev, int * size_dev, int * num_threads_dev){ 

//__threadfence(); 
int thread_id = threadIdx.x + (blockIdx.x * blockDim.x); 
float k1; 
float k2; 
int i; 

int start = 0; 
//int end = size_dev-1; 
//int inc2_dev = inc2_dev1[0]; 
//int inc_dev = *inc_dev1; 
//int size_dev = size_dev1[0]; 
int nthreads = num_threads_dev[0]; 
//Thread work assignment 
int chunk_size = size_dev[0]/nthreads; 
int my_start = thread_id*(chunk_size); 
int my_end = start + ((thread_id + 1)*chunk_size - 1); 
//__threadfence(); 
__syncthreads(); 
//Forward Reduction 
for(i = my_start; i <= my_end; ++i){ 
    a[i] = a[i]++; 
    b[i] = b[i]++; 
    c[i] = c[i]++; 
    D[i] = D[i]++; 
    X[i] = X[i]++; 
} 

__threadfence(); 
//__syncthreads(); 
}//Device Function 


float* init_vector(int size){ 
float* output; 
output = (float*) calloc(size, sizeof(float)); 
int i; 
for(i = 0; i < size; ++i){ 
    output[i] = 2.0; 
} 
return output; 
} 

float* init_vector_ac(int s){ 
//s will be used for size-1 not to be confused for size. 
float* output; 
output = (float*) calloc(s, sizeof(float)); 
int i; 
for(i = 0; i < s; ++i){ 
    output[i] = -1.0; 
} 
return output; 
} 

// Main program 
int main(int argc, char *argv[]) { 

//num_threads -> atoi(argv[argc-1]); 
//struct timeval start, stop; 
float total_time; 
int i; 

///Host structures 
float* a; 
float* b; 
float* c; 
float* D; 
float* X; 

//increment = 2; // VS 
inc2 = (int*) malloc(sizeof(int)); 
inc2[0] = 1; 
//size = (int*) malloc(sizeof(int)); 
//num_threads = (int*) malloc(sizeof(int)); 
//my_start = 0; 
//wait_flag = false; 

///Device Data 
//SYSTEM * sys_dev; 
float * a_dev; 
float * b_dev; 
float * c_dev; 
float * D_dev; 
float * X_dev; 

float * a2_dev; 
float * b2_dev; 
float * c2_dev; 
float * D2_dev; 
//float * X2_dev; 

//int * inc_dev; 
int * inc2_dev; 
//int * mstart_dev; 
int * size_dev; 
int * num_threads_dev; 
int result_var; 

//int final_inc2; 

cudaEvent_t start, stop; // GPU timing variables 
//struct timeval cpu_start, cpu_stop; // CPU timing variables 
    // float time_array[10]; 

// Timing initializations 
cudaEventCreate(&start); 
cudaEventCreate(&stop); 

if (argc != 3) 
{ 
    printf("Use: <executable_name> <size> <num_threads>\n"); 
    exit(0); 
} 
if ((size = atoi(argv[argc-2])) > MAX_N) 
{ 
    printf("Maximum number of nodes allowed: %d\n", MAX_N); 
    exit(0); 
}; 

if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS) 
{ 
    printf("Maximum number of threads allowed: %d.\n", MAX_THREADS); 
    exit(0); 
}; 

int size_array = (size) * sizeof(float); 
int size_array2 = (size - 1) * sizeof(float); 

// Initialize host tridiagonal matrix 
a = init_vector_ac(size-1); // a[i] = -1.0 
b = init_vector(size);  // b[i] = 2.0 
c = init_vector_ac(size-1); // c[i] = -1.0 
D = init_vector(size);  // D[i] = 2.0 
X = init_vector(size);  // X[i] = 2.0 

//xs = init_vector_err(size); 

// Shift elements of a by 1 
for(i = size-1; i > 0; i--) a[i] = a[i-1]; 
a[0] = 0.0; 


thread_ids = (int*) calloc(num_threads, sizeof(int)); 

count = 0; 

for(i = 0; i < num_threads; ++i){ 
    thread_ids[i] = i; 
} 
//Cuda Operation 

cudaEventRecord(start, 0); 

cudaMalloc((void **) &a_dev, size); 
cudaMalloc((void **) &b_dev, size); 
cudaMalloc((void **) &c_dev, size); 
cudaMalloc((void **) &D_dev, size); 
cudaMalloc((void **) &X_dev, size); 
cudaMalloc((void **) &a2_dev, size); 
cudaMalloc((void **) &b2_dev, size); 
cudaMalloc((void **) &c2_dev, size); 
cudaMalloc((void **) &D2_dev, size); 
//cudaMalloc((void**)&inc_dev, sizeof(int)); 
cudaMalloc((void**)&inc2_dev, sizeof(int)); 
//cudaMalloc((void**)&mstart_dev, sizeof(int)); 
cudaMalloc((void**)&size_dev, sizeof(int)); 
cudaMalloc((void**)&num_threads_dev, sizeof(int)); 


cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice); 
cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice); 
cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice); 

//cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice); 
cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice); 
//cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice); 
cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice); 
cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice); 

cudaDeviceSynchronize(); 
pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev, 
          a2_dev, b2_dev, c2_dev, D2_dev, 
          inc2_dev, size_dev, num_threads_dev); 
cudaDeviceSynchronize(); 

cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost); 
cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost); 
cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost); 
cudaDeviceSynchronize(); 
cudaEventRecord(stop, 0); 
cudaEventSynchronize(stop); 
cudaEventElapsedTime(&total_time, start, stop); 

printf("Final Var: %d\n\n", inc2[0]); 
printf("Num Threads Var: %d\n\n", result_var); 

for(i = 0; i < size; ++i){ 
    printf("a=%8.4f \n", a[i]); 
    printf("b=%8.4f \n", b[i]); 
    printf("c=%8.4f \n", c[i]); 
    printf("D=%8.4f \n", D[i]); 
    printf("X=%8.4f \n", X[i]); 
} 

printf("Threads = %d, matrix_size = %d, time = %f\n", 
    num_threads, size, total_time); 

cudaFree(a_dev); 
cudaFree(b_dev); 
cudaFree(c_dev); 
cudaFree(D_dev); 
cudaFree(X_dev); 
//cudaFree(inc_dev); 
cudaFree(inc2_dev); 
//cudaFree(mstart_dev); 
//cudaFree(size_dev); 
//cudaFree(num_threads_dev); 

}//end of main

來源

2014-10-01 HarishV

從添加[適當的cuda錯誤檢查]開始（http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime -api）到您的代碼。你也可以用'cuda-memcheck'運行你的代碼來查看它報告的內容。之後，如果您需要幫助，請[發送*完整*代碼]（http://stackoverflow.com/help/mcve），以便某人可以複製，粘貼，編譯和運行，而無需添加任何內容或更改任何內容。 – 2014-10-01 16:47:13

我已添加完整的代碼。 – HarishV 2014-10-01 16:57:20

運行memcheck時，我得到「找不到Cuda-Memcheck結果」。 – HarishV 2014-10-01 17:10:40

添加proper cuda error checking到您的代碼。

我可以看到的一個問題是您的分配大小與您的數組大小不匹配。舉一個例子：

int size_array = (size) * sizeof(float); 
... 
cudaMalloc((void **) &b_dev, size); // size should probably be size_array here 
...       ^^^^ 
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); // this won't work, will throw error 
        ^^^^^^^^^^

上面肯定是一個錯誤，並且在你的代碼中有幾種類型。您也可能遇到機器配置問題（CUDA未正確安裝等），錯誤檢查也會指出。

來源

2014-10-01 19:15:48

原來我一直提交作業的服務器節點沒有GPU設備。但是，謝謝你真的有幫助。 – HarishV 2014-10-15 22:44:08

如何正確地將內存從設備複製到CUDA中的主機？

回答

相關問題