
cuda magma matrix-matrix addition kernel

I tried to use a format similar to the magmablas_sgeadd_q kernel, but I am not getting the correct output, and I get a different output every time I run it. The code I am using is given below:

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 

#define BLK_X 2 
#define BLK_Y 1 

__global__ void matrixAdd2(const float *dA, const float *dB, float *dC, int m, int n) 
{ 
int ldda = m; 
int lddb = m; 

int ind = blockIdx.x*BLK_X + threadIdx.x; 
int iby = blockIdx.y*BLK_Y; 
/* check if full block-column */ 
bool full = (iby + BLK_Y <= n); 
/* do only rows inside matrix */ 
if (ind < m) { 
    dA += ind + iby*ldda; 
    dB += ind + iby*lddb; 
    if (full) 
    { 
     // full block-column 
     #pragma unroll 
     for(int j=0; j < BLK_Y; ++j) 
     { 
      dC[j*lddb] = dA[j*ldda] + dB[j*lddb]; 
      printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]); 
     } 
    } 
    else 
    { 
     // partial block-column 
     for(int j=0; j < BLK_Y && iby+j < n; ++j) 
     { 
      dC[j*lddb] = dA[j*ldda] + dB[j*lddb]; 
      printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]); 
     } 
    } 
} 
} 



int main (void) 
{ 

int m = 4; // a - mxn matrix 
int n = 2; // b - mxn matrix 

size_t size = m * n * sizeof(float); 


printf("Matrix addition of %d rows and %d columns \n", m, n); 

// allocate matrices on the host 

float *h_A = (float *)malloc(size); // a- mxn matrix on the host 
float *h_B = (float *)malloc(size); // b- mxn matrix on the host 
float *h_C = (float *)malloc(size); // b- mxn matrix on the host 


// Initialize the host input matrixs 
for (int i = 0; i < m; ++i) 
{ 
    for (int j = 0; j < n ; j ++) 
    { 
     h_A[i*m+j] = rand()/(float)RAND_MAX; 
     h_B[i*m+j] = rand()/(float)RAND_MAX; 

    } 
} 

// Allocate the device input matrix A 
float *d_A = NULL; 
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device 

// Allocate the device input matrix B 
float *d_B = NULL; 
err = cudaMalloc((void **)&d_B, size); 

// Allocate the device output matrix C 
float *d_C = NULL; 
err = cudaMalloc((void **)&d_C, size); 

// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory 
printf("Copy input data from the host memory to the CUDA device\n"); 
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 

// defining number of threads and blocks  
dim3 threads(BLK_X, 1); 
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y)); 


// Launching kernel  
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n); 

// Copy the device result matrix in device memory to the host result matrix in host memory. 
printf("Copy output data from the CUDA device to the host memory\n"); 
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 

//print A matrix 
printf("Matrix A"); 
for (int i = 0; i < m; i++) 
{ 
    for (int j = 0; j < n; j++) 
    { 
     printf(" %f", h_A[i*m+j]); 

    } 
    printf("\n"); 
} 

// print B matrix if required 
printf("Matrix B"); 
for (int i = 0; i < m; i++) 
{ 
    for (int j = 0; j < n; j++) 
    { 

     printf(" %f", h_B[i*m+j]); 

    } 
    printf("\n"); 
} 

//Error checkng 
printf("Matrix C "); 
for (int i = 0; i < m; i++) 
{ 
    for (int j = 0; j < n; j++) 
    {  
     printf("%f", h_C[i*m+j]); 
     if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j]) 
     { 
      flag = flag + 1; 
     } 
    } 
    printf("\n"); 
} 

if(flag==m*n) 
{ 
printf("Test PASSED\n"); 
} 


// Free device global memory 
err = cudaFree(d_A); 

err = cudaFree(d_B); 

err = cudaFree(d_C); 

// Free host memory 
free(h_A); 
free(h_B); 
free(h_C); 


err = cudaDeviceReset(); 
printf("Done\n"); 
return 0; 

} 

The output I get:

Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 4 blocks of 2 threads
Copy output data from the CUDA device to the host memory
A is 0.000000, B is 0.364784, C is 0.364784
A is 0.000000, B is 0.952230, C is 0.952230
A is 0.000000, B is 0.000000, C is 0.000000
A is 0.000000, B is 0.000000, C is 0.000000
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452

Matrix A

0.840188 0.783099 0.911647 0.335223 0.277775 0.477397 0.364784 0.952230

Matrix B

0.394383 0.798440 0.197551 0.768230 0.553970 0.628871 0.000000 0.000000

Matrix C

0.0000000.000000 0.0000000.000000 0.0000000.000000 0.0000000.000000

Please let me know if you find anything wrong with the code.

Thanks

Answer


There are two coding errors that I found:

  1. When you use this method to "bump" the base pointers of matrices dA and dB in your kernel, you must do the same for the base pointer of matrix dC:

    if (ind < m) { 
        dA += ind + iby*ldda; 
        dB += ind + iby*lddb; 
        dC += ind + iby*lddb; // add this line 
    
  2. The nested for-loops in your host code are not indexed correctly. The outer loop should only cover n values, but you are allowing it to run across m:

    for (int i = 0; i < m; ++i) 
    { 
        for (int j = 0; j < n ; j ++) 
    

    so when you then do the actual index calculation here:

    h_A[i*m+j] = rand()/(float)RAND_MAX; 
    

    you are indexing out of bounds (i*m exceeds the matrix size for some values of i; for example, with m = 4 and n = 2, i = 3 and j = 1 gives i*m+j = 13, past the end of an 8-element allocation). This problem is repeated in every nested for-loop in your host code. The fix is to reverse the m and n ranges on your i and j loops.

The following code has those errors fixed (plus a few additional variable definitions you left out - err and flag are not defined in the code as currently posted, which creates compile errors). It appears to run correctly and produce the correct result:

$ cat t1213.cu 
#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 

#define BLK_X 2 
#define BLK_Y 1 

__global__ void matrixAdd2(const float *dA, const float *dB, float *dC, int m, int n) 
{ 
int ldda = m; 
int lddb = m; 

int ind = blockIdx.x*BLK_X + threadIdx.x; 
int iby = blockIdx.y*BLK_Y; 
/* check if full block-column */ 
bool full = (iby + BLK_Y <= n); 
/* do only rows inside matrix */ 
if (ind < m) { 
    dA += ind + iby*ldda; 
    dB += ind + iby*lddb; 
    dC += ind + iby*lddb; 
    if (full) 
    { 
     // full block-column 
     #pragma unroll 
     for(int j=0; j < BLK_Y; ++j) 
     { 
      dC[j*lddb] = dA[j*ldda] + dB[j*lddb]; 
      printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]); 
     } 
    } 
    else 
    { 
     // partial block-column 
     for(int j=0; j < BLK_Y && iby+j < n; ++j) 
     { 
      dC[j*lddb] = dA[j*ldda] + dB[j*lddb]; 
      printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]); 
     } 
    } 
} 
} 



int main (void) 
{ 

int m = 4; // a - mxn matrix 
int n = 2; // b - mxn matrix 

size_t size = m * n * sizeof(float); 


printf("Matrix addition of %d rows and %d columns \n", m, n); 

// allocate matrices on the host 

float *h_A = (float *)malloc(size); // a- mxn matrix on the host 
float *h_B = (float *)malloc(size); // b- mxn matrix on the host 
float *h_C = (float *)malloc(size); // b- mxn matrix on the host 


// Initialize the host input matrixs 
for (int i = 0; i < n; ++i) 
{ 
    for (int j = 0; j < m ; j ++) 
    { 
     h_A[i*m+j] = rand()/(float)RAND_MAX; 
     h_B[i*m+j] = rand()/(float)RAND_MAX; 

    } 
} 

// Allocate the device input matrix A 
float *d_A = NULL; 
cudaError_t err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device 

// Allocate the device input matrix B 
float *d_B = NULL; 
err = cudaMalloc((void **)&d_B, size); 

// Allocate the device output matrix C 
float *d_C = NULL; 
err = cudaMalloc((void **)&d_C, size); 

// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory 
printf("Copy input data from the host memory to the CUDA device\n"); 
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 

// defining number of threads and blocks 
dim3 threads(BLK_X, BLK_Y); 
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y)); 


// Launching kernel 
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n); 

// Copy the device result matrix in device memory to the host result matrix in host memory. 
printf("Copy output data from the CUDA device to the host memory\n"); 
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 

//print A matrix 
printf("Matrix A"); 
for (int i = 0; i < n; i++) 
{ 
    for (int j = 0; j < m; j++) 
    { 
     printf(" %f", h_A[i*m+j]); 

    } 
    printf("\n"); 
} 

// print B matrix if required 
printf("Matrix B"); 
for (int i = 0; i < n; i++) 
{ 
    for (int j = 0; j < m; j++) 
    { 

     printf(" %f", h_B[i*m+j]); 

    } 
    printf("\n"); 
} 
int flag = 0; 
//Error checkng 
printf("Matrix C "); 
for (int i = 0; i < n; i++) 
{ 
    for (int j = 0; j < m; j++) 
    { 
     printf("%f", h_C[i*m+j]); 
     if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j]) 
     { 
      flag = flag + 1; 
     } 
    } 
    printf("\n"); 
} 

if(flag==m*n) 
{ 
printf("Test PASSED\n"); 
} 


// Free device global memory 
err = cudaFree(d_A); 

err = cudaFree(d_B); 

err = cudaFree(d_C); 

// Free host memory 
free(h_A); 
free(h_B); 
free(h_C); 


err = cudaDeviceReset(); 
printf("Done\n"); 
return 0; 

} 
$ nvcc -o t1213 t1213.cu 
$ cuda-memcheck ./t1213 
========= CUDA-MEMCHECK 
Matrix addition of 4 rows and 2 columns 
Copy input data from the host memory to the CUDA device 
Copy output data from the CUDA device to the host memory 
A is 0.277775, B is 0.553970, C is 0.831745 
A is 0.477397, B is 0.628871, C is 1.106268 
A is 0.364784, B is 0.513401, C is 0.878185 
A is 0.952230, B is 0.916195, C is 1.868425 
A is 0.911647, B is 0.197551, C is 1.109199 
A is 0.335223, B is 0.768230, C is 1.103452 
A is 0.840188, B is 0.394383, C is 1.234571 
A is 0.783099, B is 0.798440, C is 1.581539 
Matrix A 0.840188 0.783099 0.911647 0.335223 
0.277775 0.477397 0.364784 0.952230 
Matrix B 0.394383 0.798440 0.197551 0.768230 
0.553970 0.628871 0.513401 0.916195 
Matrix C 1.2345711.5815391.1091991.103452 
0.8317451.1062680.8781851.868425 
Test PASSED 
Done 
========= ERROR SUMMARY: 0 errors 
$ 
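As an aside (not something the answer above raises): the fixed code stores every CUDA runtime return value in err but never inspects it. A minimal sketch of how those checks could be added, using a hypothetical helper named checkCuda and only the headers the program already includes:

// Hypothetical helper, not part of the posted code: abort with a message
// if a CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Example use inside main(), replacing the bare assignments to err:
//   checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc d_A");
//   checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "H2D copy of A");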

Yes, that was the problem. Thanks for your help :) – aish