我嘗試使用與magmablas_sgeadd_q內核類似的格式,但是我沒有得到正確的輸出,而且每次運行它時都會得到不同的輸出。 ,我使用的代碼在下面給出:cuda magma matrix-matrix addition kernel
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2(const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if (ind < m) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
if (full)
{
// full block-column
#pragma unroll
for(int j=0; j < BLK_Y; ++j)
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for(int j=0; j < BLK_Y && iby+j < n; ++j)
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main (void)
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < n ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads(BLK_X, 1);
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y));
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
//Error checkng
printf("Matrix C ");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j])
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
輸出我得到:
矩陣加法的4行2列從主機內存到CUDA設備 CUDA內核啓動與 複製輸入數據的2個線程從CUDA設備到主機存儲器 甲是0.000000 複印輸出數據,B是0.364784 4塊,C是0.364784 甲是0.000000,B是0.952230,C是0.952230 甲是0.000000,B是0.000000,C是0.000000 甲是0.000000,B是0.000000,C是0.000000 甲是0.840188,B是0.394383,C是1.234571 甲是0.783099,B是0.798440,C是1.581539 甲是0.911647,B是0.197551,C是1.109199 甲是0.335223,B是0.768230,C是1.103452
矩陣的
0.840188 0.783099 0.911647 0.335223 0.277775 0.477397 0.364784 0.952230
基質B
0.394383 0.798440 0.197551 0.768230 0.553970 0.628871 0.000000 0.000000
矩陣C
0.0000000.000000 0.0000000.000000 0.0000000.000000 0.0000000.000000
我們,如果你發現有什麼錯誤的代碼我知道。
謝謝
是的,這是問題所在。感謝您的幫助 :) – aish