0
我在CUDA C中的矩陣乘法中遇到了麻煩。檢查很長一段時間後,我發現問題是我錯誤地使用了「dim3」。糾正後,我得到了正確的結果。但是我增加了矩陣的維數,答案是不正確的。即使我不能使用Nsight,它在我增加維度之前運行良好,以啓動CUDA調試。CUDA中的矩陣乘法的問題
內核代碼:
__global__ void multiKernal(float* Md, float*Nd, float*Pd, int width)
{
int row = blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
float Pvalue = 0;
for (int k = 0; k <width; ++k){
Pvalue += Md[row*width + k] * Nd[col + width*k];
}
Pd[row*width + col] = Pvalue;
return;
}
主機到設備:
void matrixmutiplacation(float*hostM, float*hostN, float*hostP, int width)
{
int size = width*width*sizeof(float);
float* Md; float* Nd; float* Pd;
dim3 dimGrid(4, 4, 1);
dim3 dimBlock(128, 128, 1);
cudaError_t error;
cudaEvent_t start;
error = cudaEventCreate(&start);
cudaEvent_t stop;
error = cudaEventCreate(&stop);
cudaMalloc((void**)&Md, size);
cudaMalloc((void**)&Nd, size);
cudaMalloc((void**)&Pd, size);
cudaMemcpy(Md, hostM, size, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, hostN, size, cudaMemcpyHostToDevice);
cudaMemcpy(Pd, hostP, size, cudaMemcpyHostToDevice);
error = cudaEventRecord(start, NULL);
multiKernal << <dimGrid, dimBlock >> >(Md, Nd, Pd, width);
error = cudaEventRecord(stop, NULL);
error = cudaEventSynchronize(stop);
float msecTotal = 0.0f;
error = cudaEventElapsedTime(&msecTotal, start, stop);
float msecPerMatrixMul = msecTotal;
printf("running time:%.3f msec", msecPerMatrixMul);
cudaMemcpy(hostP, Pd, size, cudaMemcpyDeviceToHost);
cudaFree(Md); cudaFree(Nd); cudaFree(Pd);
return;
}
主:
int main()
{
int M = 512 * 512;
int N = 512 * 512;
int P = 512 * 512;
int width = 512;
int c[512];
float* hostM = (float*)malloc(sizeof(float)*M);
float* hostN = (float*)malloc(sizeof(float)*N);
float* hostP = (float*)malloc(sizeof(float)*P);
for (int i = 0; i < P; ++i)
hostP[i] = 0;
for (int i = 0; i <width; i++)
c[i] = i + 1;
for (int i = 0; i <width; i++) {
for (int j = 0; j <width; j++) {
hostM[i*width + j] = c[j] + i;
hostN[i*width + j] = c[j] + i;
}
}
matrixmutiplacation(hostM, hostN, hostP, width);
//for (int i = 0; i <width; i++){
//for (int j = 0; j <width; j++){
// printf("%f\t", hostP[i*width + j]);
//}
// printf("\n");
//}
free(hostM);
free(hostN);
free(hostP);
return 0;
}
我認爲「1024 * 1024 * 64」是每塊的線程大小。謝謝 ! –
@CalvinLouBME:塊的最大*維*和每塊的最大線程數是不同的東西,並且都適用於限制合法塊大小 – talonmies