2015-06-21 141 views
0

我在CUDA C中的矩陣乘法中遇到了麻煩。檢查很長一段時間後,我發現問題是我錯誤地使用了「dim3」。糾正後,我得到了正確的結果。但是我增加了矩陣的維數,答案是不正確的。即使我不能使用Nsight,它在我增加維度之前運行良好,以啓動CUDA調試。CUDA中的矩陣乘法的問題

內核代碼:

__global__ void multiKernal(float* Md, float*Nd, float*Pd, int width) 
{ 
    int row = blockIdx.y*blockDim.y + threadIdx.y; 
    int col = blockIdx.x*blockDim.x + threadIdx.x; 
    float Pvalue = 0; 

for (int k = 0; k <width; ++k){ 
    Pvalue += Md[row*width + k] * Nd[col + width*k]; 
} 
Pd[row*width + col] = Pvalue; 
return; 
} 

主機到設備:

void matrixmutiplacation(float*hostM, float*hostN, float*hostP, int width) 
{ 
    int size = width*width*sizeof(float); 
    float* Md; float* Nd; float* Pd; 
    dim3 dimGrid(4, 4, 1); 
    dim3 dimBlock(128, 128, 1); 
    cudaError_t error; 
    cudaEvent_t start; 
    error = cudaEventCreate(&start); 
    cudaEvent_t stop; 
    error = cudaEventCreate(&stop); 

    cudaMalloc((void**)&Md, size); 
    cudaMalloc((void**)&Nd, size); 
    cudaMalloc((void**)&Pd, size); 
    cudaMemcpy(Md, hostM, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(Nd, hostN, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(Pd, hostP, size, cudaMemcpyHostToDevice); 

    error = cudaEventRecord(start, NULL); 
    multiKernal << <dimGrid, dimBlock >> >(Md, Nd, Pd, width); 
    error = cudaEventRecord(stop, NULL); 
    error = cudaEventSynchronize(stop); 
    float msecTotal = 0.0f; 
    error = cudaEventElapsedTime(&msecTotal, start, stop); 
    float msecPerMatrixMul = msecTotal; 
    printf("running time:%.3f msec", msecPerMatrixMul); 

    cudaMemcpy(hostP, Pd, size, cudaMemcpyDeviceToHost); 
    cudaFree(Md); cudaFree(Nd); cudaFree(Pd); 
    return; 
} 

主:

int main() 
{ 
    int M = 512 * 512; 
    int N = 512 * 512; 
    int P = 512 * 512; 
    int width = 512; 
    int c[512]; 
    float* hostM = (float*)malloc(sizeof(float)*M); 
    float* hostN = (float*)malloc(sizeof(float)*N); 
    float* hostP = (float*)malloc(sizeof(float)*P); 

    for (int i = 0; i < P; ++i) 
     hostP[i] = 0; 

    for (int i = 0; i <width; i++) 
     c[i] = i + 1; 

    for (int i = 0; i <width; i++) { 
     for (int j = 0; j <width; j++) { 
      hostM[i*width + j] = c[j] + i; 
      hostN[i*width + j] = c[j] + i; 
     } 
    } 

    matrixmutiplacation(hostM, hostN, hostP, width); 

    //for (int i = 0; i <width; i++){ 
    //for (int j = 0; j <width; j++){ 
    // printf("%f\t", hostP[i*width + j]); 
    //} 
    // printf("\n"); 
    //} 

    free(hostM); 
    free(hostN); 
    free(hostP); 

    return 0; 

} 

回答

4

你threadblock尺寸128x128x1 = 16K,最大尺寸threadblock是1024。內核只是不運行。嘗試使用cuda-memcheck運行應用程序,它可能會告訴你代碼有問題。檢查結果代碼CUDA運行時API函數返回錯誤也是一種很好的做法。

+0

我認爲「1024 * 1024 * 64」是每塊的線程大小。謝謝 ! –

+0

@CalvinLouBME:塊的最大*維*和每塊的最大線程數是不同的東西,並且都適用於限制合法塊大小 – talonmies