CUDA matrix transpose with shared memory

I need to implement a matrix transpose function on the GPU using shared memory. I first did it in a simple way without shared memory, and that works fine; I then attempted a version using shared memory. Unfortunately the computation is not correct and I cannot figure out why. A complete working example can be found here and at the bottom of this question.

EDIT 1

I also know that the first index at which the result holds a wrong value is index 32 (of the flattened matrix, so matr[0][32] in 2D terms).

If any further information is needed, I will gladly provide it.

A snippet of the non-working function, taken from the full code, is listed below:

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, 
    const int height, const int nreps) 
{ 
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 
    int blockIdx_y = blockIdx.x; 
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; 
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; 
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; 
    int index_in = xIndex + (yIndex)* width; 

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x; 
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y; 
    int index_out = xIndex + (yIndex)* height; 

    int r, i; 
#pragma unroll 
    for (r = 0; r < nreps; r++) 
    { 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width]; 

     __syncthreads(); 

#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if (index_in + i * width < width * height) 
       matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; 
    } 
} 

The output looks like this:

Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s 

Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s 
    Correct: 50000, Wrong: 0 

Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s 
    Correct: 12352, Wrong: 37648 

Below is the complete working example. I stripped most of the unnecessary code from it, so it is less cluttered:

#include "cuda_runtime.h" 
#include "device_functions.h" 
#include "device_launch_parameters.h" 

#include <chrono> 
#include <time.h> 
#include <stdio.h> 
#include <stdlib.h> 

#define TILE_DIM 32 
#define BLOCK_ROWS 8 
#define BLOCK_COLS 32 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation); 
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 

int main() 
{ 
    int i, width, height, nreps, size, wrong, correct; 
    double cpuTime, cpuBandwidth; 
    cudaError_t cudaStatus; 

    float *matrA, *matrATC, *matrATG, *matrAC; 

    srand(time(NULL)); 

    nreps = 10000; 
    width = 500; 
    height = 100; 
    size = width * height; 

    matrA = (float*)malloc(size * sizeof(float)); // matrix A 
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied 
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU 
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU 

    for (i = 0; i < size; i++) 
    { 
     matrA[i] = (float)i; 
    } 

    auto start = std::chrono::high_resolution_clock::now(); 

    //CPU Transpose 
    cpuMatrTrans(matrATC, matrA, width, height, nreps); 

    auto end = std::chrono::high_resolution_clock::now(); 

    std::chrono::duration<double> diff = end - start; 
    cpuTime = (diff.count() * 1000)/nreps; 
    cpuBandwidth = (sizeof(float) * size * 2)/(cpuTime * 1000000);//scaling from ms to s and B to GB done implicitly, shortened in the fraction; times two for read and write 
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth); 

    correct = 0; 
    wrong = 0; 

    //Naive transpose 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    //Transpose with shared memory 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n"); 
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceReset failed!\n"); 
     return 1; 
    } 

    return 0; 
} 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation) 
{ 
    float elapsed = 0; 
    float *dev_matrA = 0; 
    float *dev_matrB = 0; 
    cudaError_t cudaStatus; 
    dim3 dim_grid, dim_block; 
    double gpuBandwidth; 

    int size = width * height; 

    dim_block.x = TILE_DIM; 
    dim_block.y = BLOCK_ROWS; 
    dim_block.z = 1; 

    dim_grid.x = (width + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.y = (height + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.z = 1; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    // Allocate GPU buffers for the two matrices 
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    // Copy input matrix from host memory to GPU buffers. 
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    switch (operation) 
    { 
     case(1): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      naiveTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB done implicitly, shortened in the fraction; times two for read and write 
      printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

     case(2): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      notSoNaivaTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB done implicitly, shortened in the fraction; times two for read and write 
      printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

    default: 
     printf("No matching opcode was found.\n"); 
    } 

    // Check for any errors launching the kernel 
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
     goto Error; 
    } 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus); 
     goto Error; 
    } 

    // Copy output matrix from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

Error: 
    cudaFree(dev_matrB); 
    cudaFree(dev_matrA); 

    return cudaStatus; 
} 

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, j, r; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < height; i++) 
#pragma unroll 
      for (j = 0; j < width; j++) 
       matrB[j * height + i] = matrA[i * width + j]; 
} 

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, r; 
    int row = blockIdx.x * TILE_DIM + threadIdx.x; 
    int col = blockIdx.y * TILE_DIM + threadIdx.y; 
    int index_in = row + width * col; 
    int index_out = col + height * row; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if (index_in + i * width < width * height) 
       matrB[index_out + i] = matrA[index_in + i * width]; 
} 

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 
    int blockIdx_y = blockIdx.x; 
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; 
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; 
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; 
    int index_in = xIndex + (yIndex)* width; 

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x; 
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y; 
    int index_out = xIndex + (yIndex)* height; 

    int r, i; 
#pragma unroll 
    for (r = 0; r < nreps; r++) 
    { 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width]; 

     __syncthreads(); 

#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if (index_in + i * width < width * height) 
       matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; 
    } 
} 
+2

Apparently you want to be able to do a non-square matrix transpose? Or only a square matrix transpose? You should include a [mcve] in the question itself, not in an external link. –

+0

Yes, I also want the transpose to work on non-square matrices. I put it externally for readability, but as you wish, I will add it to the question as well. – JRsz

+1

Your code is making out-of-bounds accesses. Any time you are having trouble with a CUDA code, you should use [proper cuda error checking](http://stackoverflow.com/questions/14038589) and run your code with 'cuda-memcheck'. Even if you don't understand the error output, it will be useful to others trying to help you. Also note that while I understand this is probably a learning exercise, for serious work it is recommended to use a library routine such as cublas [geam](http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam). –
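
For reference, a minimal sketch of what the suggested cublasSgeam route might look like (not part of the original thread: the wrapper name is hypothetical, error checking is omitted, and it assumes the question's layout, a height x width row-major input, which cuBLAS sees as a width x height column-major matrix):

#include <cublas_v2.h>

// Sketch only: transpose dev_A (height x width, row-major) into dev_B.
// cublasSgeam computes C = alpha*op(A) + beta*op(B); with beta == 0 and
// op(A) = transpose(A), this is a pure out-of-place transpose.
void cublasTranspose(float *dev_B, const float *dev_A, int width, int height)
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f, beta = 0.0f;
    cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                height, width,          // dimensions of the result (column-major view)
                &alpha, dev_A, width,   // lda: leading dimension of the input
                &beta, dev_B, height,   // B is not read, since beta == 0
                dev_B, height);         // ldc: leading dimension of the result

    cublasDestroy(handle);
}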

Answer

1

There are a number of problems with this code. I'm not sure I'll be able to cover all of them.

Probably the most important issue is your lack of (and lack of understanding of) a proper 2D thread check. Your algorithm creates a grid of threads that is larger than the problem size in both dimensions. This creates logical threads outside the dimensions of your matrix, in both directions.

You attempted to create a 2D thread check like this:

 if (index_in + i * width < width * height) 

This won't work. Suppose I have a 3x3 matrix and a 4x4 grid of threads. The thread at (3,0) is clearly out of bounds for your matrix, but it would pass your 2D thread check.

A proper thread check in this case must test each dimension separately, not as a product.
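
A small standalone sketch of the (3,0) example (variable names follow the corrected naive kernel below) shows the difference between the two checks:

#include <stdio.h>

int main(void)
{
    // 3x3 matrix covered by a 4x4 grid of threads, as in the example above.
    // The thread at (col=3, row=0) is out of bounds.
    const int width = 3, height = 3;
    const int col = 3, row = 0, i = 0;
    const int index_in = col + width * row;   // = 3

    // Product-style test: 3 < 9, so the out-of-bounds thread slips through.
    printf("product test passes: %d\n", index_in + i * width < width * height);

    // Per-dimension test: col < width already fails for col = 3.
    printf("per-dimension test passes: %d\n", (row + i < height) && (col < width));
    return 0;
}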

Note that this logical error exists in your "naive" transpose kernel as well, and you can confirm that by running your code with cuda-memcheck. It will flag out-of-bounds access errors in that kernel, even though it appears to work correctly.

There were various other problems as well. Most of these had to do with the indexing in your shared memory kernel. It was not clear to me that you understood the indexing manipulation necessary for a shared memory transpose. There are two separate index transpositions that must be done in this case:

  1. transpose the block (tile) indices
  2. transpose the thread indices

Transposing the thread indices is done on the read/write of shared memory. You have this correctly accounted for with a reversed use of threadIdx.x and threadIdx.y for the shared memory read/write. But as near as I can tell, your index generation for the reversal of the block indices (the reversal that is used during the read/write to global memory) was broken. That was the other major issue to sort out.
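
Distilled from the fixed shared-memory kernel below (these lines are taken from it, with comments added here for illustration), the two transpositions look like this:

    // Read side: normal mapping -- blockIdx.x tiles the columns (width),
    // blockIdx.y tiles the rows (height) of the input.
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x; // input column
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y; // input row
    int index_in = ciIndex + riIndex * width;

    // Write side: the *block* indices are swapped (blockIdx.y now selects
    // the output column, blockIdx.x the output row), while threadIdx.x and
    // threadIdx.y keep their roles. The thread-index transpose happens
    // separately, at the shared-memory read: tile[threadIdx.x][threadIdx.y + i].
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x; // output column
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y; // output row
    int index_out = coIndex + roIndex * height;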

The following code has these and some other problems fixed, and appears to work correctly for me:

$ cat t33.cu 
#include "cuda_runtime.h" 
#include "device_functions.h" 
#include "device_launch_parameters.h" 

#include <chrono> 
#include <time.h> 
#include <stdio.h> 
#include <stdlib.h> 

#define TILE_DIM 32 
#define BLOCK_ROWS 8 
#define BLOCK_COLS 32 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation); 
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps); 

int main() 
{ 
    int i, width, height, nreps, size, wrong, correct; 
    double cpuTime, cpuBandwidth; 
    cudaError_t cudaStatus; 

    float *matrA, *matrATC, *matrATG, *matrAC; 

    srand(time(NULL)); 

    nreps = 10000; 
    width = 500; 
    height = 100; 


    size = width * height; 

    matrA = (float*)malloc(size * sizeof(float)); // matrix A 
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied 
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU 
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU 

    for (i = 0; i < size; i++) 
    { 
     matrA[i] = (float)i; 
    } 

    auto start = std::chrono::high_resolution_clock::now(); 

    //CPU Transpose 
    cpuMatrTrans(matrATC, matrA, width, height, nreps); 

    auto end = std::chrono::high_resolution_clock::now(); 

    std::chrono::duration<double> diff = end - start; 
    cpuTime = (diff.count() * 1000)/nreps; 
    cpuBandwidth = (sizeof(float) * size * 2)/(cpuTime * 1000000);//scaling from ms to s and B to GB done implicitly, shortened in the fraction; times two for read and write 
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth); 

    correct = 0; 
    wrong = 0; 

    //Naive transpose 
    memset(matrATG, 0, size*sizeof(float)); 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    //Transpose with shared memory 
    memset(matrATG, 0, size*sizeof(float)); 
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2); 

    //Check if calc was correct 
    for (i = 0; i < size; i++) 
    { 
     if (matrATC[i] != matrATG[i]) 
     { 
      /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]); 
      return;*/ 
      wrong++; 
     } 
     else 
     { 
      correct++; 
     } 
    } 

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n"); 
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong); 
    correct = 0; 
    wrong = 0; 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceReset failed!\n"); 
     return 1; 
    } 

    return 0; 
} 

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation) 
{ 
    float elapsed = 0; 
    float *dev_matrA = 0; 
    float *dev_matrB = 0; 
    cudaError_t cudaStatus; 
    dim3 dim_grid, dim_block; 
    double gpuBandwidth; 

    int size = width * height; 

    dim_block.x = TILE_DIM; 
    dim_block.y = BLOCK_ROWS; 
    dim_block.z = 1; 

    dim_grid.x = (width + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.y = (height + TILE_DIM - 1)/TILE_DIM; 
    dim_grid.z = 1; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    // Allocate GPU buffers for the two matrices 
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    // Copy input matrix from host memory to GPU buffers. 
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 
    cudaMemset(dev_matrB, 0, size * sizeof(float)); 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    switch (operation) 
    { 
     case(1): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      naiveTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB done implicitly, shortened in the fraction; times two for read and write 
      printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

     case(2): 
     { 
      cudaEventRecord(start); 
      // Launch a kernel on the GPU with one thread for each element. 
      notSoNaivaTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps); 

      cudaEventRecord(stop); 
      cudaEventSynchronize(stop); 

      cudaEventElapsedTime(&elapsed, start, stop); 
      cudaEventDestroy(start); 
      cudaEventDestroy(stop); 

      elapsed /= nreps; 

      gpuBandwidth = (sizeof(float) * size * 2)/(elapsed * 1000000);//scaling from ms to s and B to GB done implicitly, shortened in the fraction; times two for read and write 
      printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth); 

      break; 
     } 

    default: 
     printf("No matching opcode was found.\n"); 
    } 

    // Check for any errors launching the kernel 
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
     goto Error; 
    } 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus); 
     goto Error; 
    } 

    // Copy output matrix from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

Error: 
    cudaFree(dev_matrB); 
    cudaFree(dev_matrA); 

    return cudaStatus; 
} 

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, j, r; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < height; i++) 
#pragma unroll 
      for (j = 0; j < width; j++) 
       matrB[j * height + i] = matrA[i * width + j]; 
} 

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    int i, r; 
    int col = blockIdx.x * TILE_DIM + threadIdx.x; 
    int row = blockIdx.y * TILE_DIM + threadIdx.y; 
    int index_in = col + width * row; 
    int index_out = row + height * col; 

#pragma unroll 
    for (r = 0; r < nreps; r++) 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if ((row+i<height) && (col < width)) 
       matrB[index_out + i] = matrA[index_in + i * width]; 
} 

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps) 
{ 
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x; 
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y; 
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x; 
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y; 
    int index_in = ciIndex + (riIndex)* width; 
    int index_out = coIndex + (roIndex)* height; 


    int r, i; 
#pragma unroll 
    for (r = 0; r < nreps; r++) 
    { 
#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if ((ciIndex<width) && (riIndex+i < height)) 
       tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width]; 
     __syncthreads(); 

#pragma unroll 
     for (i = 0; i < TILE_DIM; i += BLOCK_ROWS) 
      if ((coIndex<height) && (roIndex+i < width)) 
       matrB[index_out + i*height] = tile[threadIdx.x][threadIdx.y + i]; 
     __syncthreads(); 
    } 
} 
$ nvcc -std=c++11 -arch=sm_61 -o t33 t33.cu 
t33.cu(25): warning: variable "matrAC" was set but never used 

t33.cu(25): warning: variable "matrAC" was set but never used 

$ cuda-memcheck ./t33 
========= CUDA-MEMCHECK 
Avg. CPU Transpose Time: 0.143087 ms, Bandwidth: 2.795509 GB/s 

Avg. GPU Naive Trans Time: 0.028587 ms, bandwidth: 13.992195 GB/s 
     Correct: 50000, Wrong: 0 

Avg. GPU Trans with SM Time: 0.040328 ms, bandwidth: 9.918678 GB/s 
     Correct: 50000, Wrong: 0 

========= ERROR SUMMARY: 0 errors 
$ ./t33 
Avg. CPU Transpose Time: 0.140469 ms, Bandwidth: 2.847594 GB/s 

Avg. GPU Naive Trans Time: 0.003828 ms, bandwidth: 104.505440 GB/s 
     Correct: 50000, Wrong: 0 

Avg. GPU Trans with SM Time: 0.000715 ms, bandwidth: 559.206604 GB/s 
     Correct: 50000, Wrong: 0 

$ 

A note: this code attempts to measure bandwidth. However, you should be aware that the bandwidth measured here is influenced by cache bandwidth. Your matrix sizes (500x100 = 200KB each for the input and the output) are easily small enough to fit in the L2 cache on most GPUs. That fact, coupled with the fact that you are running the same transpose many times (nreps), means that much of the work is operating directly out of the L2 cache. Therefore, in the "optimized" case above we see a reported bandwidth number that greatly exceeds the available memory bandwidth of the GPU (this case happens to be a Pascal Titan X, so about 340GB/s of main memory bandwidth is available). This is because the measurement includes some benefit from the L2 cache, whose bandwidth is at least twice that of main memory. You can eliminate this effect by using a significantly larger matrix size and/or by reducing nreps to 1.
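
To put numbers on that, a small self-contained sketch of the arithmetic (the 0.000715 ms figure is the averaged shared-memory kernel time from the run above):

#include <stdio.h>

int main(void)
{
    const int width = 500, height = 100;
    const double elapsed_ms = 0.000715; // avg. shared-memory kernel time above
    const double bytes = sizeof(float) * (double)(width * height) * 2.0; // read + write

    // 200,000 bytes per matrix: input and output together easily fit in L2.
    printf("footprint per matrix: %d bytes\n", (int)(width * height * sizeof(float)));

    // Same formula the benchmark uses: bytes/(ms * 1e6) == GB/s.
    printf("apparent bandwidth: %.1f GB/s\n", bytes/(elapsed_ms * 1e6)); // ~559 GB/s
    return 0;
}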

+0

You are right, this is for a lecture, and as you have realized I am very new to this and not familiar enough with the indexing. Your solution works and I appreciate your help very much. I will try to understand every line of it so that I don't make these mistakes again in the future. On a side note, my algorithm came from our lecturer :/ Do you have a good reference that could help me understand the exact indexing and the block and grid layout? If the link you provided covers this, forget it, I will read it without hesitation! And btw, nice GPU you got there! A 5000x1000 matrix crashes my driver :( – JRsz

+1

The best reference I can point to is the one I already linked in my answer. However, for simplicity of presentation it covers the case where the matrices are square and the matrix dimensions are evenly divisible by the tile dimension. Nevertheless, many aspects of the algorithm you presented seem similar to it, even down to the use of the 'BLOCK_ROWS' literal and the particular kernel loop approach where each thread performs several transposes, so I imagine your lecturer is familiar with the blog I linked. –

+1

Regarding the driver crash, it seems evident that you are working on a windows platform. For windows GPUs in WDDM mode (probably the case for your test), the WDDM TDR watchdog limits GPU kernel execution to about 2 seconds. You can work around this if you wish (google "WDDM TDR cuda"), but it is the most likely reason for your driver crash. The larger matrix transpose, combined with the large number of repetitions, caused the kernel to take more than 2 seconds (probably for the naive kernel as well). So you might also try to work around the problem by reducing 'nreps'. –