
Optimizing a CUDA kernel that uses a stencil pattern

In the course of writing a program for processing digital images, I have written a CUDA kernel that runs slowly:

__global__ void Kernel (int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2) { 
    float G, h; 
    float fx[3]; 
    float fy[3]; 
    float g[2][2]; 
    float k10 = 0.0; 
    float k11 = 0.0; 
    float k12 = 0.0; 
    float k20 = 0.0; 
    float k21 = 0.0; 
    float k22 = 0.0; 
    float k30 = 0.0; 
    float k31 = 0.0; 
    float k32 = 0.0; 

    int xIndex = blockIdx.x * blockDim.x + threadIdx.x; 
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y; 

    if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2)) 
    { 
     int idx0 = yIndex * width + xIndex; 
     if (inputArray[idx0] > 0) 
     { 
      for (int i = 0; i < kc2; i++) 
      { 
       for (int j = 0; j < kc2; j++) 
       { 
        int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2); 
        float3 rgb = col_image[idx1]; 
        k10 = k10 + constMat1[i * kc2 + j] * rgb.x; 
        k11 = k11 + constMat1[i * kc2 + j] * rgb.y; 
        k12 = k12 + constMat1[i * kc2 + j] * rgb.z; 

        k20 = k20 + constMat2[i * kc2 + j] * rgb.x; 
        k21 = k21 + constMat2[i * kc2 + j] * rgb.y; 
        k22 = k22 + constMat2[i * kc2 + j] * rgb.z; 

        k30 = k30 + constMat3[i * kc2 + j] * rgb.x; 
        k31 = k31 + constMat3[i * kc2 + j] * rgb.y; 
        k32 = k32 + constMat3[i * kc2 + j] * rgb.z; 
       } 
      } 
      fx[0] = kc2 * (k30 - k20); 
      fx[1] = kc2 * (k31 - k21); 
      fx[2] = kc2 * (k32 - k22); 
      fy[0] = kc2 * (k10 - k20); 
      fy[1] = kc2 * (k11 - k21); 
      fy[2] = kc2 * (k12 - k22); 

      g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2]; 
      g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2]; 
      g[1][0] = g[0][1]; 
      g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2] 
      G = g[0][0] * g[1][1] - g[0][1] * g[1][0]; 
      h = g[0][0] + g[1][1]; 

      // Output 
      int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2); 
      outputArray[idx2] = (h * h)/G; 
     } 
    } 
} 

Here, some (non-negative) values of inputArray are processed, as shown in the kernel above. The array col_image contains the color components in the RGB model. If the value of inputArray satisfies the condition, we compute the specific coefficients k_{ij} over a kc2 x kc2 neighborhood centered on the point under consideration (kc2 is either 3 or 5). The values of constMat[1,2,3] are stored in the device's constant memory:

__device__ __constant__ float constMat[]; 
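For completeness, here is a minimal sketch of how such coefficient arrays could be declared with a fixed size and filled from the host (the fixed 5x5 size, the helper function and the cudaMemcpyToSymbol upload are illustrative assumptions; my actual host-side setup is not shown here):

#include <cuda_runtime.h>

// Illustrative fixed-size declarations (kc2 is at most 5):
__constant__ float constMat1[5 * 5];
__constant__ float constMat2[5 * 5];
__constant__ float constMat3[5 * 5];

// Hypothetical host-side upload of the three kc2*kc2 coefficient matrices.
void uploadCoefficients(const float* h_mat1, const float* h_mat2,
                        const float* h_mat3, int kc2)
{
    size_t bytes = (size_t)kc2 * kc2 * sizeof(float);
    cudaMemcpyToSymbol(constMat1, h_mat1, bytes);
    cudaMemcpyToSymbol(constMat2, h_mat2, bytes);
    cudaMemcpyToSymbol(constMat3, h_mat3, bytes);
}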

We then compute the features fx, fy, g_{ij}, h and G, and write the resulting values to outputArray.

It is important that all of the data listed above is stored in global memory, and that the input array can be quite large (on the order of 400,000 points). All of this directly affects the speed of the kernel.

How can the execution of this kernel be sped up? Any technique is welcome: using shared memory or textures, using stencil techniques, and so on.

Answer


The "standard" suggestion here would be what I would call the "standard" use of shared memory: buffering a block of col_image for use (and reuse) by the threadblock.

Based on my testing, it appears to provide a substantial improvement. Since you did not provide a complete code, or any kind of data set or results validation, I will skip all of that. What follows, then, is a not-actually-validated application of shared memory to your existing code, which "buffers" a (threadblockwidth + kc2) * (threadblockheight + kc2) "patch" of the col_image input data into a shared memory buffer. Thereafter, during the doubly nested for-loops, the data is read out of the shared memory buffer.

A 2D shared-memory stencil operation like this is an exercise in indexing as well as in handling edge cases. Your code is somewhat simpler in that we only need to consider the "right" and "bottom" edges when caching the data into shared memory.

I have not attempted to verify that this code is perfect. However, it should give you a "roadmap" for how to implement a 2D shared-memory buffering system, and some motivation for making the effort, although YMMV and it is entirely possible that I have made a performance mistake somewhere.

Here is a worked example showing the speedup on a Pascal Titan X, CUDA 8.0.61, Linux:

$ cat t390.cu 
#include <stdio.h> 
#include <iostream> 

const int adim = 6000; 
const int KC2 = 5; 
const int thx = 32; 
const int thy = 32; 
__constant__ float constMat1[KC2*KC2]; 
__constant__ float constMat2[KC2*KC2]; 
__constant__ float constMat3[KC2*KC2]; 

__global__ void Kernel (int* inputArray, float* outputArray, float3* const col_image, int height, int width, int kc2) { 
    float G, h; 
    float fx[3]; 
    float fy[3]; 
    float g[2][2]; 
    float k10 = 0.0; 
    float k11 = 0.0; 
    float k12 = 0.0; 
    float k20 = 0.0; 
    float k21 = 0.0; 
    float k22 = 0.0; 
    float k30 = 0.0; 
    float k31 = 0.0; 
    float k32 = 0.0; 

    int xIndex = blockIdx.x * blockDim.x + threadIdx.x; 
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y; 
    int idx0 = yIndex * width + xIndex; 

#ifdef USE_SHARED 
    __shared__ float3 s_col_image[thy+KC2][thx+KC2]; 
    int idx = xIndex; 
    int idy = yIndex; 
    int DATAHSIZE= height; 
    int WSIZE = kc2; 
    int DATAWSIZE = width; 
    float3 *input = col_image; 
    int BLKWSIZE = thx; 
    int BLKHSIZE = thy; 
    // Stage a (thy+KC2) x (thx+KC2) patch of col_image in shared memory.
    // Note: the guards below mix height/width and thx/thy; that is harmless in
    // this test because the image and the threadblock are both square.
    if ((idx < DATAHSIZE+WSIZE) && (idy < DATAWSIZE+WSIZE)) 
     s_col_image[threadIdx.y][threadIdx.x]=input[idx0];  // main tile element for this thread
    if ((idx < DATAHSIZE+WSIZE) && (idy < DATAWSIZE) && (threadIdx.y > BLKWSIZE - WSIZE)) 
     s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x] = input[idx0+(WSIZE-1)*width];  // bottom halo rows
    if ((idx < DATAHSIZE) && (idy < DATAWSIZE+WSIZE) && (threadIdx.x > BLKHSIZE - WSIZE)) 
     s_col_image[threadIdx.y][threadIdx.x + (WSIZE-1)] = input[idx0+(WSIZE-1)];  // right halo columns
    if ((idx < DATAHSIZE) && (idy < DATAWSIZE) && (threadIdx.x > BLKHSIZE - WSIZE) && (threadIdx.y > BLKWSIZE - WSIZE)) 
     s_col_image[threadIdx.y + (WSIZE-1)][threadIdx.x + (WSIZE-1)] = input[idx0+(WSIZE-1)*width + (WSIZE-1)];  // bottom-right halo corner
    __syncthreads(); 
#endif 


    if ((xIndex < width - kc2/2) && (xIndex >= kc2/2) && (yIndex < height - kc2/2) && (yIndex >= kc2/2)) 
    { 
     if (inputArray[idx0] > 0) 
     { 
      for (int i = 0; i < kc2; i++) 
      { 
       for (int j = 0; j < kc2; j++) 
       { 
#ifdef USE_SHARED 
        float3 rgb = s_col_image[threadIdx.y + i][threadIdx.x + j];  // read the neighborhood from the shared tile
#else 
        int idx1 = (yIndex + i - kc2/2) * width + (xIndex + j - kc2/2); 
        float3 rgb = col_image[idx1]; 
#endif 
        k10 = k10 + constMat1[i * kc2 + j] * rgb.x; 
        k11 = k11 + constMat1[i * kc2 + j] * rgb.y; 
        k12 = k12 + constMat1[i * kc2 + j] * rgb.z; 

        k20 = k20 + constMat2[i * kc2 + j] * rgb.x; 
        k21 = k21 + constMat2[i * kc2 + j] * rgb.y; 
        k22 = k22 + constMat2[i * kc2 + j] * rgb.z; 

        k30 = k30 + constMat3[i * kc2 + j] * rgb.x; 
        k31 = k31 + constMat3[i * kc2 + j] * rgb.y; 
        k32 = k32 + constMat3[i * kc2 + j] * rgb.z; 
       } 
      } 
      fx[0] = kc2 * (k30 - k20); 
      fx[1] = kc2 * (k31 - k21); 
      fx[2] = kc2 * (k32 - k22); 
      fy[0] = kc2 * (k10 - k20); 
      fy[1] = kc2 * (k11 - k21); 
      fy[2] = kc2 * (k12 - k22); 

      g[0][0] = fx[0] * fx[0] + fx[1] * fx[1] + fx[2] * fx[2]; 
      g[0][1] = fx[0] * fy[0] + fx[1] * fy[1] + fx[2] * fy[2]; 
      g[1][0] = g[0][1]; 
      g[1][1] = fy[0] * fy[0] + fy[1] * fy[1] + fy[2] * fy[2]; // had a missing semicolon 
      G = g[0][0] * g[1][1] - g[0][1] * g[1][0]; 
      h = g[0][0] + g[1][1]; 

      // Output 
      int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2); // possible indexing bug here 
      outputArray[idx2] = (h * h)/G; 
     } 
    } 
} 

int main(){ 

    int *d_inputArray; 
    int height = adim; 
    int width = adim; 
    float *d_outputArray; 
    float3 *d_col_image; 
    int kc2 = KC2; 
    cudaMalloc(&d_inputArray, height*width*sizeof(int)); 
    cudaMemset(d_inputArray, 1, height*width*sizeof(int)); 
    cudaMalloc(&d_col_image, (height+kc2)*(width+kc2)*sizeof(float3)); 
    cudaMalloc(&d_outputArray, height*width*sizeof(float)); 
    dim3 threads(thx,thy); 
    dim3 blocks((adim+threads.x-1)/threads.x, (adim+threads.y-1)/threads.y); 
    Kernel<<<blocks,threads>>>(d_inputArray, d_outputArray, d_col_image, height, width, kc2); 
    cudaDeviceSynchronize(); 
} 
$ nvcc -arch=sm_61 -o t390 t390.cu 
$ cuda-memcheck ./t390 
========= CUDA-MEMCHECK 
========= ERROR SUMMARY: 0 errors 
$ nvprof ./t390 
==1473== NVPROF is profiling process 1473, command: ./t390 
==1473== Profiling application: ./t390 
==1473== Profiling result: 
Time(%)  Time  Calls  Avg  Min  Max Name 
97.29% 34.705ms   1 34.705ms 34.705ms 34.705ms Kernel(int*, float*, float3*, int, int, int) 
    2.71% 965.14us   1 965.14us 965.14us 965.14us [CUDA memset] 

==1473== API calls: 
Time(%)  Time  Calls  Avg  Min  Max Name 
88.29% 310.69ms   3 103.56ms 550.23us 309.46ms cudaMalloc 
    9.86% 34.712ms   1 34.712ms 34.712ms 34.712ms cudaDeviceSynchronize 
    1.05% 3.6801ms  364 10.110us  247ns 453.59us cuDeviceGetAttribute 
    0.70% 2.4483ms   4 612.07us 547.62us 682.25us cuDeviceTotalMem 
    0.08% 284.32us   4 71.079us 63.098us 79.616us cuDeviceGetName 
    0.01% 29.533us   1 29.533us 29.533us 29.533us cudaMemset 
    0.01% 21.189us   1 21.189us 21.189us 21.189us cudaLaunch 
    0.00% 5.2730us  12  439ns  253ns 1.1660us cuDeviceGet 
    0.00% 3.4710us   6  578ns  147ns 2.4820us cudaSetupArgument 
    0.00% 3.1090us   3 1.0360us  340ns 2.1660us cuDeviceGetCount 
    0.00% 1.0370us   1 1.0370us 1.0370us 1.0370us cudaConfigureCall 
$ nvcc -arch=sm_61 -o t390 t390.cu -DUSE_SHARED 
$ cuda-memcheck ./t390 
========= CUDA-MEMCHECK 
========= ERROR SUMMARY: 0 errors 
$ nvprof ./t390 
==1545== NVPROF is profiling process 1545, command: ./t390 
==1545== Profiling application: ./t390 
==1545== Profiling result: 
Time(%)  Time  Calls  Avg  Min  Max Name 
86.17% 5.4181ms   1 5.4181ms 5.4181ms 5.4181ms Kernel(int*, float*, float3*, int, int, int) 
13.83% 869.94us   1 869.94us 869.94us 869.94us [CUDA memset] 

==1545== API calls: 
Time(%)  Time  Calls  Avg  Min  Max Name 
96.13% 297.15ms   3 99.050ms 555.80us 295.90ms cudaMalloc 
    1.76% 5.4281ms   1 5.4281ms 5.4281ms 5.4281ms cudaDeviceSynchronize 
    1.15% 3.5664ms  364 9.7970us  247ns 435.92us cuDeviceGetAttribute 
    0.86% 2.6475ms   4 661.88us 642.85us 682.42us cuDeviceTotalMem 
    0.09% 266.42us   4 66.603us 62.005us 77.380us cuDeviceGetName 
    0.01% 29.624us   1 29.624us 29.624us 29.624us cudaMemset 
    0.01% 19.147us   1 19.147us 19.147us 19.147us cudaLaunch 
    0.00% 4.8560us  12  404ns  248ns  988ns cuDeviceGet 
    0.00% 3.3390us   6  556ns  134ns 2.3510us cudaSetupArgument 
    0.00% 3.1190us   3 1.0390us  331ns 2.0780us cuDeviceGetCount 
    0.00% 1.1940us   1 1.1940us 1.1940us 1.1940us cudaConfigureCall 
$ 

We can see that the kernel execution time is ~35 ms in the non-shared case and ~5.5 ms in the shared case. I used kc2=5 for this test. For the kc2=3 case, the performance gain will be smaller.

A few notes:

  1. The code you posted was missing a semicolon on one line. I have added it and marked that line in the code.

  2. I suspect there may be an indexing bug in the "Output" write to outputArray. Your indexing is this:

    int idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2); 
    

    whereas I would have expected this:

    int idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2); 
    

    but I have not thought carefully about it, so I may be wrong here; which form is correct depends on how outputArray is actually laid out (see the sketch after these notes). In the future, if you want help with a question like this, I suggest you provide at least the same level of code scaffolding and description as here: a complete code that others can pick up and test immediately without having to write their own. Also state the platform you are working on and how you are measuring performance.
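To make the stride question concrete, here is a small sketch of the two output layouts that the two index formulas would correspond to (both allocations are hypothetical; the question does not show how outputArray is allocated, and my test harness above simply allocates the full height*width):

#include <cuda_runtime.h>

// Hypothetical allocations illustrating which row stride goes with which layout.
void allocateOutputs(int height, int width, int kc2,
                     float** d_out_cropped, float** d_out_full)
{
    // The posted stride of (width - kc2) would only be meaningful for a cropped
    // output image stored with that row pitch, e.g.:
    //  idx2 = (yIndex - kc2/2) * (width - kc2) + (xIndex - kc2/2);
    cudaMalloc(d_out_cropped, (size_t)(height - kc2) * (width - kc2) * sizeof(float));

    // With a full-size height x width output (which is what the test harness above
    // allocates), the row stride must be the full width, i.e. the suggested form:
    //  idx2 = (yIndex - kc2/2) * width + (xIndex - kc2/2);
    cudaMalloc(d_out_full, (size_t)height * width * sizeof(float));
}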


Thank you very much for the detailed answer and for the code, written and tested. –