CUDA中的FIR濾波器（作爲1D卷積）

我正試圖在CUDA中實現FIR（有限衝激響應）濾波器。我的方法很簡單，大致如下：

#include <cuda.h> 

/**
 * FIR filter evaluated as a direct 1D convolution: each thread computes one
 * output sample.
 *
 * Launch layout: 1D grid of 1D blocks with at least filteredDataLength threads
 * in total. Threads whose global index is >= filteredDataLength do nothing.
 *
 * @param d_data             Input samples. The first (numeratorLength - 1)
 *                           elements contain the filter state, so the buffer
 *                           must hold at least
 *                           filteredDataLength + numeratorLength - 1 elements.
 * @param d_numerator        Filter coefficients (numeratorLength elements).
 * @param d_filteredData     Output buffer (filteredDataLength elements).
 * @param numeratorLength    Number of filter coefficients.
 * @param filteredDataLength Number of output samples to produce.
 */
__global__ void filterData(const float *d_data,
          const float *d_numerator,
          float *d_filteredData,
          const int numeratorLength,
          const int filteredDataLength)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to a whole number of blocks, so the last block
    // may contain surplus threads. They must neither read nor write memory;
    // the store below used to be executed by them as well (out-of-bounds write).
    if (i >= filteredDataLength)
    {
        return;
    }

    float sum = 0.0f;

    for (int j = 0; j < numeratorLength; j++)
    {
        // The first (numeratorLength-1) elements contain the filter state
        sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];
    }

    d_filteredData[i] = sum;
}

/**
 * Host driver for filterData: allocates host and device buffers, uploads the
 * input samples and the filter coefficients, launches the kernel with one
 * thread per output sample and downloads the filtered result.
 */
int main(void)
{
    // (Skipping error checks to make code more readable)

    int dataLength = 18042;
    int filteredDataLength = 16384;
    int numeratorLength= 1659;

    // Pointers to data, filtered data and filter coefficients
    // (Skipping how these are read into the arrays)
    float *h_data = new float[dataLength];
    float *h_filteredData = new float[filteredDataLength];
    float *h_filter = new float[numeratorLength];


    // Create device pointers
    float *d_data = nullptr;
    cudaMalloc((void **)&d_data, dataLength * sizeof(float));

    float *d_numerator = nullptr;
    cudaMalloc((void **)&d_numerator, numeratorLength * sizeof(float));

    float *d_filteredData = nullptr;
    cudaMalloc((void **)&d_filteredData, filteredDataLength * sizeof(float));


    // Copy data to device
    cudaMemcpy(d_data, h_data, dataLength * sizeof(float), cudaMemcpyHostToDevice);
    // The host coefficient buffer is h_filter; the previous code referenced an
    // undeclared h_numerator here and therefore did not compile.
    cudaMemcpy(d_numerator, h_filter, numeratorLength * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the kernel: ceil-divide so that every output sample gets a thread
    int threadsPerBlock = 256;
    int blocksPerGrid = (filteredDataLength + threadsPerBlock - 1)/threadsPerBlock;
    filterData<<<blocksPerGrid,threadsPerBlock>>>(d_data, d_numerator, d_filteredData, numeratorLength, filteredDataLength);

    // Copy results to host (this blocking copy also waits for the kernel)
    cudaMemcpy(h_filteredData, d_filteredData, filteredDataLength * sizeof(float), cudaMemcpyDeviceToHost);

    // Clean up
    cudaFree(d_data);
    cudaFree(d_numerator);
    cudaFree(d_filteredData);

    // Do stuff with h_filteredData...

    // Clean up some more
    delete [] h_data;
    delete [] h_filteredData;
    delete [] h_filter;
}

這個濾波器可以正常工作，但我是CUDA編程的新手，不知道該如何優化它。

我看到的一個小問題是：在我打算使用該濾波器的應用程序中，dataLength、filteredDataLength和numeratorLength事先是未知的。另外，即使dataLength在上面的代碼中是32的倍數，在最終的應用程序中也無法保證這一點。

當我將上面的代碼與ArrayFire進行比較時，我的代碼需要大約三倍的時間才能執行。

有沒有人有關於如何加快速度的任何想法？

編輯：已將所有filterLength更改爲numeratorLength。

來源

2013-04-06 Elfendahl

'numeratorLength'和'filterLength'是一樣的嗎？在您發佈的內容中，我沒有看到「numeratorLength」的定義。這個問題本質上是一個一維模板（stencil）問題。對模板問題的標準優化是：把一部分輸入數據放入共享內存，數量足以讓該塊的線程計算出各自的輸出，然後讓這些線程基於共享內存中的副本進行計算。 – 2013-04-06 19:09:03

如果你最終打敗ArrayFire，請告訴我們！如果沒有，你總是可以自由使用ArrayFire，因爲它更快:) – arrayfire 2013-04-07 03:06:18

@RobertCrovella是的，numeratorLength與filterLength相同。我決定改名，但顯然漏改了幾個地方。是我的錯，抱歉。我已修改原始帖子，現在只使用numeratorLength。感謝您關於使用共享內存的建議。我讀到過共享內存比全局內存快得多，但我不太確定如何最好地實現這一點，因爲共享內存的大小有限，而濾波器的長度可能會很長。我會試一試，看看效果如何 – Elfendahl 2013-04-08 03:44:28

我可以建議如下，以加快代碼：

使用共享內存：這是一種容量很小、類似緩存的內存，但比顯卡的全局內存快得多。您可以在CUDA文檔中查找__shared__關鍵字來瞭解更多信息。以您的示例來說，可以把濾波器分子係數和一大塊數據預先取到共享內存中，這將顯著提高性能。在這種情況下，您需要額外注意數據對齊，因爲它確實很重要，對齊不當會拖慢您的代碼。
考慮展開分子總和的for循環。您可以檢查CUDA 文檔中的reduce-vector示例。
您也可以考慮把分子循環本身並行化。這可以通過向線程塊添加一個額外的維度（比如'y'）來完成。這樣您需要把sum也改成一個向量，其維數爲numeratorLength。您還可以參考reduce矢量示例，瞭解如何在最後快速求出這個向量的總和。

來源

2013-04-07 00:12:42 Bichoy

感謝您的建議！我會着眼於使用共享內存，而矢量縮減示例看起來非常有趣。但是我特別喜歡你的第三個建議，因爲當前代碼中的for循環感覺像濾波器長度增加時的潛在瓶頸。 – Elfendahl 2013-04-08 03:49:32

您試圖通過CUDA內核直接評估一維卷積來計算濾波器輸出。

當濾波器的脈衝響應持續時間較長時，計算濾波輸出的一種辦法是使用FFT直接在共軛域（頻域）中執行計算。下面我給出一個使用CUDA Thrust和cuFFT庫的示例代碼。它是

以下鏈接中基於Matlab的示例的直接翻譯：

Low-Pass Filtering by FFT Convolution

需要說明的是，這段代碼還可以做一些優化，但我寧願保持原樣，這樣更容易與對應的Matlab代碼進行比較。

#include <stdio.h> 
#include <math.h> 

#include <cufft.h> 

#include <thrust\device_vector.h> 
#include <thrust\sequence.h> 

#define pi_f 3.14159265358979f     // Greek pi in single precision 

/****************/ 
/* SIN OPERATOR */ 
/****************/ 
// Unary functor mapping a sample index x to sin(2*pi*x*fk/Fs), where fk and
// Fs are the two values given to the constructor.
class sin_op {

    float fk_, Fs_;

    public:

     // Stores fk and Fs for later use by operator().
     sin_op(float fk, float Fs) : fk_(fk), Fs_(Fs) {}

     // Evaluates the sinusoid at x.
     __host__ __device__ float operator()(float x) const
     {
      const float phase = 2.f*pi_f*x*fk_/Fs_;
      return sin(phase);
     }
};

/*****************/ 
/* SINC OPERATOR */ 
/*****************/ 
// Unary functor evaluating (2*fc/Fs) * sin(a)/a with a = 2*pi*fc*x/Fs, where
// fc and Fs are the two values given to the constructor. At x == 0 it returns
// 2*fc/Fs.
class sinc_op {

    float fc_, Fs_;

    public:

     // Stores fc and Fs for later use by operator().
     sinc_op(float fc, float Fs) : fc_(fc), Fs_(Fs) {}

     // Evaluates the scaled sinc at x.
     __host__ __device__ float operator()(float x) const
     {
      const float gain = 2.f*fc_/Fs_;

      // sin(a)/a is 0/0 at the origin, so that point is handled separately
      if (x==0) return gain;

      const float arg = 2.f*pi_f*fc_*x/Fs_;
      return gain*sin(arg)/arg;
     }
};

/********************/ 
/* HAMMING OPERATOR */ 
/********************/ 
// Unary functor returning the Hamming window coefficient
// 0.54 - 0.46*cos(2*pi*x/(L-1)) for sample index x of a window of length L.
class hamming_op {

    int L_;

    public:

     // L is the window length used in the denominator (L - 1).
     hamming_op(int L) : L_(L) {}

     // Evaluates the window at sample index x.
     __host__ __device__ float operator()(int x) const
     {
      // A one-sample window would divide by (L_-1) == 0 and produce NaN;
      // return 1 instead (the value MATLAB's hamming(1) gives).
      if (L_ == 1) return 1.0f;

      // Float literals and cosf keep the whole expression in single
      // precision; the former 0.54/0.46/cos promoted it to double.
      return 0.54f-0.46f*cosf(2.f*pi_f*x/(L_-1));
     }
};


/*********************************/ 
/* MULTIPLY CUFFTCOMPLEX NUMBERS */ 
/*********************************/ 
// Binary device functor returning the complex product a*b, with the real part
// in .x and the imaginary part in .y.
struct multiply_cufftComplex {
    __device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
     cufftComplex product;
     // (a.x + i*a.y) * (b.x + i*b.y)
     product.x = a.x * b.x - a.y * b.y;   // real part
     product.y = a.x * b.y + a.y * b.x;   // imaginary part
     return product;
    }
};

/********/ 
/* MAIN */ 
/********/ 
// Reports a failed cuFFT call on stderr; returns true when the call succeeded.
static bool cufftOk(cufftResult status, const char *what) {
    if (status == CUFFT_SUCCESS) return true;
    fprintf(stderr, "%s failed with cufftResult %d\n", what, (int)status);
    return false;
}

// Low-pass filters a sum of sinusoids by FFT convolution:
//   1. build the test signal and a Hamming-windowed sinc filter on the device,
//   2. zero-pad both to a power-of-two length Nfft >= L+M-1,
//   3. multiply their spectra and transform back.
// Returns 0 on success and 1 if a cuFFT call fails.
int main(){

    // Signal parameters:
    int M = 256;       // signal length
    const int N = 4;
    float f[N] = { 440, 880, 1000, 2000 };    // frequencies
    float Fs = 5000.f;      // sampling rate

    // Generate a signal by adding up sinusoids:
    thrust::device_vector<float> d_x(M,0.f);   // pre-allocate 'accumulator'
    thrust::device_vector<float> d_n(M);    // discrete-time grid
    thrust::sequence(d_n.begin(), d_n.end(), 0, 1);

    thrust::device_vector<float> d_temp(M);
    for (int i=0; i<N; i++) {
     float fk = f[i];
     thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
     thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>());
    }

    // Filter parameters:
    int L = 257;      // filter length
    float fc = 600.f;     // cutoff frequency

    // Design the filter using the window method:
    thrust::device_vector<float> d_hsupp(L);
    thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
    thrust::device_vector<float> d_hideal(L);
    thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
    thrust::device_vector<float> d_l(L);
    thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
    thrust::device_vector<float> d_h(L);
    thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
    // h is our filter
    thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());

    // --- Choose the smallest power of 2 not less than L+M-1 (2^nextpow2(L+M-1)).
    //     Integer arithmetic avoids the rounding risk of float pow/log2.
    int Nfft = 1;
    while (Nfft < L+M-1) Nfft <<= 1;

    // Zero pad the signal and impulse response:
    thrust::device_vector<float> d_xzp(Nfft,0.f);
    thrust::device_vector<float> d_hzp(Nfft,0.f);
    thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
    thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());

    // Transform the signal and the filter:
    cufftHandle forwardPlan;
    if (!cufftOk(cufftPlan1d(&forwardPlan, Nfft, CUFFT_R2C, 1), "cufftPlan1d (R2C)")) return 1;
    thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
    thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
    if (!cufftOk(cufftExecR2C(forwardPlan, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data())), "cufftExecR2C (signal)")) return 1;
    if (!cufftOk(cufftExecR2C(forwardPlan, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data())), "cufftExecR2C (filter)")) return 1;

    // Pointwise product of the spectra == circular convolution in time
    thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
    thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());

    // A separate handle is used for the inverse transform; reusing the forward
    // handle would overwrite it and leak the forward plan.
    cufftHandle inversePlan;
    if (!cufftOk(cufftPlan1d(&inversePlan, Nfft, CUFFT_C2R, 1), "cufftPlan1d (C2R)")) return 1;
    thrust::device_vector<float> d_y(Nfft);
    if (!cufftOk(cufftExecC2R(inversePlan, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data())), "cufftExecC2R")) return 1;

    // cuFFT transforms are unnormalized: a forward transform followed by an
    // inverse one scales the data by Nfft. Divide it out so that d_y matches
    // the ifft(X.*H) of the Matlab reference.
    thrust::transform(d_y.begin(), d_y.end(), d_y.begin(), thrust::placeholders::_1 * (1.0f / Nfft));

    // Release the cuFFT plans
    cufftDestroy(forwardPlan);
    cufftDestroy(inversePlan);

    getchar();

    return 0;
}

來源

2014-05-19 15:33:00 JackOLantern

我的另一個回答預計更適合持續時間較長的卷積核；除此之外，下面我給出一個不同的實現，它與OP最初的嘗試更爲接近，預計更適合持續時間較短的卷積核。這種實現基於一個手寫內核，利用了共享內存中的緩存。更多細節可以在D.B. Kirk和W.-m. W. Hwu的書中找到：

Programming Massively Parallel Processors, Second Edition: A Hands-on Approach

#include <stdio.h> 
#include <stdlib.h> 

#include "TimingGPU.cuh" 
#include "Utilities.cuh" 

#define RG   10 
#define BLOCKSIZE 8 

/****************/ 
/* CPU FUNCTION */ 
/****************/ 
// CPU reference for the 1D convolution. For every output index i it sums
// h_Signal[i - K/2 + j] * h_ConvKernel[j] over j in [0, K), skipping taps
// whose signal index falls outside [0, N).
//   h_Signal      input signal, N elements
//   h_ConvKernel  convolution kernel, K elements
//   h_Result_CPU  output, N elements
void h_convolution_1D(const float * __restrict__ h_Signal, const float * __restrict__ h_ConvKernel, float * __restrict__ h_Result_CPU,
         const int N, const int K) {

    const int halfWidth = K / 2;

    for (int outIdx = 0; outIdx < N; ++outIdx) {

     // Signal index that lines up with kernel tap 0
     const int first = outIdx - halfWidth;

     // Clip the tap range so that first + tap always stays inside [0, N)
     const int tapBegin = (first < 0) ? -first : 0;
     const int tapEnd   = (first + K > N) ? (N - first) : K;

     float acc = 0.f;
     for (int tap = tapBegin; tap < tapEnd; ++tap) {
      acc += h_Signal[first + tap] * h_ConvKernel[tap];
     }

     h_Result_CPU[outIdx] = acc;
    }
}

/********************/ 
/* BASIC GPU KERNEL */ 
/********************/ 
// Basic GPU 1D convolution: each thread computes one output element straight
// from global memory, skipping taps whose signal index falls outside [0, N).
//
// Launch layout: 1D grid of 1D blocks with at least N threads in total;
// threads with a global index >= N do nothing.
//   d_Signal      input signal, N elements
//   d_ConvKernel  convolution kernel, K elements
//   d_Result_GPU  output, N elements
__global__ void d_convolution_1D_basic(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU,
             const int N, const int K) {

    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so surplus threads exist whenever
    // N is not a multiple of the block size; they used to write past the end
    // of d_Result_GPU.
    if (i >= N) return;

    float temp = 0.f;

    // Signal index that lines up with kernel tap 0
    const int N_start_point = i - (K/2);

    for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
     temp += d_Signal[N_start_point+ j] * d_ConvKernel[j];
    }

    d_Result_GPU[i] = temp;
}

/***************************/ 
/* GPU KERNEL WITH CACHING */ 
/***************************/ 
// GPU 1D convolution with shared-memory caching: each block first stages its
// own blockDim.x signal samples in shared memory; taps that fall inside that
// tile are read from shared memory, the remaining (halo) taps from global
// memory.
//
// Launch layout: 1D grid of 1D blocks with at least N threads in total.
// Precondition: blockDim.x <= BLOCKSIZE, because the tile holds BLOCKSIZE floats.
//   d_Signal      input signal, N elements
//   d_ConvKernel  convolution kernel, K elements
//   d_Result_GPU  output, N elements
__global__ void d_convolution_1D_caching(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU,
             const int N, const int K) {

    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ float d_Tile[BLOCKSIZE];

    // Surplus threads of the last block must not read past the end of
    // d_Signal. They store a zero that is never consumed, because the loop
    // below only touches signal indices < N.
    d_Tile[threadIdx.x] = (i < N) ? d_Signal[i] : 0.f;

    // Every thread of the block has to reach the barrier, so the early exit
    // for surplus threads comes only after it.
    __syncthreads();

    if (i >= N) return;

    float temp = 0.f;

    // Signal index that lines up with kernel tap 0
    const int N_start_point = i - (K/2);

    // Range of signal indices [tileBegin, tileEnd) cached by this block
    const int tileBegin = blockIdx.x * blockDim.x;
    const int tileEnd   = tileBegin + blockDim.x;

    for (int j = 0; j < K; j++) {

      const int idx = N_start_point + j;

      if (idx < 0 || idx >= N) continue;

      if (idx >= tileBegin && idx < tileEnd)

       // --- The signal element is in the tile loaded in the shared memory
       temp += d_Tile[idx - tileBegin] * d_ConvKernel[j];

      else

       // --- The signal element is not in the tile loaded in the shared memory
       temp += d_Signal[idx] * d_ConvKernel[j];

    }

    d_Result_GPU[i] = temp;
}

/********/ 
/* MAIN */ 
/********/ 
// Test driver: fills a random signal and kernel with small integer values,
// computes the CPU reference and checks that both GPU kernels reproduce it.
// Returns 0 when both tests pass and 1 on the first mismatch or on a host
// allocation failure.
int main(){

    const int N = 15;   // --- Signal length
    const int K = 5;   // --- Convolution kernel length

    float *h_Signal   = (float *)malloc(N * sizeof(float));
    float *h_Result_CPU  = (float *)malloc(N * sizeof(float));
    float *h_Result_GPU  = (float *)malloc(N * sizeof(float));
    float *h_ConvKernel  = (float *)malloc(K * sizeof(float));

    if (!h_Signal || !h_Result_CPU || !h_Result_GPU || !h_ConvKernel) {
     fprintf(stderr, "host allocation failed\n");
     free(h_Signal); free(h_Result_CPU); free(h_Result_GPU); free(h_ConvKernel);
     return 1;
    }

    float *d_Signal;  gpuErrchk(cudaMalloc(&d_Signal,  N * sizeof(float)));
    float *d_Result_GPU; gpuErrchk(cudaMalloc(&d_Result_GPU, N * sizeof(float)));
    float *d_ConvKernel; gpuErrchk(cudaMalloc(&d_ConvKernel, K * sizeof(float)));

    // Values are small integers (0 .. RG-1), so the exact float comparison
    // against the CPU reference below is meaningful.
    for (int i=0; i < N; i++) { h_Signal[i] = (float)(rand() % RG); }

    for (int i=0; i < K; i++) { h_ConvKernel[i] = (float)(rand() % RG); }

    gpuErrchk(cudaMemcpy(d_Signal,  h_Signal,  N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ConvKernel, h_ConvKernel, K * sizeof(float), cudaMemcpyHostToDevice));

    h_convolution_1D(h_Signal, h_ConvKernel, h_Result_CPU, N, K);

    int status = 0;   // becomes 1 on the first mismatch

    // --- Basic kernel
    d_convolution_1D_basic<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {
     // The results are floats: %f, not %d (which is undefined behaviour here)
     printf("mismatch2 at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]);
     status = 1;
     break;
    }

    if (status == 0) {

     printf("Test basic passed\n");

     // --- Kernel with shared-memory caching
     d_convolution_1D_caching<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
     gpuErrchk(cudaPeekAtLastError());
     gpuErrchk(cudaDeviceSynchronize());

     gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

     for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {
      printf("mismatch2 at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]);
      status = 1;
      break;
     }

     if (status == 0) printf("Test caching passed\n");
    }

    // Release device and host buffers on every path
    gpuErrchk(cudaFree(d_Signal));
    gpuErrchk(cudaFree(d_Result_GPU));
    gpuErrchk(cudaFree(d_ConvKernel));

    free(h_Signal);
    free(h_Result_CPU);
    free(h_Result_GPU);
    free(h_ConvKernel);

    return status;
}

來源

2015-08-25 13:24:24 JackOLantern

CUDA中的FIR濾波器（作爲1D卷積）

回答

相關問題