使用Cuda並行實現計算陣列中連續子序列的總和

-1

可以考慮以下數組：tab = [80,12,14,5,70,9,26,30,8,12,16,15]。我想使用CUDA計算所有長度爲4的連續子序列的總和，例如：

S1=80+12+14+5=111 
S2=12+14+5+70 =101 
S3=14+5+70+9 =98 
....

您有沒有使用CUDA高效並行化（parallelize）此任務的想法？前面的數組只是一個例子，在我的實際情況中，我將使用一個非常巨大的數組。

來源

2017-02-10 alae

float4向量將其元素左移（元素）1，然後將最新元素分配給下一個數組元素，然後將它的點積寫入S元素。或者，爲變量添加最新的元素，從該變量中減去最舊的元素，然後將其寫入S元素？但是這是針對單線程的。對於多線程，它可能需要本地數組而不是全局數組。 –

我們可以使用thrust在單個操作（thrust::transform）中完成此任務。在CUDA中，這可以被認爲是相當簡單的一維模板（1-D stencil）操作。

關於1-D模板（stencil）操作的很好描述，可以在這裏（here）的幻燈片49-58上找到。

這實際上是一個簡化的情況，因爲模板寬度是4並且它只位於中心點的一側。

這裏是一個比較這兩種方法（thrust與手寫CUDA核函數）的可運行示例：

$ cat t88.cu 
#include <thrust/device_vector.h> 
#include <thrust/transform.h> 
#include <thrust/iterator/zip_iterator.h> 
#include <thrust/copy.h> 
#include <iostream> 

const int nTPB=256; 
typedef float mytype; 
const int ds = 1048576*32; 

// Functor for thrust::transform over a 4-way zip iterator: collapses one
// 4-element tuple into the sum of its components, returned as mytype.
struct sum4
{
    template <typename T>
    __host__ __device__
    mytype operator()(const T t){
        // Accumulate strictly left to right (element 0 first, element 3 last)
        // so the floating-point rounding order is explicit.
        return ((thrust::get<0>(t)
               + thrust::get<1>(t))
               + thrust::get<2>(t))
               + thrust::get<3>(t);
    }
};

/**
 * Sliding-window sum of 4 consecutive elements (a one-sided 1-D stencil),
 * staged through shared memory.
 *
 * For every i in [0, dsize-4] computes
 *     out[i] = in[i] + in[i+1] + in[i+2] + in[i+3]
 * i.e. dsize-3 results in total. Nothing is written when dsize < 4.
 *
 * Launch requirements:
 *   - 1-D grid of 1-D blocks covering at least dsize threads;
 *   - blockDim.x must not exceed nTPB (the shared tile holds nTPB+3 elements).
 *
 * @param in     input array of dsize elements (device memory)
 * @param out    output array with room for at least dsize-3 elements (device memory)
 * @param dsize  number of input elements
 */
template <typename T>
__global__ void sum4kernel(const T * __restrict__ in, T * __restrict__ out, const unsigned dsize)
{

    // One slot per thread plus a 3-element halo holding the first elements of
    // the next block's range, so the last threads of a block can complete
    // their window without touching global memory again.
    __shared__ T sdata[nTPB+3];
    const unsigned tid = threadIdx.x;
    const unsigned idx = tid+blockDim.x*blockIdx.x;
    if (idx < dsize) sdata[tid] = in[idx];
    // The first three threads of the block additionally fetch the halo.
    if ((tid < 3) && ((idx+blockDim.x) < dsize)) sdata[tid + blockDim.x] = in[idx + blockDim.x];
    // Barrier is reached by every thread: all loads above are merely predicated.
    __syncthreads();
    // A window starting at idx is complete only if idx+3 is still inside the
    // input. Written as "dsize >= 4 && idx <= dsize-4" so the unsigned
    // subtraction cannot wrap around for tiny inputs.
    if ((dsize >= 4) && (idx <= dsize - 4)) {
        T temp = sdata[tid];
        temp += sdata[tid+1];
        temp += sdata[tid+2];
        temp += sdata[tid+3];
        out[idx] = temp;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Checks that every entry of res equals 4 (the expected window sum for an
// all-ones input). Prints the first mismatch, prefixed by label, and returns false.
static bool validateAllFours(const thrust::host_vector<mytype> &res, const char *label)
{
    for (size_t i = 0; i < res.size(); i++)
        if (res[i] != 4) {std::cout << label << " validation failure: " << i << "," << res[i] << std::endl; return false;}
    return true;
}

/**
 * Demo / micro-benchmark comparing two ways of computing width-4 sliding sums:
 * thrust::transform over a zip of four shifted iterators, and the hand-written
 * shared-memory kernel. Runs a small printed example first, then times both
 * approaches on ds elements and validates the results.
 *
 * Returns 0 on success, 1 on any CUDA error or validation failure.
 */
int main(){

    // ---- small example: 12 inputs -> 12-3 = 9 window sums ----
    mytype hdata1[] = {80,12,14,5,70,9,26,30,8,12,16,15};
    const unsigned ds1 = sizeof(hdata1)/sizeof(hdata1[0]);
    const unsigned nres1 = ds1-3;   // a window of 4 over n elements yields n-3 sums
    thrust::device_vector<mytype> ddata1(hdata1, hdata1+ds1);
    thrust::device_vector<mytype> dres1(nres1);
    // The zipped range [begin, end-3) has exactly nres1 tuples, matching dres1.
    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(ddata1.begin(), ddata1.begin()+1, ddata1.begin()+2, ddata1.begin()+3)), thrust::make_zip_iterator(thrust::make_tuple(ddata1.end()-3, ddata1.end()-2, ddata1.end()-1, ddata1.end())), dres1.begin(), sum4());
    thrust::copy(dres1.begin(), dres1.end(), std::ostream_iterator<mytype>(std::cout, ","));
    std::cout << std::endl;
    // Clear the result buffer so the kernel's output is not masked by the
    // values thrust just wrote (all-zero bytes are 0.0f).
    if (!cudaOk(cudaMemset(thrust::raw_pointer_cast(dres1.data()), 0, nres1*sizeof(mytype)), "cudaMemset")) return 1;
    sum4kernel<<<(ds1+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(ddata1.data()), thrust::raw_pointer_cast(dres1.data()), ds1);
    if (!cudaOk(cudaGetLastError(), "sum4kernel launch")) return 1;
    thrust::host_vector<mytype> hres1(nres1);
    // Blocking copy: also waits for the kernel to finish.
    if (!cudaOk(cudaMemcpy(thrust::raw_pointer_cast(hres1.data()), thrust::raw_pointer_cast(dres1.data()), nres1*sizeof(mytype), cudaMemcpyDeviceToHost), "cudaMemcpy")) return 1;
    for (unsigned i = 0; i < nres1; i++)
        std::cout << hres1[i] << ",";
    std::cout << std::endl;

    // ---- large timed run on an all-ones input: every window sum must be 4 ----
    const int nres2 = ds-3;
    thrust::device_vector<mytype> ddata2(ds, 1);
    thrust::device_vector<mytype> dres2(nres2);

    cudaEvent_t start, stop;
    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate")) return 1;
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate")) return 1;

    cudaEventRecord(start);
    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(ddata2.begin(), ddata2.begin()+1, ddata2.begin()+2, ddata2.begin()+3)), thrust::make_zip_iterator(thrust::make_tuple(ddata2.end()-3, ddata2.end()-2, ddata2.end()-1, ddata2.end())), dres2.begin(), sum4());
    cudaEventRecord(stop);
    // The stop event must have completed before its timestamp can be read.
    if (!cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize")) return 1;
    thrust::host_vector<mytype> hres2 = dres2;
    float et;
    if (!cudaOk(cudaEventElapsedTime(&et, start, stop), "cudaEventElapsedTime")) return 1;
    std::cout << "thrust time: " << et << "ms" << std::endl;
// validate
    if (!validateAllFours(hres2, "thrust")) return 1;

    // Wipe the thrust results (outside the timed region) so the second
    // validation really tests what the kernel wrote.
    if (!cudaOk(cudaMemset(thrust::raw_pointer_cast(dres2.data()), 0, nres2*sizeof(mytype)), "cudaMemset")) return 1;
    cudaEventRecord(start);
    sum4kernel<<<(ds+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(ddata2.data()), thrust::raw_pointer_cast(dres2.data()), ds);
    cudaEventRecord(stop);
    if (!cudaOk(cudaGetLastError(), "sum4kernel launch")) return 1;
    if (!cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize")) return 1;
    if (!cudaOk(cudaMemcpy(thrust::raw_pointer_cast(hres2.data()), thrust::raw_pointer_cast(dres2.data()), nres2*sizeof(mytype), cudaMemcpyDeviceToHost), "cudaMemcpy")) return 1;
    if (!cudaOk(cudaEventElapsedTime(&et, start, stop), "cudaEventElapsedTime")) return 1;
    std::cout << "cuda time: " << et << "ms" << std::endl;
    if (!validateAllFours(hres2, "cuda")) return 1;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}


$ nvcc -arch=sm_61 -o t88 t88.cu 
$ ./t88 
111,101,98,110,135,73,76,66, 
111,101,98,110,135,73,76,66, 
thrust time: 0.902464ms 
cuda time: 0.76288ms 
$

對於這個特定的GPU（Titan X Pascal）和32M元素的數據集，thrust時間與CUDA時間之間沒有太大的差別（約15％）。我們預期這個算法是受內存帶寬限制的（memory-bound）。

對於這塊Pascal Titan X，bandwidthTest報告的可測量內存帶寬約爲345 GB/s。

CUDA實現必須加載整個數據集，並存儲（大約）整個數據集大小的結果，即每個元素2次內存操作，因此該CUDA代碼所達到的帶寬計算如下：

(32*1048576 elements * 2 ops/element * 4 bytes/op)/0.00076288 s = ~350GB/s

所以看起來CUDA實現大致達到了最大可用帶寬。

來源

2017-02-10 16:43:53

您是否介意給cuda實現背後的想法提供更多解釋？@Robert Crovella – alae

我添加了一個鏈接，該鏈接爲使用CUDA中的共享內存的1-D模板示例提供了入門培訓幻燈片。 –

感謝您的明確回答 @Robert Crovella。當我閱讀NVIDIA關於1-D模板操作的資料時，我注意到每次迭代中計算出的總和不會被後續元素重用，對嗎？事實上，下一步我將處理大量數據，且半徑爲100，這樣計算代價會非常高，您怎麼看？ – alae

使用Cuda並行實現計算陣列中連續子序列的總和

回答

相關問題