OpenCL：通過使用globalid（。）訪問正確的索引

嗨，
我在OpenCL編碼。
我正在轉換一個「C函數」具有從i = 1和j = 1 .PFB開始的二維數組。OpenCL：通過使用globalid（。）訪問正確的索引

cv::Mat input; //Input :having some data in it .. 
//Image input size is :input.rows=288 ,input.cols =640 
cv::Mat output(input.rows-2,input.cols-2,CV_32F); //Output buffer 
//Image output size is :output.rows=286 ,output.cols =638

這就是我想在OpenCL的修改代碼：

for(int i=1;i<output.rows-1;i++) 
{ 
    for(int j=1;j<output.cols-1;j++) 
    { 
     float xVal = input.at<uchar>(i-1,j-1)-input.at<uchar>(i-1,j+1)+ 2*(input.at<uchar>(i,j-1)-input.at<uchar>(i,j+1))+input.at<uchar>(i+1,j-1) - input.at<uchar>(i+1,j+1); 
     float yVal = input.at<uchar>(i-1,j-1) - input.at<uchar>(i+1,j-1)+ 2*(input.at<uchar>(i-1,j) - input.at<uchar>(i+1,j))+input.at<uchar>(i-1,j+1)-input.at<uchar>(i+1,j+1); 
     output.at<float>(i-1,j-1) = xVal*xVal+yVal*yVal; 
    } 
}

... 主機代碼：

//Input Image size is :input.rows=288 ,input.cols =640 
//Output Image size is :output.rows=286 ,output.cols =638 
OclStr->global_work_size[0] =(input.cols); 
OclStr->global_work_size[1] =(input.rows); 

size_t outBufSize = (output.rows) * (output.cols) * 4;//4 as I am copying all 4 uchar values into one float variable space 

    cl_mem cl_input_buffer = clCreateBuffer(
     OclStr->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , 
     (input.rows) * (input.cols), 
     static_cast<void *>(input.data), &OclStr->returnstatus); 

    cl_mem cl_output_buffer = clCreateBuffer(
     OclStr->context, CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR , 
     (output.rows) * (output.cols) * sizeof(float), 
     static_cast<void *>(output.data), &OclStr->returnstatus); 

OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 0, sizeof(cl_mem), (void *)&cl_input_buffer); 
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 1, sizeof(cl_mem), (void *)&cl_output_buffer); 

    OclStr->returnstatus = clEnqueueNDRangeKernel(
     OclStr->command_queue, 
     OclStr->objkernel, 
     2, 
     NULL, 
     OclStr->global_work_size, 
     NULL, 
     0, 
     NULL, 
     NULL 
     ); 
clEnqueueMapBuffer(OclStr->command_queue, cl_output_buffer, true, CL_MAP_READ, 0, outBufSize, 0, NULL, NULL, &OclStr->returnstatus);

內核代碼：

__kernel void Sobel_uchar (__global uchar *pSrc, __global float *pDstImage)    
{                      
const uint cols = get_global_id(0)+1;            
const uint rows = get_global_id(1)+1;            
const uint width= get_global_size(0);            
uchar Opsoble[8];                 
Opsoble[0] = pSrc[(cols-1)+((rows-1)*width)];       
Opsoble[1] = pSrc[(cols+1)+((rows-1)*width)];       
Opsoble[2] = pSrc[(cols-1)+((rows+0)*width)];       
Opsoble[3] = pSrc[(cols+1)+((rows+0)*width)];       
Opsoble[4] = pSrc[(cols-1)+((rows+1)*width)];       
Opsoble[5] = pSrc[(cols+1)+((rows+1)*width)];       
Opsoble[6] = pSrc[(cols+0)+((rows-1)*width)];       
Opsoble[7] = pSrc[(cols+0)+((rows+1)*width)];       
float gx = Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5]; 
float gy = Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5]; 
pDstImage[(cols-1)+(rows-1)*width] = gx*gx + gy*gy;         

    }

在這裏，我無法獲得按預期輸出。我有一些問題，

我的for循環開始的從i = 1而不是零，那麼我如何通過使用X的global_id（）和y方向
這是怎麼回事得到適當的指數錯在我上面的內核代碼:(

我懷疑有緩衝步幅問題，但無法進一步打破我的頭已經打破了它在整個一天:( 我觀察到下面的邏輯輸出在一些7/8幀序列之後跳過一個或兩個幀。我添加了我的輸出的屏幕截圖與參考輸出進行比較。 PFB

const uint width = get_global_size(0)+1;

您的建議是最歡迎的!!! - 我上面的邏輯是對我的輸入做局部sobelling。我改變了寬度 enter image description here

來源

2014-09-26 Ashwin

對於第一個問題只是改變'常量UINT I = get_global_id（0）;''到常量UINT I = get_global_id（0）+ 1;'，並且相應調整全局工作尺寸。 – maZZZu 2014-09-26 13:34:57

pDstImage [（i-1）* width +（j-1）] = gx * gx + gy * gy; shouıldnyt這是我，j而不是i-1，j-1 – 2014-09-26 16:15:34

根據建議更改我的代碼庫後，您可以查看我的輸出結果！ – Ashwin 2014-09-29 05:45:36

看起來你可能會在你的opencl版本中以（y，x）格式獲取值。此外，您需要將1添加到全局ID以複製從1開始而不是0的for循環。

我不知道爲什麼會有未使用的iOffset變量。也許你的bug與此有關？我在我的版本中刪除它。

這個內核對你更好嗎？

__kernel void simple(__global uchar *pSrc, __global float *pDstImage)    
{                      
    const uint i = get_global_id(0) +1;             
    const uint j = get_global_id(1) +1;             
    const uint width = get_global_size(0) +2;            

    uchar Opsoble[8];                 
    Opsoble[0] = pSrc[(i-1) + (j - 1)*width];           
    Opsoble[1] = pSrc[(i-1) + (j + 1)*width];           
    Opsoble[2] = pSrc[i + (j-1)*width];             
    Opsoble[3] = pSrc[i + (j+1)*width];             
    Opsoble[4] = pSrc[(i+1) + (j - 1)*width];           
    Opsoble[5] = pSrc[(i+1) + (j + 1)*width];           
    Opsoble[6] = pSrc[(i-1) + (j)*width];            
    Opsoble[7] = pSrc[(i+1) + (j)*width]; 
    float gx = Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5]; 
    float gy = Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5]; 
    pDstImage[(i-1) + (j-1)*width] = gx*gx + gy*gy ;          
}

來源

2014-09-26 14:52:02 mfa

嗨mfa，用你的技巧，我做了改變。但我沒有得到像參考的精確輸出。我chnaged寬度爲const uint width = get_global_size（0）+1; – Ashwin 2014-09-29 05:35:52

如果您在主機代碼中嘗試這樣，輸出是否更好？ OclStr-> global_work_size [0] =（output.rows）; OclStr-> global_work_size [1] =（output.cols）; – mfa 2014-09-29 13:44:20

沒有運氣...沒有得到像圖中所示的輸出（如for循環）..沒有運氣... :( – Ashwin 2014-09-29 14:23:08

我有點惶惑關於發佈一個答案建議優化你的內核，看到原來的輸出尚未完全復現，作爲尚未。對於與圖像處理/過濾相關的問題，可以做出重大改進。

使用本地內存將通過將全局讀取次數減少八分之一來幫助您解決問題，並將全局寫入組合在一起以獲得單次寫入每像素輸出的潛在增益。

下面的內核從pSrc中讀取一個高達34x34的塊，並輸出pDstImage的32x32（最大）區域。我希望代碼中的註釋足以指導您使用內核。我無法完成此測試，因此可能需要進行更改。任何意見，以及讚賞。

__kernel void sobel_uchar_wlocal (__global uchar *pSrc, __global float *pDstImage, __global uint2 dimDstImage) 
{ 
    //call this kernel 1-dimensional work group size: 32x1 
    //calculates 32x32 region of output with 32 work items 

    const uint wid = get_local_id(0); 
    const uint wid_1 = wid+1; // corrected for the calculation step 
    const uint2 gid = (uint2)(get_group_id(0),get_group_id(1)); 
    const uint localDim = get_local_size(0); 

    const uint2 globalTopLeft = (uint2)(localDim * gid.x, localDim * gid.y); //position in pSrc to copy from/to 

    //dimLocalBuff is used for the right and bottom edges of the image, where the work group may run over the border 
    const uint2 dimLocalBuff = (uint2)(localDim,localDim); 
    if(dimDstImage.x - globalTopLeft.x < dimLocalBuff.x){ 
     dimLocalBuff.x = dimDstImage.x - globalTopLeft.x; 
    } 
    if(dimDstImage.y - globalTopLeft.y < dimLocalBuff.y){ 
     dimLocalBuff.y = dimDstImage.y - globalTopLeft.y; 
    } 

    int i,j; 

    //save region of data into local memory 
    __local uchar srcBuff[34][34]; //34^2 uchar = 1156 bytes 
    for(j=-1;j<dimLocalBuff.y+1;j++){ 
     for(i=x-1;i<dimLocalBuff.x+1;i+=localDim){ 
      srcBuff[i+1][j+1] = pSrc[globalTopLeft.x+i][globalTopLeft.y+j]; 
     } 
    } 
    mem_fence(CLK_LOCAL_MEM_FENCE); 

    //compute output and store locally 
    __local float dstBuff[32][32]; //32^2 float = 4096 bytes 
    if(wid_1 < dimLocalBuff.x){ 
     for(i=0;i<dimLocalBuff.y;i++){ 
      float gx = srcBuff[(wid_1-1)+ (i - 1)]-srcBuff[(wid_1-1)+ (i + 1)]+2*(srcBuff[wid_1+ (i-1)]-srcBuff[wid_1+ (i+1)])+srcBuff[(wid_1+1)+ (i - 1)]-srcBuff[(wid_1+1)+ (i + 1)]; 
      float gy = srcBuff[(wid_1-1)+ (i - 1)]-srcBuff[(wid_1+1)+ (i - 1)]+2*(srcBuff[(wid_1-1)+ (i)]-srcBuff[(wid_1+1)+ (i)])+srcBuff[(wid_1-1)+ (i + 1)]-srcBuff[(wid_1+1)+ (i + 1)]; 
      dstBuff[wid][i] = gx*gx + gy*gy; 
     } 
    } 
    mem_fence(CLK_LOCAL_MEM_FENCE); 

    //copy results to output 
    for(j=0;j<dimLocalBuff.y;j++){ 
     for(i=0;i<dimLocalBuff.x;i+=localDim){ 
      srcBuff[i][j] = pSrc[globalTopLeft.x+i][globalTopLeft.y+j]; 
     } 
    } 
}

來源

2014-10-01 12:38:52 mfa

我會跳過最後一步，直接寫入全局內存。這樣，全局的寫入延遲可以與gx gy操作交錯。通過將其保存到本地，此時可能會更快，但編譯器無法理解數據未被其他線程修改，因此一直等到圍欄將所有值轉儲到全局。但是，不應該有很大的速度影響。 – DarkZeros 2014-10-02 16:09:22

可能值得嘗試這兩種方法。畢竟，延遲寫作可能只有微小的好處。 – mfa 2014-10-02 16:17:50

OpenCL：通過使用globalid（。）訪問正確的索引

回答

相關問題