2013-04-10 76 views
4

目前,我有如下就像穿越在OpenCL內核。如果有人對這個相當大的內核進行優化,我會很高興。的OpenCL內核遍歷 - 進一步優化

問題是,我正在SAH BVH上運行此代碼,並且我希望在他的論文(瞭解GPU上的光線遍歷效率)中使用遍歷的方式獲得與Timo Aila類似的性能,當然他的代碼使用SplitBVH(我可能會考慮使用它代替SAH BVH,但在我看來它的構建時間確實很慢)。但我問的是遍歷,而不是BVH(我迄今爲止只適用於場景,SplitBVH不會給你帶來比SAH BVH更多的優勢)。

首先,這裏是我迄今爲止(標準的同時,同時遍歷內核)。

__constant sampler_t sampler = CLK_FILTER_NEAREST; 

// Inline definition of horizontal max 
inline float max4(float a, float b, float c, float d) 
{ 
    return max(max(max(a, b), c), d); 
} 

// Inline definition of horizontal min 
inline float min4(float a, float b, float c, float d) 
{ 
    return min(min(min(a, b), c), d); 
} 

// Traversal kernel 
__kernel void traverse(__read_only image2d_t nodes, 
         __global const float4* triangles, 
         __global const float4* rays, 
         __global float4* result, 
         const int num, 
         const int w, 
         const int h) 
{ 
    // Ray index 
    int idx = get_global_id(0); 

    if(idx < num) 
    { 
     // Stack 
     int todo[32]; 
     int todoOffset = 0; 

     // Current node 
     int nodeNum = 0; 

     float tmin = 0.0f; 
     float depth = 2e30f; 

     // Fetch ray origin, direction and compute invdirection 
     float4 origin = rays[2 * idx + 0]; 
     float4 direction = rays[2 * idx + 1]; 
     float4 invdir = native_recip(direction); 

     float4 temp = (float4)(0.0f, 0.0f, 0.0f, 1.0f); 

     // Traversal loop 
     while(true) 
     { 
      // Fetch node information 
      int2 nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2)/w); 
      int4 specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0)); 

      // While node isn't leaf 
      while(specs.z == 0) 
      { 
       // Fetch child bounding boxes 
       float4 n0xy = read_imagef(nodes, sampler, nodeCoord); 
       float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0)); 
       float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0)); 

       // Test ray against child bounding boxes 
       float oodx = origin.x * invdir.x; 
       float oody = origin.y * invdir.y; 
       float oodz = origin.z * invdir.z; 
       float c0lox = n0xy.x * invdir.x - oodx; 
       float c0hix = n0xy.y * invdir.x - oodx; 
       float c0loy = n0xy.z * invdir.y - oody; 
       float c0hiy = n0xy.w * invdir.y - oody; 
       float c0loz = nz.x * invdir.z - oodz; 
       float c0hiz = nz.y * invdir.z - oodz; 
       float c1loz = nz.z * invdir.z - oodz; 
       float c1hiz = nz.w * invdir.z - oodz; 
       float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), tmin); 
       float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), depth); 
       float c1lox = n1xy.x * invdir.x - oodx; 
       float c1hix = n1xy.y * invdir.x - oodx; 
       float c1loy = n1xy.z * invdir.y - oody; 
       float c1hiy = n1xy.w * invdir.y - oody; 
       float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), tmin); 
       float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), depth); 

       bool traverseChild0 = (c0max >= c0min); 
       bool traverseChild1 = (c1max >= c1min); 

       nodeNum = specs.x; 
       int nodeAbove = specs.y; 

       // We hit just one out of 2 childs 
       if(traverseChild0 != traverseChild1) 
       { 
        if(traverseChild1) 
        { 
         nodeNum = nodeAbove; 
        } 
       } 
       // We hit either both or none 
       else 
       { 
        // If we hit none, pop node from stack (or exit traversal, if stack is empty) 
        if (!traverseChild0) 
        { 
         if(todoOffset == 0) 
         { 
          break; 
         } 
         nodeNum = todo[--todoOffset]; 
        } 
        // If we hit both 
        else 
        { 
         // Sort them (so nearest goes 1st, further 2nd) 
         if(c1min < c0min) 
         { 
          unsigned int tmp = nodeNum; 
          nodeNum = nodeAbove; 
          nodeAbove = tmp; 
         } 

         // Push further on stack 
         todo[todoOffset++] = nodeAbove; 
        } 
       } 

       // Fetch next node information 
       nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2)/w); 
       specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0)); 
      } 

      // If node is leaf & has some primitives 
      if(specs.z > 0) 
      { 
       // Loop through primitives & perform intersection with them (Woop triangles) 
       for(int i = specs.x; i < specs.y; i++) 
       { 
        // Fetch first point from global memory 
        float4 v0 = triangles[i * 4 + 0]; 

        float o_z = v0.w - origin.x * v0.x - origin.y * v0.y - origin.z * v0.z; 
        float i_z = 1.0f/(direction.x * v0.x + direction.y * v0.y + direction.z * v0.z); 
        float t = o_z * i_z; 

        if(t > 0.0f && t < depth) 
        { 
         // Fetch second point from global memory 
         float4 v1 = triangles[i * 4 + 1]; 

         float o_x = v1.w + origin.x * v1.x + origin.y * v1.y + origin.z * v1.z; 
         float d_x = direction.x * v1.x + direction.y * v1.y + direction.z * v1.z; 
         float u = o_x + t * d_x; 

         if(u >= 0.0f && u <= 1.0f) 
         { 
          // Fetch third point from global memory 
          float4 v2 = triangles[i * 4 + 2]; 

          float o_y = v2.w + origin.x * v2.x + origin.y * v2.y + origin.z * v2.z; 
          float d_y = direction.x * v2.x + direction.y * v2.y + direction.z * v2.z; 
          float v = o_y + t * d_y; 

          if(v >= 0.0f && u + v <= 1.0f) 
          { 
           // We got successful hit, store the information 
           depth = t; 
           temp.x = u; 
           temp.y = v; 
           temp.z = t; 
           temp.w = as_float(i); 
          } 
         } 
        } 
       } 
      } 

      // Pop node from stack (if empty, finish traversal) 
      if(todoOffset == 0) 
      { 
       break; 
      } 

      nodeNum = todo[--todoOffset]; 
     } 

     // Store the ray traversal result in global memory 
     result[idx] = temp; 
    } 
} 

當天的第一個問題是,怎麼能寫在他的OpenCL而持久,而和投機性,同時,而內核?

廣告持續而-而,我得到它的權利,我其實剛開始與全球等同於當地的工作尺寸工作尺寸的內核,而這兩個數字應該等於以翹曲的GPU /波陣面尺寸? 我得到與CUDA持久線程實現看起來是這樣的:

do 
    { 
     volatile int& jobIndexBase = nextJobArray[threadIndex.y]; 

     if(threadIndex.x == 0) 
     { 
       jobIndexBase = atomicAdd(&warpCounter, WARP_SIZE); 
     } 

     index = jobIndexBase + threadIndex.x; 

     if(index >= totalJobs) 
       return; 

     /* Perform work for task numbered 'index' */ 
    } 
    while(true); 

怎麼可能相當於OpenCL的樣子,我知道我必須做在那裏的一些障礙,我也知道,一個後應我原子地將WARP_SIZE添加到warpCounter的分數。

廣告投機穿越 - 好,我可能沒有任何想法如何這應該在OpenCL的實施,所以任何提示的歡迎。我也不知道在哪裏放置障礙物(因爲將它們放在模擬__中會導致駕駛員碰撞)。

如果你在這裏做到了,感謝閱讀&任何提示,答案等,歡迎!

回答

1

你可以做一個優化是使用矢量變量和融合乘法相加功能,以加快建立數學。至於內核的其他部分,它很慢,因爲它很分散。如果您可以對信號數據進行假設,則可以通過減少代碼分支來縮短執行時間。我沒有檢查float4 swizles(float 4變量後面的.xxyy和.x .y .z .w),所以檢查一下。

  float4 n0xy = read_imagef(nodes, sampler, nodeCoord); 
      float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0)); 
      float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0)); 

      float4 oodf4 = -origin * invdir; 

      float4 c0xyf4 = fma(n0xy,invdir.xxyy,oodf4); 

      float4 c0zc1z = fma(nz,(float4)(invdir.z),oodf4); 

      float c0min = max4(min(c0xyf4.x, c0xyf4.y), min(c0xyf4.z, c0xyf4.w), min(c0zc1z.z, c0zc1z.w), tmin); 
      float c0max = min4(max(c0xyf4.x, c0xyf4.y), max(c0xyf4.z, c0xyf4.w), max(c0zc1z.z, c0zc1z.w), depth); 

      float4 c1xy = fma(n1xy,invdir.xxyy,oodf4); 

      float c1min = max4(min(c1xy.x, c1xy.y), min(c1xy.z, c1xy.w), min(c0zc1z.z, c0zc1z.w), tmin); 
      float c1max = min4(max(c1xy.x, c1xy.y), max(c1xy.z, c1xy.w), max(c0zc1z.z, c0zc1z.w), depth);