2017-04-08 73 views
2

我正在優化pyOpenCL中的Mandelbrot渲染器,並希望將塊中的迭代拆分,以便更好地利用我的GPU。
最大迭代次數= 1000和2個「塊」的示例:
1.運行mandelbrot轉義算法以進行迭代0-500。
2.保存每一點所需要的迭代,其中迭代< 500和所有其他點與500迭代再次運行 - 1000未能在pyopencl中渲染mandelbrot

第一循環的工作原理是預期,但之後每塊會導致錯誤的結果。我真的想更具體,但我不知道真正的問題在哪裏(現在主演代碼> 2天)。
我懷疑當從內核複製舊的x,y(真實,虛構)部分時會出現問題,但我不知道如何調試它。
我得到了與我的GPU和CPU上運行相同的結果,所以我猜它沒有特定的GPU。

迭代= 2000和10塊實施例圖像:
enter image description here

這是幾乎僅在第一組塊(加上一些少數 「錯誤」 的像素)。
在一個組塊的所有完成(迭代= 200和1塊):
enter image description here

並與迭代= 2000和組塊= 1預期的結果:
enter image description here

import pyopencl as cl 
import numpy as np 
from PIL import Image 
from decimal import Decimal 

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False): 
    mf = cl.mem_flags 
    cl_queue = cl.CommandQueue(ctx) 
    # build program 
    code = """ 
    #if real_t == double 
     #pragma OPENCL EXTENSION cl_khr_fp64 : enable 
    #endif 
    kernel void mandel(
     __global real_t *coords, 
     __global uint *output, 
     __global real_t *output_coord, 
     const uint max_iter, 
     const uint start_iter  
    ){ 
     uint id = get_global_id(0);   
     real_t2 my_coords = vload2(id, coords);   
     real_t x = my_coords.x; 
     real_t y = my_coords.y; 
     uint iter = 0; 
     for(iter=start_iter; iter<max_iter; ++iter){ 
      if(x*x + y*y > 4.0f){ 
       break; 
      } 
      real_t xtemp = x*x - y*y + my_coords.x; 
      y = 2*x*y + my_coords.y; 
      x = xtemp; 
     } 
     // copy the current x,y pair back 
     real_t2 val = (real_t2){x, y}; 
     vstore2(val, id, output_coord); 
     output[id] = iter; 
    }   
    """ 
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32) 
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype)) 

    # Calculate the "viewport". 
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.)) 
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.)) 
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.)) 
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.)) 

    # Create index map in x,y pairs 
    xx = np.arange(0, width, 1, dtype=np.uint32) 
    yy = np.arange(0, height, 1, dtype=np.uint32) 
    index_map = np.dstack(np.meshgrid(xx, yy)) 
    # and local "coordinates" (real, imaginary parts) 
    coord_map = np.ndarray(index_map.shape, dtype=_nptype) 
    coord_map[:] = index_map 
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height))) 
    coord_map[:] += (_nptype(x0), _nptype(y0)) 
    coord_map = coord_map.flatten() 
    index_map = index_map.flatten().astype(dtype=np.uint32) 
    # Create input and output buffer 
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes) 
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run 
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes) 
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values 
    buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes) 
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width)) 
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width)) 

    start_max_iter = 0 
    to_do = coord_map.size/2 
    steps_size = int(max_iter/float(iter_steps)) 
    while to_do > 0 and start_max_iter < max_iter: 
     end_max_iter = min(max_iter, start_max_iter + steps_size) 
     print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do) 

     # copy x/y pairs to device 
     cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()   
     # and finally call the ocl function 
     prg.mandel(cl_queue, (to_do,), None, 
      buffer_in_cl,     
      buffer_out_cl, 
      buffer_out_coords_cl, 
      np.uint32(end_max_iter), 
      np.uint32(start_max_iter) 
     ).wait() 
     # Copy the output back 
     cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait() 
     cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait() 

     # Get indices of "found" escapes 
     done = np.where(buffer_out[:to_do]<end_max_iter)[0] 
     # and write the iterations to the coresponding cell 
     index_reshaped = index_map[:to_do*2].reshape((to_do, 2)) 
     tmp = index_reshaped[done] 
     iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]   
     #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]   

     # Get the indices of non escapes 
     undone = np.where(buffer_out[:to_do]==end_max_iter)[0] 
     # and write them back to our "job" maps for the next loop 
     tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2)) 
     coord_map[:undone.size*2] = tmp[undone].flatten() 
     index_map[:undone.size*2] = index_reshaped[undone].flatten() 

     to_do = undone.size 
     start_max_iter = end_max_iter 
     print "%i done. %i unknown" % (done.size, undone.size)        

    # simple coloring by modulo 255 on the iter_map 
    return (iter_map % 255).astype(np.uint8).reshape((height, width)) 


if __name__ == '__main__': 
    ctx = cl.create_some_context(interactive=True) 
    img = mandel(ctx, 
      x=Decimal("-0.7546546453361122021732941811"), 
      y=Decimal("0.05020518634419688663435986387"), 
      zoom=Decimal("0.0002046859427855630601247281079"), 
      max_iter=2000, 
      iter_steps=1, 
      width=500, 
      height=400, 
      use_double=False 
    ) 
    Image.fromarray(img).show() 

編輯:Here is anothe r版本,其中真實/虛擬部分永遠不會離開GPU內存。
結果是一樣的。
我完全沒有想法。

+1

第三圖像,使y你說的是對的,對我來說也不合適。它缺乏你期望的細節(並且方面縮放不正確)。它是否以32位浮點完成(2000次迭代時不足)?在第一張圖像下是否有線索*加上一些「錯誤」像素*?爲什麼有錯誤的像素? –

+0

謝謝@WeatherVane。長寬比是錯誤的,對於這個測試來說應該沒有問題。我固定了縮放計算(仍然錯誤,但是更好)。浮動圖片:http://i.imgur.com/2rShDVl.png,雙圖:http://i.imgur.com/SvwmhWg.png。同樣的(與另一個渲染器的ish點http://guciek.github.io/web_mandelbrot.html#-0.7546546453361122;0.050205186344196885;0.000204685942785563;2000(是的,我的渲染是垂直翻轉的)。我會嘗試給你一個更好的圖像像素計算有點 – SleepProgger

+0

但恕我直言,真正的問題是,我只是從內核臨時x,y(實際/虛構),並提供他們,如果點沒有逃脫(至少這是我的嘗試所以我應該得到相同的圖像,如果我在一次運行中計算了所有內容,或者我在這裏錯了嗎? – SleepProgger

回答

2

在進行Z squared plus c計算時,您正在使用來自buffer_out_coords的更新的「co-ords」而不是原始座標作爲c值。 buffer_out_coords包含當前的Z值,而不是原始的c合作伙伴,所以這些是你想要開始的值,但你也想要原始合作伙伴。

即使世界只有4個需要改變:

  • 化妝buffer_out_coords_cl READ_WRITE
  • 副本buffer_out_coords到buffer_out_coords_cl每次運行前
  • 過濾buffer_out_coords和 「未完成」 在OpenCL的代碼
  • ,負載coord_map來自output_coord的起點x和y而不是座標

林沒有得到相同的輸出你的代碼,你提出這麼林不知道是否還有別的東西錯在這裏或沒有,但這種變化給了我一致的輸出:

1塊= 153052未知

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py 
Iterations from iteration 0 to 500 for 200000 numbers 
46948 done. 153052 unknown 

5塊= 153052未知

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py 
Iterations from iteration 0 to 100 for 200000 numbers 
0 done. 200000 unknown 
Iterations from iteration 100 to 200 for 200000 numbers 
11181 done. 188819 unknown 
Iterations from iteration 200 to 300 for 188819 numbers 
9627 done. 179192 unknown 
Iterations from iteration 300 to 400 for 179192 numbers 
16878 done. 162314 unknown 
Iterations from iteration 400 to 500 for 162314 numbers 
9262 done. 153052 unknown 

赫雷什的代碼:

import pyopencl as cl 
import numpy as np 
from PIL import Image 
from decimal import Decimal 

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False): 
    mf = cl.mem_flags 
    cl_queue = cl.CommandQueue(ctx) 
    # build program 
    code = """ 
    #if real_t == double 
     #pragma OPENCL EXTENSION cl_khr_fp64 : enable 
    #endif 
    kernel void mandel(
     __global real_t *coords, 
     __global uint *output, 
     __global real_t *output_coord, 
     const uint max_iter, 
     const uint start_iter  
    ){ 
     uint id = get_global_id(0);   
     real_t2 my_coords = vload2(id, coords);   
     real_t2 my_value_coords = vload2(id, output_coord);   
     real_t x = my_value_coords.x; 
     real_t y = my_value_coords.y; 
     uint iter = 0; 
     for(iter=start_iter; iter<max_iter; ++iter){ 
      if(x*x + y*y > 4.0f){ 
       break; 
      } 
      real_t xtemp = x*x - y*y + my_coords.x; 
      y = 2*x*y + my_coords.y; 
      x = xtemp; 
     } 
     // copy the current x,y pair back 
     real_t2 val = (real_t2){x, y}; 
     vstore2(val, id, output_coord); 
     output[id] = iter; 
    }   
    """ 
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32) 
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype)) 

    # Calculate the "viewport". 
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.)) 
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.)) 
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.)) 
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.)) 

    # Create index map in x,y pairs 
    xx = np.arange(0, width, 1, dtype=np.uint32) 
    yy = np.arange(0, height, 1, dtype=np.uint32) 
    index_map = np.dstack(np.meshgrid(xx, yy)) 
    # and local "coordinates" (real, imaginary parts) 
    coord_map = np.ndarray(index_map.shape, dtype=_nptype) 
    coord_map[:] = index_map 
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height))) 
    coord_map[:] += (_nptype(x0), _nptype(y0)) 
    coord_map = coord_map.flatten() 
    index_map = index_map.flatten().astype(dtype=np.uint32) 
    # Create input and output buffer 
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes) 
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run 
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes) 
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values 
    buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes) 
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width)) 
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width)) 

    start_max_iter = 0 
    to_do = coord_map.size/2 
    steps_size = int(max_iter/float(iter_steps)) 
    while to_do > 0 and start_max_iter < max_iter: 
     end_max_iter = min(max_iter, start_max_iter + steps_size) 
     print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do) 

     # copy x/y pairs to device 
     cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()   
     cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()   
     # and finally call the ocl function 
     prg.mandel(cl_queue, (to_do,), None, 
      buffer_in_cl,     
      buffer_out_cl, 
      buffer_out_coords_cl, 
      np.uint32(end_max_iter), 
      np.uint32(start_max_iter) 
     ).wait() 
     # Copy the output back 
     cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait() 
     cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait() 

     # Get indices of "found" escapes 
     done = np.where(buffer_out[:to_do]<end_max_iter)[0] 
     # and write the iterations to the coresponding cell 
     index_reshaped = index_map[:to_do*2].reshape((to_do, 2)) 
     tmp = index_reshaped[done] 
     iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]   
     #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]   

     # Get the indices of non escapes 
     undone = np.where(buffer_out[:to_do]==end_max_iter)[0] 
     # and write them back to our "job" maps for the next loop 
     tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2)) 
     buffer_out_coords[:undone.size*2] = tmp[undone].flatten() 
     tmp = coord_map[:to_do*2].reshape((to_do, 2)) 
     coord_map[:undone.size*2] = tmp[undone].flatten() 
     index_map[:undone.size*2] = index_reshaped[undone].flatten() 

     to_do = undone.size 
     start_max_iter = end_max_iter 
     print "%i done. %i unknown" % (done.size, undone.size)        

    # simple coloring by modulo 255 on the iter_map 
    return (iter_map % 255).astype(np.uint8).reshape((height, width)) 


if __name__ == '__main__': 
    ctx = cl.create_some_context(interactive=True) 
    img = mandel(ctx, 
      x=Decimal("-0.7546546453361122021732941811"), 
      y=Decimal("0.05020518634419688663435986387"), 
      zoom=Decimal("0.0002046859427855630601247281079"), 
      max_iter=2000, 
      iter_steps=1, 
      width=500, 
      height=400, 
      use_double=False 
    ) 
    Image.fromarray(img).show() 
+0

哦,該死的。謝謝。沒有測試,但只要我開始閱讀你的答案,我知道我很愚蠢。我爲了這個錯誤而長時間搜尋了這麼難以置信的東西。再次感謝。將盡快接受我測試它,但我很確定這是錯誤。 – SleepProgger