2011-05-16 45 views
0

我想用 pyopencl 實現逐元素乘法,但是當我從 pyopencl 讀回結果緩衝區時,8 行中只有前 3 行是正確的。我不確定這是 OpenCL 還是 pyopencl 的問題。下面是我的最小示例及其輸出。歡迎任何建議。(標題:pyopencl 輸出部分錯誤)

感謝

import pyopencl as cl 
import numpy 

# Minimal reproducer: element-wise product of two (height x width) float32
# matrices on an OpenCL device via pyopencl, printed next to its inputs.
# Written for Python 2 (uses print statements).

# OpenCL Kernel code ----------------------------------------------------- 
# Each work-item multiplies one element of A and B into C. The 2-D work-item
# id is flattened as y * width + x, with x taken from global dimension 0 and
# y from global dimension 1. `height` is passed in but never read by the body.
KERNEL_CODE = """ 
    __kernel void eMul(
      __global float* C, 
      __global float* A, 
      __global float* B, 
      int width, int height) 
    { 
     // ID 
     int x = get_global_id(0); 
     int y = get_global_id(1); 

     // Multiplying 
    C[y * width + x ] = A[y * width + x] * B[y * width + x]; 
    } 
""" 

# init OpenCL ----------------------------------------------------- 
# Build the program once and keep a handle to the eMul kernel.
ctx = cl.create_some_context() 
queue = cl.CommandQueue(ctx) 
prg = cl.Program(ctx, KERNEL_CODE).build() 
kernel = prg.eMul 

# init host memory ----------------------------------------------------- 
# Fixed seed so every run produces the same inputs.
numpy.random.seed(42) 
width = 4 
height = 8 
# Both operands have shape (height, width): random values in [0, 10),
# rounded to whole numbers so the products are easy to check by eye.
cl_left= numpy.random.rand(height, width).astype(numpy.float32) * 10 
cl_left = cl_left.round() 
cl_right= numpy.random.rand(height, width).astype(numpy.float32) * 10 
cl_right = cl_right.round() 
print "\nleft\n",cl_left,"\n\nright\n",cl_right 

# transfer host -> device ----------------------------------------------------- 
mf = cl.mem_flags 

# Host-side destination for the result, same shape as the inputs.
cl_result = numpy.zeros(cl_left.shape).astype(numpy.float32) 
# Input buffers are initialised from the host arrays; the output buffer is
# only sized (cl_result.nbytes) and starts with unspecified contents.
d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cl_left) 
d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cl_right) 
d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, cl_result.nbytes) 

# Argument order follows the kernel signature: C, A, B, width, height.
# The scalars are wrapped as numpy.uint32 while the kernel declares them `int`.
kernel.set_arg(0,d_c_buf) 
kernel.set_arg(1,d_a_buf) 
kernel.set_arg(2,d_b_buf) 
kernel.set_arg(3,numpy.uint32(width)) 
kernel.set_arg(4,numpy.uint32(height)) 

# NOTE(review): the global size passed here is cl_result.shape, i.e.
# (height, width) = (8, 4). Global dimension 0 therefore has 8 work-items and
# dimension 1 has 4, so inside the kernel x runs over 0..7 and y over 0..3.
# With a stride of `width` (4) the largest flat index written is
# 3 * 4 + 7 = 19, leaving the last 12 of the 32 elements of d_c_buf
# unwritten, and several indices are written by more than one work-item.
# Passing (width, height) as the global size, or striding by `height` in the
# kernel, would cover every element exactly once.
# The same tuple is also used as the local size, so the whole range is a
# single work-group.
event = cl.enqueue_nd_range_kernel(queue,kernel,cl_result.shape,cl_result.shape) 
event.wait() 

# transfer device -> host ----------------------------------------------------- 
# Copy the device result into cl_result and block until the copy is done.
# NOTE(review): later pyopencl releases favour cl.enqueue_copy over
# enqueue_read_buffer -- confirm against the installed version.
cl.enqueue_read_buffer(queue, d_c_buf, cl_result).wait() 
print "\nresult\n", cl_result 

輸出:

left 
[[ 4. 10. 7. 6.] 
[ 2. 2. 1. 9.] 
[ 6. 7. 0. 10.] 
[ 8. 2. 2. 2.] 
[ 3. 5. 4. 3.] 
[ 6. 1. 3. 4.] 
[ 5. 8. 2. 5.] 
[ 6. 0. 6. 2.]] 
right 
[[ 1. 9. 10. 8.] 
[ 3. 1. 7. 4.] 
[ 1. 5. 0. 9.] 
[ 3. 7. 3. 5.] 
[ 5. 2. 10. 8.] 
[ 9. 9. 6. 9.] 
[ 1. 2. 0. 3.] 
[ 4. 3. 8. 4.]] 

result 
[[ 4. 90. 70. 48.] 
[ 6. 2. 7. 36.] 
[ 6. 35. 0. 90.] 
[ 24. 14. 6. 10.] 
[ 15. 10. 40. 24.] <== till here correct 
[ 138. 69. 87. 35.] <== from here incorrect 
[ 130. 47. 109. 49.] 
[ 95. 45. 25. 49.]] 

回答

2

看起來你在向內核指定數組形狀的方式上有些混亂——基本上,與源 numpy 數組的維度相比,你把寬度和高度弄反了。因此,你對輸出數組使用的行間距(pitch)是 4 個字,而不是 8 個。

如果把內核替換成下面這個按列主序(column-major)編寫的版本:

// Element-wise product of two flattened 2-D float arrays: C[i] = A[i] * B[i].
//
// Expects a 2-D NDRange. Global dimension 0 is the fast-moving coordinate and
// `height` is the stride between consecutive dimension-1 slices, so the kernel
// covers a contiguous buffer when global size 0 equals `height`.
// `width` stays in the argument list so the host-side set_arg calls remain
// valid, but it is not read here. No bounds check is performed: the global
// size must not exceed the buffers.
__kernel void eMul(
    __global float* C,
    __global float* A,
    __global float* B,
    int width, int height)
{
    // Flat offset of this work-item, computed once and shared by all three buffers.
    const int idx = get_global_id(1) * height + get_global_id(0);

    C[idx] = A[idx] * B[idx];
}

我想你會發現結果更符合你的期望。