无法在 pyopencl 中呈现 mandelbrot

Question

我正在优化 pyOpenCL 中的 Mandelbrot 渲染器，并希望将迭代分成块，以便更好地利用我的 GPU。
最大迭代次数=1000 和 2 "chunks":
的示例 1. 运行迭代 0-500 的 mandelbrot 转义算法。
2. 为迭代次数 < 500 的每个点保存所需的迭代次数，并再次为迭代次数为 500 - 1000 的所有其他点保存运行。

第一个循环按预期工作，但之后的每个块都会导致错误的结果。我真的很想说得更具体一些，但我不知道真正的问题出在哪里（现在盯着代码看了 2 天以上）。
我怀疑从内核复制旧的 x、y（实部、虚部）部分时出现问题，但我不知道如何调试它。
我在我的 GPU 和 CPU 上得到相同的结果运行ning，所以我猜它与 GPU 无关。

具有 iterations=2000 和 10 个块的示例图像:

这几乎只是第一个块（加上一些 "wrong" 像素）。
全部在一个块中完成（迭代次数=200 和 1 个块）：

以及 iterations=2000 和 chunks = 1 的预期结果:

import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    code = """
    #if real_t == double
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter    
    ){
        uint id = get_global_id(0);         
        real_t2 my_coords = vload2(id, coords);           
        real_t x = my_coords.x;
        real_t y = my_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }        
    """
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = coord_map.size / 2
    steps_size = int(max_iter / float(iter_steps))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size )
        print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)

        # copy x/y pairs to device 
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do,), None,
            buffer_in_cl,                   
            buffer_out_cl,
            buffer_out_coords_cl,
            np.uint32(end_max_iter),
            np.uint32(start_max_iter)
        ).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do]<end_max_iter)[0]
        # and write the iterations to the coresponding cell
        index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
        #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
        # and write them back to our "job" maps for the next loop
        tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
        coord_map[:undone.size*2] = tmp[undone].flatten()
        index_map[:undone.size*2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print "%i done. %i unknown" % (done.size, undone.size)                            

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))


if __name__ == '__main__':
    ctx = cl.create_some_context(interactive=True)
    img = mandel(ctx,
          x=Decimal("-0.7546546453361122021732941811"),
          y=Decimal("0.05020518634419688663435986387"),
          zoom=Decimal("0.0002046859427855630601247281079"),
          max_iter=2000,
          iter_steps=1,
          width=500,
          height=400,
          use_double=False
    )
    Image.fromarray(img).show()

编辑：Here 是另一个版本，其中 real/imaginary 部分永远不会离开 GPU 内存。
结果是一样的。
我完全没思路了。

Answer 1

在进行 Z squared plus c 计算时，您正在使用 buffer_out_coords 更新后的 "co-ords" 而不是原始坐标作为 c 值。 buffer_out_coords 包含当前 Z 值而不是原始 c 坐标，因此这些是您想要开始的值，但您还需要原始坐标。

您只需要进行 4 处更改：

制作buffer_out_coords_clREAD_WRITE
在每个运行
通过 "undone"
在 opencl 代码中，从 output_coord 而不是坐标

我没有得到与您提供的代码相同的输出，所以我不确定这里是否还有其他问题，但此更改为我提供了一致的输出：

1 个区块 = 153052 未知

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 500 for 200000 numbers
46948 done. 153052 unknown

5 个区块 = 153052 个未知

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 100 for 200000 numbers
0 done. 200000 unknown
Iterations from iteration 100 to 200 for 200000 numbers
11181 done. 188819 unknown
Iterations from iteration 200 to 300 for 188819 numbers
9627 done. 179192 unknown
Iterations from iteration 300 to 400 for 179192 numbers
16878 done. 162314 unknown
Iterations from iteration 400 to 500 for 162314 numbers
9262 done. 153052 unknown

代码如下：

import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    code = """
    #if real_t == double
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter    
    ){
        uint id = get_global_id(0);         
        real_t2 my_coords = vload2(id, coords);           
        real_t2 my_value_coords = vload2(id, output_coord);           
        real_t x = my_value_coords.x;
        real_t y = my_value_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }        
    """
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = coord_map.size / 2
    steps_size = int(max_iter / float(iter_steps))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size )
        print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)

        # copy x/y pairs to device 
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
        cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()        
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do,), None,
            buffer_in_cl,                   
            buffer_out_cl,
            buffer_out_coords_cl,
            np.uint32(end_max_iter),
            np.uint32(start_max_iter)
        ).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do]<end_max_iter)[0]
        # and write the iterations to the coresponding cell
        index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
        #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
        # and write them back to our "job" maps for the next loop
        tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
        buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
        tmp = coord_map[:to_do*2].reshape((to_do, 2))
        coord_map[:undone.size*2] = tmp[undone].flatten()
        index_map[:undone.size*2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print "%i done. %i unknown" % (done.size, undone.size)                            

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))


if __name__ == '__main__':
    ctx = cl.create_some_context(interactive=True)
    img = mandel(ctx,
          x=Decimal("-0.7546546453361122021732941811"),
          y=Decimal("0.05020518634419688663435986387"),
          zoom=Decimal("0.0002046859427855630601247281079"),
          max_iter=2000,
          iter_steps=1,
          width=500,
          height=400,
          use_double=False
    )
    Image.fromarray(img).show()

无法在 pyopencl 中呈现 mandelbrot

Failing to render mandelbrot in pyopencl

python

numpy

opencl

mandelbrot

pyopencl