PyOpenCL

Question

我正在尝试运行基于高斯脉冲传播的模拟。我正在我的 windows 台式机与 i5 4590 和 GTX 970（最新驱动程序）和我 2015 年初的 macbook air 之间进行交叉开发。

当运行使用我的主要代码时，我无法在我的桌面上获得任何像样的结果（计算结果不同），但在我的 mac 上，结果似乎还不错。

为了进一步研究这个问题，我尝试运行一个简单的高斯传播。 mac书上的结果或多或少是好的，而在桌面上它完全是一团糟。

我运行在两个 mac 引擎上使用相同的代码，并且都具有相同的 python (2.7.10) 分布和各自的模块。

这是我的代码

import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

dx = 0.01
X = sp.arange(0.0, 100, dx)
N = len(X)

A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(-1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)

plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

Source = """
    #define complex_ctr(x, y) (float2)(x, y)
    #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
    #define complex_unit (float2)(0, 1)

    __kernel void propagate(__global float2 *A){
        const int gid_x = get_global_id(0);
        float EPS = 0.1f;
        A[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
    }
"""
prg = cl.Program(ctx, Source).build()
for i in range(3000):
    print i
    event = prg.propagate(queue, (N,), None, A_d)
    event.wait()
cl.enqueue_copy(queue, A_h, A_d)

plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

plot.show()

这是结果

桌面结果：

Mac 结果：

绿线对应传播后的高斯，蓝线是初始高斯

在 NVidia 方面可能会导致此问题的原因是什么？我想我错过了防止这种情况发生的关键步骤，并且它运行在 mac 上有点运气

编辑

这是我根据用户的建议运行的最终代码

import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

dx = sp.float32(0.001)
X = sp.arange(0.0, 100, dx).astype(sp.float32)
N = len(X)

A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
B_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)

plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

Source = """
    #define complex_ctr(x, y) (float2)(x, y)
    #define complex_mul(a, b) complex_ctr((a).x * (b).x - (a).y * (b).y, (a).x * (b).y + (a).y * (b).x)
    #define complex_unit (float2)(0, 1)

    __kernel void propagate(__global float2 *A){
        const int gid_x = get_global_id(0);
        float EPS = 0.1f;
        float2 a1, a2;
        a1 = A[gid_x-1];
        a2 = A[gid_x+1];
        barrier(CLK_GLOBAL_MEM_FENCE);
        A[gid_x] += EPS*complex_mul((a1 + a2), complex_unit);
    }
"""

prg = cl.Program(ctx, Source).build()
for i in range(12000):
    print i
    evolve = prg.propagate(queue, (N,), None, A_d)
    evolve.wait()
cl.enqueue_copy(queue, A_h, A_d)

plot.plot(X, abs(A_h) ** 2)

plot.show()

Answer 1

编辑：哦，请阅读@talonmies 评论，它与此解决方案相同。

此代码在 OpenCL 中不安全，存在数据争用问题：

A[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);

每个工作项 x 使用 x+1 和 x-1。根据工作项目的时间表，结果会有所不同。

改为使用 2 个缓冲区，从 A 读取，写入 B，简单：

import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

dx = 0.01
X = sp.arange(0.0, 100, dx)
N = len(X)

A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(-1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
B_d = cl.Buffer(ctx, MF.READ_WRITE)

plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

Source = """
    #define complex_ctr(x, y) (float2)(x, y)
    #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
    #define complex_unit (float2)(0, 1)

    __kernel void propagate(__global float2 *A, __global float2 *B){
        const int gid_x = get_global_id(0);
        float EPS = 0.1f;
        B[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
    }
"""
prg = cl.Program(ctx, Source).build()
for i in range(3000):
    print i
    event = prg.propagate(queue, (N,), None, A_d, B_d)
    A_d, B_d = B_d, A_d #Swap buffers, so A always has results
    #event.wait() #You don't need this, this is slowing terribly the execution, enqueue_copy already waits
cl.enqueue_copy(queue, A_h, A_d)

plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

plot.show()

PyOpenCL - 在 Intel 和 NVidia 上的不同结果

PyOpenCL - Different results on Intel and NVidia

python

numpy

nvidia

opencl