OpenCL running slower than Cython (on CPU)
I have a program that computes the potential/force between a given data point (pos) and the rest of the data. It was originally written in Cython, and I tried PyOpenCL (setting the device to the Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz on my 2013 MacBook Pro) hoping for a speedup, but it actually runs much slower than Cython. Also, the Cython version is double precision while the CL version is only single-precision float; the results have been confirmed to be equal.
The IPython notebook is below. For 2 million x 2 data, PyOpenCL takes 176 ms while Cython takes only 82 ms. Is there any way to optimize it and reduce the overhead? Many thanks:
from __future__ import division
import numpy as np
import pyopencl as cl
import pyopencl.array
import math
import time
%load_ext pyopencl.ipython_ext
%load_ext Cython
%pylab inline
# prepare data
datad = np.random.rand(2000000,2)-[0.5, 0.5] # Double
data = datad.astype(np.float32)
N, dim = data.shape[0], data.shape[1]
sigma = 0.04
i = 2
pos = np.array(data[i,:]) # float
posd = np.array(datad[i,:]) #double
dt = 0.005
resistant = 0.9995
kernelsource = """
__kernel void forceFinder(
    const int N,
    const int dim,
    const float sigma,
    __global float* datacl,
    __global float* poscl,
    __global float* res)
{
    int i = get_global_id(0); // Global id
    float f_sum;
    int k;
    float sigma2 = sigma * sigma;
    if (i < N) {
        f_sum = 0.0;
        for (k = 0; k < dim; k++)
        {
            f_sum += (poscl[k] - datacl[i * dim + k]) * (poscl[k] - datacl[i * dim + k]);
        }
        for (k = 0; k < dim; k++)
        {
            res[i * dim + k] = (datacl[i * dim + k] - poscl[k]) * exp(-f_sum/sigma2)/sigma2;
        }
    }
}
"""
# Setup PyOpenCl
platform = cl.get_platforms()[0]
device = platform.get_devices()[0] # Select the device (the CPU in this setup)
ctx = cl.Context([device]) # Create a context for that device
queue = cl.CommandQueue(ctx) # Create a command queue for the target device.
program = cl.Program(ctx, kernelsource).build()
size = N * dim
datacl = data.reshape((size,))
rescl = np.empty(size).astype(np.float32)
rescl.fill(0.0)
datacl_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = datacl)
pos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = pos)
rescl_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = rescl)
forceFinder = program.forceFinder
forceFinder.set_scalar_arg_dtypes([np.uint32, np.uint32, np.float32, None, None, None])
globalrange = (N, dim)
localrange = None
# Run CL
t0 = time.time()
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, \
pos_buf, rescl_buf)
queue.finish()
cl.enqueue_copy(queue, rescl, rescl_buf)
result = rescl.reshape((N,dim))
t1 = time.time()
print (t1-t0)
# Now Cython
%%cython
cimport numpy as np
import numpy as np
from libc.stdlib cimport rand, malloc, free
from libc.math cimport exp
def PTSM(np.ndarray[np.float64_t, ndim = 1] position, np.ndarray[np.float64_t, ndim = 2] X,\
         double sigma=0.25,\
         double dt=0.01, double r=0.99, int Nsamp=1000):
    cdef int N, dim
    N, dim = X.shape[0], X.shape[1]
    cdef int i, j, steps # These 3 are for iterations.
    cdef double sigma2
    cdef double force1p_sum
    cdef double *force = <double *>malloc(dim * sizeof(double))
    sigma2 = sigma * sigma
    #--------------------
    # Force
    # for steps in range(Nsamp):
    for i in range(dim):
        force[i] = 0
    force1p_sum = 0  # initialize the accumulator before the first point
    for j in range(N):
        for i in range(dim):
            force1p_sum += (position[i] - X[j,i]) * (position[i] - X[j,i])
        for i in range(dim):
            force[i] += (X[j,i] - position[i]) * exp(-force1p_sum/sigma2) / sigma2
        force1p_sum = 0
    resultForce = np.zeros(dim)
    for i in range(dim):
        resultForce[i] = force[i]
    free(force)
    return resultForce
t0 = time.time()
f = PTSM(posd, datad, sigma, dt, resistant)
t1 = time.time()
print (t1 - t0)
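To make the equality check mentioned above explicit, here is a minimal host-side sketch, assuming the notebook names above (result is the reshaped PyOpenCL output, f the Cython return value; the rtol is only a rough allowance for float32 rounding):
# Summing the per-point CL forces over all points should reproduce
# the Cython total force, up to single-precision rounding.
np.testing.assert_allclose(result.sum(axis=0), f, rtol=1e-4)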
Your global range is globalrange = (N, dim), yet inside the kernel you only use get_global_id(0) and loop over dim yourself in a for loop.
So you are effectively doing N*dim*dim operations instead of N*dim: the extra dim dimension does not affect the output (the dim work-items along the second axis do identical work and write identical values to res). That is consistent with 176 ms vs 82 ms, almost exactly a factor of two. You have the same hardware and the same device utilization with both approaches, so this seems logical.
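One work-item per data point is therefore enough. A minimal sketch of the corrected launch against the buffers defined above:
# Launch N work-items; the kernel's inner for-loop already covers dim.
globalrange = (N,)   # or (N, 1), as in the modified code below
localrange = None
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf,
            pos_buf, rescl_buf)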
In addition, a few optimizations:
I would not call queue.finish() before the copy, since it causes an implicit block on the CL device.
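A minimal sketch of that reordering (the same ordering is used in the modified code below; note that PyOpenCL's enqueue_copy to a host array is a blocking read by default):
# Enqueue the read-back first, then synchronize once at the end.
cl.enqueue_copy(queue, rescl, rescl_buf)
queue.finish()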
This:
f_sum += (poscl[k] - datacl[i * dim + k]) * (poscl[k] - datacl[i * dim + k]);
should be (to avoid an extra global read):
f_sum += pown(poscl[k] - datacl[i * dim + k], 2);
Change the data layout for coalesced access. Currently each work-item accesses an i*k-shaped matrix with a stride, whereas a matrix laid out in dim-major order would give coalesced access. Change it from i*k to k*i.
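A hypothetical host-side sketch of that layout change (not included in the modified code below; the kernel would then index datacl[k * N + i]):
# Transpose to dim-major (k*i) order so consecutive work-items read
# consecutive floats for a given k (coalesced access).
data_kmajor = np.ascontiguousarray(data.T)   # shape (dim, N)
datacl = data_kmajor.reshape(-1)             # kernel indexes datacl[k * N + i]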
poscl should be constant and read-only.
You should compute poscl - datacl first, store it in a private array, and then use it in the two loops; this avoids extra global reads.
Modified code (not tested). Note: I did not add the matrix-ordering change.
# prepare data
datad = np.random.rand(2000000,2)-[0.5, 0.5] # Double
data = datad.astype(np.float32)
N, dim = data.shape[0], data.shape[1]
sigma = 0.04
i = 2
pos = np.array(data[i,:]) # float
posd = np.array(datad[i,:]) #double
dt = 0.005
resistant = 0.9995
kernelsource = """
__kernel void forceFinder(
    const int N,
    const int dim,
    const float sigma,
    __global float* datacl,
    __constant float* poscl,
    __global float* res)
{
    int i = get_global_id(0); // Global id
    float f_sum;
    int k;
    float sigma2 = sigma * sigma;
    if (i < N) {
        f_sum = 0.0;
        float t[2]; // INCREASE TO THE MAX "DIM" POSSIBLE
        for (k = 0; k < dim; k++){
            t[k] = poscl[k] - datacl[i * dim + k];
        }
        for (k = 0; k < dim; k++){
            f_sum += pown(t[k], 2);
        }
        for (k = 0; k < dim; k++){
            res[i * dim + k] = (-t[k]) * exp(-f_sum/sigma2)/sigma2;
        }
    }
}
"""
# Setup PyOpenCl
platform = cl.get_platforms()[0]
device = platform.get_devices()[0] # Select the device (the CPU in this setup)
ctx = cl.Context([device]) # Create a context for that device
queue = cl.CommandQueue(ctx) # Create a command queue for the target device.
program = cl.Program(ctx, kernelsource).build()
size = N * dim
datacl = data.reshape((size,))
rescl = np.empty(size).astype(np.float32)
rescl.fill(0.0)
datacl_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = datacl)
pos_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf = pos)
rescl_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, rescl.nbytes)
forceFinder = program.forceFinder
forceFinder.set_scalar_arg_dtypes([np.uint32, np.uint32, np.float32, None, None, None])
globalrange = (N, 1)
localrange = None
# Run CL
t0 = time.time()
forceFinder(queue, globalrange, localrange, N, dim, sigma, datacl_buf, \
pos_buf, rescl_buf)
cl.enqueue_copy(queue, rescl, rescl_buf)
queue.finish()
result = rescl.reshape((N,dim))
t1 = time.time()
print (t1-t0)