PyOpenCL simple matrix multiplication
I am trying to learn PyOpenCL.
I have been following various tutorials/examples I found online, and I have been trying to put together a simple matrix multiplication.
I don't understand why I can't get the right result: it looks to me as if the for loop in my kernel is never executed (the output C_flat is always zero), or maybe I am managing the memory incorrectly.
Can anyone give me some advice?
Thanks a lot!
Here is the code:
import numpy as np
import pyopencl as cl
import time


def create_input_memory(context, input_arrays):
    return [(array, cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=array.nbytes))
            for array in input_arrays]


def create_output_memory(context, output_arrays):
    return [(array, cl.Buffer(context, flags=cl.mem_flags.WRITE_ONLY, size=array.nbytes))
            for array in output_arrays]


def matrix_multiply_gpu(A, B):
    A_height, A_width = A.shape[0], A.shape[1]
    B_height, B_width = B.shape[0], B.shape[1]
    C = np.zeros((A_height, B_width))
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()
    print(C_flat)

    kernel_source = """
    kernel void mul(int Wa, int Ha, int Wb, int Hb,
                    global float *input_a,
                    global float *input_b,
                    global float *result){
        /* ROW MAJOR notation (I imagine the "GPU matrix") --> no, just model */
        int row = get_global_id(0);
        int col = get_global_id(1);
        float sum = 0.0f;
        for (int i = 0; i < Wa; i++){
            sum += input_a[row * Wa + i] * input_b[i * Wb + col];
        }
        result[row * Wb + col] = sum;
    }
    """

    platforms = cl.get_platforms()
    context = cl.Context(dev_type=cl.device_type.GPU,
                         properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()

    input_tuples = create_input_memory(context, (A_flat, B_flat))
    output_tuples = create_output_memory(context, (C_flat,))
    gpu_queue = cl.CommandQueue(context)

    kernel_arguments = [buffer for (_, buffer) in input_tuples]
    kernel_arguments += [buffer for (_, buffer) in output_tuples]

    gpu_program.mul(gpu_queue, (1024,), (32,),
                    np.int32(A_height), np.int32(A_width), np.int32(B_height),
                    np.int32(B_width), *kernel_arguments)

    for (array, buffer) in output_tuples:
        cl.enqueue_copy(gpu_queue, src=buffer, dest=array)
    # wait for everyone to finish
    gpu_queue.finish()

    return C_flat


if __name__ == '__main__':
    A, B = np.ones((100, 100)), np.ones((100, 100))
    C = matrix_multiply_gpu(A, B)
    print("\n", C, "\n")
Here are some problems with your code:
- You send height-width pairs, but your kernel reads width-height.
- You send flattened arrays, but expect a 2-dimensional array in the kernel.
- You never copy the data to the GPU; there is no COPY_HOST_PTR flag (a minimal buffer sketch showing this, and the dtype cast, follows this list).
- While your code is not working yet, trying to automate it with helper functions only obscures the logic. Do that after your code runs. I won't use your create_... functions.
- I find copying the arrays to the device easier than creating buffers, which is why I do it that way below, but it should work the same either way.
- Always specify the dtype of the data you are sending, and make sure it matches what your kernel expects. I have spent countless hours debugging exactly that.
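For the copy and dtype points, something like this would be the minimal change to your buffer-based approach. This is only a sketch with placeholder names (ctx, queue, a_buf, c_buf), not your full program:

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()      # placeholder context/queue, pick your GPU here
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

A = np.ones((100, 100))
# cast to float32 so the host dtype matches the kernel's float
A_flat = A.astype(np.float32).ravel()

# COPY_HOST_PTR copies the host array into the buffer when it is created,
# so the kernel actually sees your input data
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=A_flat)

# the output buffer only needs a size; you read it back after the kernel runs
c_flat = np.empty(100 * 100, dtype=np.float32)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=c_flat.nbytes)
# ... enqueue the kernel here, then copy the result back to the host:
cl.enqueue_copy(queue, c_flat, c_buf)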
Here is the code:
import numpy as np
import pyopencl as cl
import pyopencl.array


def matrix_multiply_gpu(A, B):
    # ----------------------------this is your code, with minor changes
    A_height, A_width = A.shape
    B_height, B_width = B.shape
    C = np.empty((A_height, B_width), dtype=np.float32)  # some changes
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()

    platforms = cl.get_platforms()
    context = cl.Context(dev_type=cl.device_type.GPU,
                         properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_queue = cl.CommandQueue(context)
    # ------------------------------------------------------------------

    # --------------------------------------This is new or modified code
    size = A_height * B_width
    kernel_source = """
    kernel void mul(int Ha, int Wa, int Hb, int Wb,
                    global const float *input_a,
                    global const float *input_b,
                    global float *result){
        int gid = get_global_id(0);
        int Arow = gid / Wb;
        int Bcol = gid % Wb;
        float sum = 0.0f;
        for (int i = 0; i < Wa; i++)
            sum += input_a[Arow*Wa + i] * input_b[i*Wb + Bcol];
        result[gid] = sum;
    }
    """
    Ad = cl.array.to_device(gpu_queue, A_flat)
    Bd = cl.array.to_device(gpu_queue, B_flat)
    Cd = cl.array.to_device(gpu_queue, C_flat)
    # --------------------------------- and the return is different too

    # -------------------------------your code again with minor changes
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()
    gpu_program.mul(gpu_queue, (size,), None,  # some changes here
                    np.int32(A_height), np.int32(A_width),
                    np.int32(B_height), np.int32(B_width),
                    Ad.data, Bd.data, Cd.data)
    # -----------------------------------------------------------------
    return Cd.get().reshape((A_height, B_width))
    # or send it flattened if you wish, without the reshape


if __name__ == '__main__':
    from pprint import pprint
    A = 2.0*np.ones((5, 3), dtype=np.float32)
    B = 3.0*np.ones((3, 4), dtype=np.float32)
    C = matrix_multiply_gpu(A, B)
    pprint(C)
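If you want to check the result, comparing against NumPy on a small example is the quickest test. A sketch, assuming the matrix_multiply_gpu defined above and a working GPU device:

import numpy as np

A = np.random.rand(5, 3).astype(np.float32)
B = np.random.rand(3, 4).astype(np.float32)

C_gpu = matrix_multiply_gpu(A, B)   # the function from the listing above
C_ref = A @ B                       # NumPy reference result

# float32 accumulation differs slightly from NumPy's, so compare with a tolerance
print(np.allclose(C_gpu, C_ref, atol=1e-5))   # should print True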