PyOpenCl simple matrix multiplication

I'm trying to learn PyOpenCl. I've been following various tutorials/examples I found online, and I've been trying to put together a simple matrix multiplication. I can't understand why I don't get the correct result: it looks to me like the for cycle in my kernel is never executed (the output C_flat is always zero), or maybe I'm managing some memory the wrong way. Can anyone give me any advice? Thanks a lot!

Here is the code:

import numpy as np
import pyopencl as cl
import time

def create_input_memory(context, input_arrays):
    return [(array, cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=array.nbytes))
            for array in input_arrays]

def create_output_memory(context, output_arrays):
    return [(array, cl.Buffer(context, flags=cl.mem_flags.WRITE_ONLY, size=array.nbytes))
            for array in output_arrays]

def matrix_multiply_gpu(A, B):
    A_height, A_width = A.shape[0], A.shape[1]
    B_height, B_width = B.shape[0], B.shape[1]
    C = np.zeros((A_height, B_width))
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()
    print(C_flat)

    kernel_source = """
        kernel void mul(int Wa, int Ha, int Wb, int Hb,
                        global float *input_a,
                        global float *input_b,
                        global float *result){
            /* ROW MAJOR notation (I imagine the "GPU matrix") --> no, just model*/
            int row = get_global_id(0);
            int col = get_global_id(1);
            float sum = 0.0f;
            for (int i = 0; i < Wa; i++){
                sum += input_a[row * Wa + i] * input_b[i * Wb + col];
            }
            result[row * Wb + col] = sum;
                        }
    """
    platforms = cl.get_platforms()

    context = cl.Context(dev_type=cl.device_type.GPU,
        properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()

    input_tuples = create_input_memory(context,
                                        (A_flat, B_flat))
    output_tuples = create_output_memory(context, (C_flat,)) 

    gpu_queue = cl.CommandQueue(context)    
        
    kernel_arguments = [buffer for (_,buffer) in input_tuples]
    kernel_arguments += [buffer for (_,buffer) in output_tuples]

    gpu_program.mul(gpu_queue, (1024,), (32,), 
                        np.int32(A_height), np.int32(A_width), np.int32(B_height), 
                        np.int32(B_width), *kernel_arguments)

    for (array, buffer) in output_tuples:
        cl.enqueue_copy(gpu_queue, src=buffer, dest=array)
    
    #wait for everyone to finish
    gpu_queue.finish()

    return C_flat

if __name__=='__main__':
    A, B = np.ones((100, 100)), np.ones((100, 100))
    C = matrix_multiply_gpu(A, B)
    print("\n", C, "\n")

Here are some of the problems with your code:

  1. You send height-width pairs, but your kernel reads them as width-height.
  2. You send flattened arrays, but expect 2-dim indexing in the kernel.
  3. You never copy your data to the GPU: there is no COPY_HOST_PTR flag (see the sketch after this list).
  4. Trying to automate things with helper functions while your code doesn't work yet only obscures the logic. Do that after your code runs. I wouldn't use your create_... functions.
  5. I find it easier to copy arrays to the device than to create buffers, which is why I do it that way, but it should work the same either way.
  6. Always specify the dtype of the data you send, and make sure it matches what your kernel expects. I have spent countless hours debugging because of this.
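To make points 1-3 and 6 concrete, here is a minimal sketch that keeps your original 2D-indexed kernel but creates the buffers with COPY_HOST_PTR and launches with a 2D global size. The kernel body is yours; the surrounding host code (create_some_context, the a_buf/b_buf/c_buf names) is just one way to wire it up, not the only correct form:

import numpy as np
import pyopencl as cl

mf = cl.mem_flags

# Point 6: the kernel works on float, so use float32 explicitly
# (np.ones defaults to float64, which silently breaks the result).
A = np.ones((100, 100), dtype=np.float32)
B = np.ones((100, 100), dtype=np.float32)
C = np.empty((A.shape[0], B.shape[1]), dtype=np.float32)

context = cl.create_some_context()
queue = cl.CommandQueue(context)
program = cl.Program(context, """
    kernel void mul(int Wa, int Ha, int Wb, int Hb,
                    global float *input_a,
                    global float *input_b,
                    global float *result){
        int row = get_global_id(0);
        int col = get_global_id(1);
        float sum = 0.0f;
        for (int i = 0; i < Wa; i++)
            sum += input_a[row * Wa + i] * input_b[i * Wb + col];
        result[row * Wb + col] = sum;
    }
""").build()

# Point 3: COPY_HOST_PTR copies the host array into the buffer at creation time.
a_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=A)
b_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=B)
c_buf = cl.Buffer(context, mf.WRITE_ONLY, C.nbytes)

# Point 2: a 2D global size matching get_global_id(0)/get_global_id(1).
# Point 1: widths before heights, in the order the kernel declares them.
program.mul(queue, C.shape, None,
            np.int32(A.shape[1]), np.int32(A.shape[0]),
            np.int32(B.shape[1]), np.int32(B.shape[0]),
            a_buf, b_buf, c_buf)

cl.enqueue_copy(queue, C, c_buf)
print(C)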

Here is the code:

import numpy as np
import pyopencl as cl
import pyopencl.array

def matrix_multiply_gpu(A, B):
    # ----------------------------this is your code, with minor changes
    A_height, A_width = A.shape
    B_height, B_width = B.shape
    C = np.empty((A_height, B_width), dtype=np.float32)  # some changes
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()
    platforms = cl.get_platforms()
    context = cl.Context(dev_type=cl.device_type.GPU,
        properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_queue = cl.CommandQueue(context)    
    # ------------------------------------------------------------------

    # --------------------------------------This is new or modified code
    size = A_height * B_width

    kernel_source = """
        kernel void mul(int Ha, int Wa, int Hb, int Wb,
                        global const float *input_a,
                        global const float *input_b,
                        global       float *result     ){
            int gid = get_global_id(0);
            int Arow = gid / Wb;
            int Bcol = gid % Wb;
            float sum = 0.0f;
            for (int i = 0; i < Wa; i++)
                sum += input_a[Arow*Wa+i] * input_b[i*Wb+Bcol];
            result[gid] = sum;
        }
        """
    Ad = cl.array.to_device(gpu_queue, A_flat)
    Bd = cl.array.to_device(gpu_queue, B_flat)
    Cd = cl.array.to_device(gpu_queue, C_flat)
    # --------------------------------- and the return is different too
    
    # -------------------------------your code again with minor changes
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()
    gpu_program.mul(gpu_queue, (size,), None,  # some changes here
                        np.int32(A_height), np.int32(A_width), 
                        np.int32(B_height), np.int32(B_width), 
                        Ad.data, Bd.data, Cd.data)
    # -----------------------------------------------------------------

    return Cd.get().reshape((A_height, B_width)) 
    # or send it flattened if you wish, without the reshape

if __name__=='__main__':
    from pprint import pprint
    A = 2.0*np.ones((5, 3), dtype=np.float32)
    B = 3.0*np.ones((3, 4), dtype=np.float32)
    
    C = matrix_multiply_gpu(A, B)
    pprint(C)
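
As a quick host-side sanity check (not part of the GPU path), you can compare the result with NumPy's own product right after the pprint(C) call above; with the 5x3 and 3x4 matrices there, every entry of C should be 2.0*3.0*3 = 18.0:

    expected = np.dot(A, B)            # reference result computed on the host
    print(np.allclose(C, expected))    # prints True if the kernel is correct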