OpenCL中矩阵乘法中的右global_work_size和local_work_size是什么?

What is the right global_work_size and local_work_size in Matrix Multiplication in OpenCL?

我在 JOCL 中做矩阵乘法 Java 绑定到 OpenCL。问题,我得到一个错误

Exception in thread "main" org.jocl.CLException: CL_INVALID_WORK_ITEM_SIZE at org.jocl.CL.checkResult(CL.java:787) at org.jocl.CL.clEnqueueNDRangeKernel(CL.java:20802) at org.jocl.samples.JOCLSample.main(JOCLSample.java:150)

代码如下:

package org.jocl.samples;

import static org.jocl.CL.*;

import org.jocl.*;
import static java.lang.System.nanoTime;


public class JOCLSample
{
    /**
     * The source code of the OpenCL program to execute
     */
    private static String programSource =
    "__kernel void "+
    "matrixMul(__global float* C,"+ 
    "          __global float* A,"+ 
    "          __global float* B,"+ 
    "          int wA, int wB)"+
    "{"+

       "int tx = get_global_id(0);"+ 
       "int ty = get_global_id(1);"+

       "float value = 0;"+
       "for (int k = 0; k < wA; ++k)"+
       "{"+
       "   float elementA = A[ty * wA + k];"+
       "   float elementB = B[k * wB + tx];"+
       "   value += elementA * elementB;"+
       "}"+

      "C[ty * wA + tx] = value;"+
    "}";



    /**
     * The entry point of this sample
     * 
     * @param args Not used
     */
    public static void main(String args[])
    {
        // Create input- and output data 
        int n = 10;
        float srcArrayA[] = new float[n];
        float srcArrayB[] = new float[n];
        float dstArray[] = new float[n];
        for (int i=0; i<n; i++)
        {
            srcArrayA[i] = i;
            srcArrayB[i] = i;
        }
        Pointer srcA = Pointer.to(srcArrayA);
        Pointer srcB = Pointer.to(srcArrayB);
        Pointer dst = Pointer.to(dstArray);

        // The platform, device type and device number
        // that will be used
        final int platformIndex = 0;
        final long deviceType = CL_DEVICE_TYPE_ALL;
        final int deviceIndex = 0;

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Obtain the number of platforms
        int numPlatformsArray[] = new int[1];
        clGetPlatformIDs(0, null, numPlatformsArray);
        int numPlatforms = numPlatformsArray[0];

        // Obtain a platform ID
        cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_platform_id platform = platforms[platformIndex];

        // Initialize the context properties
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);

        // Obtain the number of devices for the platform
        int numDevicesArray[] = new int[1];
        clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
        int numDevices = numDevicesArray[0];

        // Obtain a device ID 
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
        cl_device_id device = devices[deviceIndex];

        // Create a context for the selected device
        cl_context context = clCreateContext(
            contextProperties, 1, new cl_device_id[]{device}, 
            null, null, null);

        // Create a command-queue for the selected device
        cl_command_queue commandQueue = 
            clCreateCommandQueue(context, device, 0, null);

        // Allocate the memory objects for the input- and output data
        cl_mem memObjects[] = new cl_mem[3];
        memObjects[0] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            Sizeof.cl_float * n, srcA, null);
        memObjects[1] = clCreateBuffer(context, 
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
            Sizeof.cl_float * n, srcB, null);
        memObjects[2] = clCreateBuffer(context, 
            CL_MEM_READ_WRITE, 
            Sizeof.cl_float * n, null, null);

        // Create the program from the source code
        cl_program program = clCreateProgramWithSource(context,
            1, new String[]{ programSource }, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);

        // Create the kernel
        cl_kernel kernel = clCreateKernel(program, "matrixMul", null);

        long time = nanoTime();
        // Set the arguments for the kernel
        clSetKernelArg(kernel, 0, 
            Sizeof.cl_mem, Pointer.to(memObjects[0]));
        clSetKernelArg(kernel, 1, 
            Sizeof.cl_mem, Pointer.to(memObjects[1]));
        clSetKernelArg(kernel, 2, 
            Sizeof.cl_mem, Pointer.to(memObjects[2]));

        // Set the work-item dimensions
        long global_work_size[] = new long[]{n};
        long local_work_size[] = new long[]{1};


        // Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel, 2, null,
            global_work_size, local_work_size, 0, null, null);

        // Read the output data
        clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0,
            n * Sizeof.cl_float, dst, 0, null, null);

        time = nanoTime() - time;
        System.out.println("GPU time: "+ time +"ns " + (time/1000000)+"ms");

        // Release kernel, program, and memory objects
        clReleaseMemObject(memObjects[0]);
        clReleaseMemObject(memObjects[1]);
        clReleaseMemObject(memObjects[2]);
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(commandQueue);
        clReleaseContext(context);

        // Verify the result
        boolean passed = true;
        final float epsilon = 1e-7f;
        for (int i=0; i<n; i++)
        {
            float x = dstArray[i];
            float y = srcArrayA[i] * srcArrayB[i];
            boolean epsilonEqual = Math.abs(x - y) <= epsilon * Math.abs(x);
            if (!epsilonEqual)
            {
                passed = false;
                break;
            }
        }
        System.out.println("Test "+(passed?"PASSED":"FAILED"));
        if (n <= 128)
        {
            System.out.println("Result: "+java.util.Arrays.toString(dstArray));
        }
    }
}

我的问题是,大小为[1024 x 1024]、[2048 x 2048]、[4096 x 4096]和[的矩阵乘法的右global_work_size和local_work_size是多少? 8192 x 8192]?

这是 global_work_size 和 local_work_size 中导致错误的代码

// Set the work-item dimensions
        long global_work_size[] = new long[]{n};
        long local_work_size[] = new long[]{1};


// Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel, 2, null,
        global_work_size, local_work_size, 0, null, null);

除非您正在使用共享本地内存、async_copy 函数,或出于其他原因检查 get_local_id,否则您不需要设置本地工作组大小,您可以让driver select 一个(通过传递 NULL)。一旦一切正常,您可以尝试传递不同的值以查看它是否会改变性能,但请注意全局大小需要是局部大小的整数倍,因此如果这会使它大于您的实际工作大小,您必须在内核中添加条件以跳过边界情况。

您正在将 clEnqueueNDRange 中的 work_dim 参数分配给 2。但是您传递的全局大小数组的大小为 1(您使用单个值对其进行初始化)。这是轰炸,因为它希望数组有 2 个条目。 work_dim 字段指定全局工作项的维度,而不是数据数组的维度。所以将 work_dim 字段(第三个参数)设置为 1.