高效地同步排列许多小型 OpenCL 内核

Efficiently synchronously queue many small OpenCL kernels

TLDR:我怎样才能逐个(一次一个)运行多个小内核,而不产生显著的开销?

我正在做一个充当虚拟绿屏的项目。它接受图像输入,寻找与颜色键(color key)相似的像素,并用替换颜色替换这些像素。我计划在 Windows 中将生成的图像流输出为虚拟网络摄像头。完整的源代码在 GitHub 上。目前,我使用 OpenCL 的 Java 绑定(JOCL)来加速该过程。主要应用程序是用 JavaFX 和 Kotlin 编写的,我对此很熟悉;但 OpenCL 内核是用 C 编写的,而我在这方面还是新手。

这是我为程序创建的主要“API”。我尝试使 API 接口相对开放,以便将来可以添加直接的 Cuda 支持。

class OpenClApi constructor(
    platformIndex: Int = 0,
    deviceIndex: Int = 0,
    val localWorkSize: Long? = null
) : AbstractApi {
    companion object : AbstractApi.AbstractApiConsts {
        override val listName = "OpenCl"

        enum class ClMemOperation(val flags: Long) {
            // CL_MEM_USE_HOST_PTR instead of CL_MEM_COPY_HOST_PTR speeds up most operations for realtime video

        /**
         * Queries the OpenCL runtime for every available platform.
         *
         * The first `clGetPlatformIDs` call only retrieves the platform count; the second
         * call fills an array of that size. When the reported count is zero, an empty array
         * is returned and the second call is skipped.
         *
         * @return the platform handles, in the order the runtime reports them.
         */
        private fun getPlatforms(): Array<cl_platform_id?> {
            val platformCount = IntArray(1)
            clGetPlatformIDs(0, null, platformCount)
            val platforms = arrayOfNulls<cl_platform_id>(platformCount[0])
            // Passing num_entries == 0 together with a non-null output array is rejected by
            // OpenCL (CL_INVALID_VALUE), so only run the second query when there is
            // something to fetch.
            if (platforms.isNotEmpty()) {
                clGetPlatformIDs(platforms.size, platforms, null)
            }
            return platforms
        }

        /**
         * Returns the platform handle at index [platformId].
         *
         * `getOrNull` is used so that an out-of-range index and a null entry both end up in
         * the same branch and produce the same descriptive message.
         *
         * @throws ArrayIndexOutOfBoundsException if no platform exists at [platformId].
         */
        private fun getPlatform(platformId: Int) = getPlatforms().getOrNull(platformId)
            ?: throw ArrayIndexOutOfBoundsException("Couldn't find the specified platform")

        /**
         * Builds a map from platform index to the platform's `CL_PLATFORM_NAME`.
         *
         * Each name is read with two `clGetPlatformInfo` calls: one to learn the byte size,
         * one to fetch the bytes. The last byte (the C string terminator) is dropped. A
         * platform whose reported name size is zero is mapped to an empty string.
         *
         * @return platform names keyed by their index in the platform list.
         */
        fun getPlatformsMap(): Map<Int, String> {
            val result = mutableMapOf<Int, String>()
            getPlatforms().forEachIndexed { platformId, platformFromList ->
                val size = LongArray(1)
                clGetPlatformInfo(platformFromList, CL_PLATFORM_NAME, 0, null, size)
                val buffer = ByteArray(size[0].toInt())
                result[platformId] = if (buffer.isEmpty()) {
                    // Nothing to fetch; String(buffer, 0, -1) would throw here.
                    ""
                } else {
                    clGetPlatformInfo(
                        platformFromList,
                        CL_PLATFORM_NAME,
                        buffer.size.toLong(),
                        Pointer.to(buffer),
                        null
                    )
                    String(buffer, 0, buffer.size - 1)
                }
            }
            return result
        }

        /**
         * Queries every device (`CL_DEVICE_TYPE_ALL`) of the platform at index [platformId].
         *
         * The first `clGetDeviceIDs` call only retrieves the device count; the second call
         * fills an array of that size. When the reported count is zero, an empty array is
         * returned and the second call is skipped.
         *
         * @return the device handles, in the order the runtime reports them.
         * @throws ArrayIndexOutOfBoundsException if the platform cannot be resolved.
         */
        private fun getDevices(platformId: Int): Array<cl_device_id?> {
            val platform = getPlatform(platformId)
            val deviceCount = IntArray(1)
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, null, deviceCount)
            val devices = arrayOfNulls<cl_device_id>(deviceCount[0])
            // Passing num_entries == 0 together with a non-null output array is rejected by
            // OpenCL (CL_INVALID_VALUE), so only run the second query when there is
            // something to fetch.
            if (devices.isNotEmpty()) {
                clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices.size, devices, null)
            }
            return devices
        }

        /**
         * Returns the device handle at index [deviceId] on the platform at index [platformId].
         *
         * `getOrNull` is used so that an out-of-range device index and a null entry both end
         * up in the same branch and produce the same descriptive message.
         *
         * @throws ArrayIndexOutOfBoundsException if the platform or device cannot be resolved.
         */
        private fun getDevice(platformId: Int, deviceId: Int) = getDevices(platformId).getOrNull(deviceId)
                ?: throw ArrayIndexOutOfBoundsException("Couldn't find the specified platform or device")

        /**
         * Builds a map from device index to the device's `CL_DEVICE_NAME` for the platform
         * at index [platformId].
         *
         * Each name is read with two `clGetDeviceInfo` calls: one to learn the byte size,
         * one to fetch the bytes. The last byte (the C string terminator) is dropped. A
         * device whose reported name size is zero is mapped to an empty string.
         *
         * @return device names keyed by their index in the device list.
         */
        fun getDevicesMap(platformId: Int): Map<Int, String> {
            val result = mutableMapOf<Int, String>()
            getDevices(platformId).forEachIndexed { deviceId, deviceFromList ->
                val size = LongArray(1)
                clGetDeviceInfo(deviceFromList, CL_DEVICE_NAME, 0, null, size)
                val buffer = ByteArray(size[0].toInt())
                result[deviceId] = if (buffer.isEmpty()) {
                    // Nothing to fetch; String(buffer, 0, -1) would throw here.
                    ""
                } else {
                    clGetDeviceInfo(
                        deviceFromList,
                        CL_DEVICE_NAME,
                        buffer.size.toLong(),
                        Pointer.to(buffer),
                        null
                    )
                    String(buffer, 0, buffer.size - 1)
                }
            }
            return result
        }

    // Platform handle resolved from the constructor's platformIndex.
    private val platform: cl_platform_id = getPlatform(platformIndex)

    // Created empty here; CL_CONTEXT_PLATFORM is only added later, inside the init block.
    private val contextProperties: cl_context_properties = cl_context_properties()

    // Device handle resolved from the constructor's platformIndex / deviceIndex.
    private val device: cl_device_id = getDevice(platformIndex, deviceIndex)

    // NOTE(review): Kotlin runs property initializers and init blocks in textual order, so
    // this clCreateContext call executes before the init block adds CL_CONTEXT_PLATFORM to
    // contextProperties. The context therefore appears to be created with empty
    // properties; confirm, and if so populate contextProperties before this line.
    private val context: cl_context = clCreateContext(contextProperties, 1, arrayOf(device), null, null, null)

    // Assigned in the init block via clCreateCommandQueueWithProperties.
    val commandQueue: cl_command_queue

    // Assigned in the init block via clCreateProgramWithSource, then built with clBuildProgram.
    val program: cl_program

    init {
        contextProperties.addProperty(CL_CONTEXT_PLATFORM.toLong(), platform)
        val properties = cl_queue_properties()
        commandQueue = clCreateCommandQueueWithProperties(context, device, properties, null)
        val sources = arrayOf(
        ).map {
        program = clCreateProgramWithSource(context, sources.size, sources, null, null)
        clBuildProgram(program, 0, null, null, null, null)

    override fun getFilters(): Map<String, AbstractFilter> = mapOf(
        OpenClInitialComparisonFilter.listName to OpenClInitialComparisonFilter(api = this),
        OpenClNoiseReductionFilter.listName to OpenClNoiseReductionFilter(api = this),
        OpenClFlowKeyFilter.listName to OpenClFlowKeyFilter(api = this),
        OpenClSplashFilter.listName to OpenClSplashFilter(api = this),

    override fun close() {

    fun allocMem(ptr: Pointer?, op: ClMemOperation, size: Int): cl_mem = clCreateBuffer(

这是一个使用 API 实例处理帧的示例“过滤器”。

class OpenClInitialComparisonFilter @Suppress("LongParameterList") constructor(
    private val api: OpenClApi,
    var colorKey: ByteArray = byteArrayOf(0, 255.toByte(), 0),
    var replacementKey: ByteArray = byteArrayOf(0, 255.toByte(), 0),
    var percentTolerance: Float = 0.025f,
    var colorSpace: ColorSpace = ColorSpace.ALL,
    var width: Int = DEFAULT_WIDTH_PIXELS,
    var height: Int = DEFAULT_HEIGHT_PIXELS
) : AbstractFilter{
    companion object : AbstractFilterConsts {
        override val listName = "Initial Comparison"

        private const val KERNEL_NAME = "initialComparisonKernel"

    override fun getProperties(): Map<AbstractFilterProperty, Any> = mapOf(
        AbstractFilterProperty.TOLERANCE to percentTolerance,
        AbstractFilterProperty.COLOR_KEY to colorKey,
        AbstractFilterProperty.REPLACEMENT_KEY to replacementKey,
        AbstractFilterProperty.COLOR_SPACE to colorSpace

    override fun setProperty(listName: String, newValue: Any) = when (listName) {
        AbstractFilterProperty.TOLERANCE.listName -> percentTolerance = newValue as Float
        AbstractFilterProperty.COLOR_KEY.listName -> colorKey = newValue as ByteArray
        AbstractFilterProperty.REPLACEMENT_KEY.listName -> replacementKey = newValue as ByteArray
        AbstractFilterProperty.COLOR_SPACE.listName -> colorSpace = newValue as ColorSpace
        else -> throw ArrayIndexOutOfBoundsException("Couldn't find property $listName")

    override fun apply(inputBuffer: ByteArray): ByteArray {
        val outputBuffer = ByteArray(size = inputBuffer.size)
        val floatOptionsBuffer = floatArrayOf(percentTolerance)
        val intOptionsBuffer = intArrayOf(colorSpace.i, width, height)

        val inputPtr = Pointer.to(inputBuffer)
        val outputPtr = Pointer.to(outputBuffer)
        val colorKeyPtr = Pointer.to(colorKey)
        val replacementKeyPtr = Pointer.to(replacementKey)
        val floatOptionsPtr = Pointer.to(floatOptionsBuffer)
        val intOptionsPtr = Pointer.to(intOptionsBuffer)

        val inputMem = api.allocMem(inputPtr, ClMemOperation.READ, Sizeof.cl_char * inputBuffer.size)
        val outputMem = api.allocMem(null, ClMemOperation.WRITE, Sizeof.cl_char * outputBuffer.size)
        val colorKeyMem = api.allocMem(colorKeyPtr, ClMemOperation.READ, Sizeof.cl_char * colorKey.size)
        val replacementKeyMem = api.allocMem(
            Sizeof.cl_char * replacementKey.size
        val floatOptionsMem = api.allocMem(
            Sizeof.cl_float * floatOptionsBuffer.size
        val intOptionsMem = api.allocMem(intOptionsPtr, ClMemOperation.READ, Sizeof.cl_int * intOptionsBuffer.size)

        val kernel = clCreateKernel(api.program, KERNEL_NAME, null)
        var a = 0
        clSetKernelArg(kernel, a++, Sizeof.cl_mem.toLong(), Pointer.to(inputMem))
        clSetKernelArg(kernel, a++, Sizeof.cl_mem.toLong(), Pointer.to(outputMem))
        clSetKernelArg(kernel, a++, Sizeof.cl_mem.toLong(), Pointer.to(colorKeyMem))
        clSetKernelArg(kernel, a++, Sizeof.cl_mem.toLong(), Pointer.to(replacementKeyMem))
        clSetKernelArg(kernel, a++, Sizeof.cl_mem.toLong(), Pointer.to(floatOptionsMem))
        clSetKernelArg(kernel, a, Sizeof.cl_mem.toLong(), Pointer.to(intOptionsMem))
        val globalWorkSizeBuffer = api.localWorkSize?.let {
            longArrayOf(ceil(inputBuffer.size / it.toFloat()).toLong() * it)
        } ?: longArrayOf(inputBuffer.size.toLong())
        val localWorkSizeBuffer = api.localWorkSize?.let { longArrayOf(api.localWorkSize) }

            (inputBuffer.size * Sizeof.cl_char).toLong(),


        return outputBuffer

这里是 InitialComparison 内核的一个例子,它寻找并替换相似的像素。

enum ColorSpace {
    BLUE = 0,
    GREEN = 1,
    RED = 2,
    ALL = 3

enum FloatOptions {

enum IntOptions {
    COLOR_SPACE = 0,
    WIDTH = 1,
    HEIGHT = 2,
    BLOCK_SIZE = 3

// Returns the normalized difference, in [0, 1], between the 3-byte pixel stored at
// a[i..i+2] and the 3-byte pixel stored at b[j..j+2].
//
// colorSpace < 3 selects a single channel (used directly as an index into the pixel) and
// yields that channel's absolute difference divided by 255. Any other value sums the
// absolute differences of all three channels, each divided by 765 (3 * 255).
//
// Channel bytes are reinterpreted as unsigned 0..255 before subtracting. With plain
// signed char, the values 0 and 255 (stored as -1) would differ by 1 instead of 255.
float calcColorDiff(
    const char *a,
    const int i,
    const char *b,
    const int j,
    const int colorSpace
) {
    float colorDiff[3];
    for (int k = 0; k < 3; k++) {
        const int lhs = (unsigned char)a[i + k];
        const int rhs = (unsigned char)b[j + k];
        colorDiff[k] = (float)(lhs > rhs ? lhs - rhs : rhs - lhs);
    }
    if (colorSpace < 3) {
        // Float literals keep the arithmetic in single precision.
        return colorDiff[colorSpace] / 255.0f;
    }
    float percentDiff = 0.0f;
    for (int k = 0; k < 3; k++) {
        percentDiff += colorDiff[k] / 765.0f;
    }
    return percentDiff;
}

// Copies the 3-byte pixel at ink[j..j+2] into canvas[i..i+2].
void writePixel(
    char *canvas,
    const int i,
    const char *ink,
    const int j
) {
    for (int channel = 0; channel < 3; channel++) {
        canvas[i + channel] = ink[j + channel];
    }
}

// Initial comparison filter: for every 3-byte pixel of `input`, writes `replacementKey`
// to `output` when the pixel's difference from `colorKey` (see calcColorDiff) is below
// floatOptions[PERCENT_TOLERANCE], and copies the input pixel through otherwise.
//
// The kernel is launched with one work-item per byte; only the work-item whose global id
// is the first byte of a pixel (gid % 3 == 0) does any work.
//
// NOTE(review): there is no upper bound check on gid. The host code rounds the global
// work size up to a multiple of localWorkSize, so padded work-items may read and write
// past the end of the buffers. Confirm, and pass the buffer length if so.
__kernel void initialComparisonKernel(
    __global const char *input,
    __global char *output,
    __global const char *colorKey,
    __global const char *replacementKey,
    __global const float *floatOptions,
    __global const int *intOptions
) {
    float percentTolerance = floatOptions[PERCENT_TOLERANCE];
    int colorSpace = intOptions[COLOR_SPACE];
    int gid = get_global_id(0);

    if (gid % 3 == 0) {
        float percentDiff = calcColorDiff(input, gid, colorKey, 0, colorSpace);
        if (percentDiff < percentTolerance) {
            writePixel(output, gid, replacementKey, 0);
        } else {
            writePixel(output, gid, input, gid);
        }
    }
}

效果很好!这比在 CPU 上运行快得多,甚至比使用 Java 的多线程 ExecutorService 还要快。除了初始比较之外,我还运行了两个额外的过滤器:NoiseReduction,它会移除大部分未被其他绿屏像素包围的绿屏像素;以及 FlowKey,它会填充绿屏像素之间的间隙。

// Returns 1 when the 3-byte pixel at input[i..i+2] is byte-for-byte equal to
// colorKey[0..2], and 0 otherwise.
int checkPixelEquality(
    const char *input,
    const int i,
    const char *colorKey
) {
    for (int j = 0; j < 3; j++) {
        // A single differing channel is enough to reject the pixel.
        if (input[i + j] != colorKey[j]) {
            return 0;
        }
    }
    return 1;
}

// Noise reduction filter. For every 3-byte pixel of `input` that equals `colorKey`,
// counts how many of its four direct neighbours (left, right, up, down) also equal
// `colorKey`; a neighbour that would fall outside the width x height image counts as a
// match. With fewer than 3 matches the pixel is replaced by the pixel at the same
// offset in `template`; otherwise `colorKey` is written. Pixels that do not equal
// `colorKey` are copied from `template`.
//
// The kernel is launched with one work-item per byte; only the work-item whose global id
// is the first byte of a pixel (gid % 3 == 0) does any work.
__kernel void noiseReductionKernel(
    __global const char *input,
    __global char *output,
    __global const char *template,
    __global const char *colorKey,
    __global const int *intOptions
) {
    int width = intOptions[WIDTH];
    int height = intOptions[HEIGHT];
    int gid = get_global_id(0);

    if (gid % 3 == 0) {
        if (checkPixelEquality(input, gid, colorKey) == 1) {
            const int pixel = gid / 3;
            const int col = pixel % width;
            const int row = pixel / width;
            const int rowStride = width * 3;  // bytes per image row

            int surroundingPixels = 0;
            surroundingPixels += (col == 0)
                ? 1 : checkPixelEquality(input, gid - 3, colorKey);
            surroundingPixels += (col == width - 1)
                ? 1 : checkPixelEquality(input, gid + 3, colorKey);
            surroundingPixels += (row == 0)
                ? 1 : checkPixelEquality(input, gid - rowStride, colorKey);
            surroundingPixels += (row == height - 1)
                ? 1 : checkPixelEquality(input, gid + rowStride, colorKey);

            if (surroundingPixels < 3) {
                writePixel(output, gid, template, gid);
            } else {
                writePixel(output, gid, colorKey, 0);
            }
        } else {
            writePixel(output, gid, template, gid);
        }
    }
}

// Flow key filter. For a pixel of `input` that does not equal `colorKey`, each of the
// four direct neighbours (left, right, up, down) inside the width x height image is
// tested: when the neighbour equals `colorKey` in `input` and calcColorDiff between this
// input pixel and the neighbouring `template` pixel exceeds
// floatOptions[GRADIENT_TOLERANCE], `colorKey` is written to `output`.
// Pixels that already equal `colorKey` are written as `colorKey`.
//
// NOTE(review): closing braces are missing from this listing, so the exact nesting
// cannot be confirmed from the text alone.
__kernel void flowKeyKernel(
    __global const char *input,
    __global char *output,
    __global const char *template,
    __global const char *colorKey,
    __global const float *floatOptions,
    __global const int *intOptions
) {
    float gradientTolerance = floatOptions[GRADIENT_TOLERANCE];
    int colorSpace = intOptions[COLOR_SPACE];
    int width = intOptions[WIDTH];
    int height = intOptions[HEIGHT];
    int gid = get_global_id(0);

    // Only the work-item at the first byte of a 3-byte pixel takes this branch.
    if (gid % 3 == 0) {
        if (checkPixelEquality(input, gid, colorKey) == 0) {
            // Left neighbour (skipped in the first column).
            if (
                (gid / 3) % width != 0 &&
                checkPixelEquality(input, gid - 3, colorKey) == 1 &&
                calcColorDiff(input, gid, template, gid - 3, colorSpace) > gradientTolerance
            ) {
                writePixel(output, gid, colorKey, 0);
            // Right neighbour (skipped in the last column).
            if (
                (gid / 3) % width != width - 1 &&
                checkPixelEquality(input, gid + 3, colorKey) == 1 &&
                calcColorDiff(input, gid, template, gid + 3, colorSpace) > gradientTolerance
            ) {
                writePixel(output, gid, colorKey, 0);
            // Neighbour one row up (skipped in the first row).
            if (
                (gid / 3) / width != 0 &&
                checkPixelEquality(input, gid - (width * 3), colorKey) == 1 && 
                calcColorDiff(input, gid, template, gid - (width * 3), colorSpace) > gradientTolerance
            ) {
                writePixel(output, gid, colorKey, 0);
            // Neighbour one row down (skipped in the last row).
            if (
                (gid / 3) / width != height - 1 &&
                checkPixelEquality(input, gid + (width * 3), colorKey) == 1 &&
                calcColorDiff(input, gid, template, gid + (width * 3), colorSpace) > gradientTolerance
            ) {
                writePixel(output, gid, colorKey, 0);
            // NOTE(review): as indented, this write follows the four neighbour checks
            // unconditionally and would overwrite any colorKey written above. Confirm
            // whether it was meant to be the fallback (else) case.
            writePixel(output, gid, template, gid);
        } else {
            writePixel(output, gid, colorKey, 0);
    // NOTE(review): work-items with gid % 3 != 0 write colorKey to output[gid..gid+2],
    // which overlaps bytes written by the pixel-aligned work-items. Confirm this branch
    // is intended.
    } else {
        writePixel(output, gid, colorKey, 0);

问题在于,与通过 clEnqueueNDRangeKernel 将内核排队的开销相比,这些内核本身的运行时间微不足道。这意味着运行所有内核的开销太大,导致帧延迟。每个过滤器必须依次运行(一次一个),直到图像被完全处理。

我目前对 OpenCL 的理解是:在一个已排队的内核中,各个工作组会以不确定的顺序执行,并且不保证它们并发执行。因为这些过滤器必须在整个图像上一次一个地应用,所以我能想到的唯一选择是将许多小内核依次排队。


我曾尝试把所有过滤器聚合到同一个内核中(见下方代码),但遇到了两个问题:

  1. 工作组的运行不保证并发,这意味着某一行像素可能已经跑完了所有过滤器,而另一行像素还完全没有开始运行。
  2. 当我为聚合内核实现锁(同步)机制时,它会卡死,因为并非所有工作组都在同时运行。

// Aggregated kernel: runs the whole filter pipeline for one pixel inside a single launch.
// Order of steps for the work-item's pixel: initial comparison, then
// INITIAL_NOISE_REDUCTION_ITERATIONS noise-reduction passes, FLOW_KEY_ITERATIONS flow-key
// passes and FINAL_NOISE_REDUCTION_ITERATIONS noise-reduction passes. Every step writes
// its result to tmpActive and then copies this pixel from tmpActive to tmpStale, which
// is the buffer the next step reads. The final tmpStale pixel is copied to output.
//
// NOTE(review): no barrier or other synchronization between work-items is visible here.
// The apply* helpers are not shown; presumably they read neighbouring pixels from
// tmpStale. If so, the result depends on the order in which work-items happen to run.
// NOTE(review): closing braces are missing from this listing.
__kernel void openClKernel(
    __global const char *input,
    __global char *output,
    __global const char *colorKey,
    __global const char *replacementKey,
    __global const float *floatOptions,
    __global const int *intOptions,
    __global char *tmpActive,
    __global char *tmpStale
) {
    float tolerance = floatOptions[TOLERANCE];
    float flowKeyTolerance = floatOptions[FLOW_KEY_TOLERANCE];
    int colorSpace = intOptions[COLOR_SPACE];
    int width = intOptions[WIDTH];
    int height = intOptions[HEIGHT];
    int initialNoiseReductionIterations = intOptions[INITIAL_NOISE_REDUCTION_ITERATIONS];
    int flowKeyIterations = intOptions[FLOW_KEY_ITERATIONS];
    int finalNoiseReductionIterations = intOptions[FINAL_NOISE_REDUCTION_ITERATIONS];
    int gid = get_global_id(0);

    // Only the work-item at the first byte of a 3-byte pixel does any work.
    if (gid % 3 == 0) {
        // Step 1: initial comparison, input -> tmpActive, then publish to tmpStale.
        applyInitialComparison(input, tmpActive, colorKey, replacementKey, tolerance, colorSpace, gid);
        writePixel(tmpStale, gid, tmpActive, gid);
        // Step 2: first round of noise-reduction passes.
        for (int i = 0; i < initialNoiseReductionIterations; i++) {
            applyNoiseReduction(tmpStale, tmpActive, input, replacementKey, width, height, gid);
            writePixel(tmpStale, gid, tmpActive, gid);
        // Step 3: flow-key passes.
        for (int i = 0; i < flowKeyIterations; i++) {
            applyFlowKey(tmpStale, tmpActive, input, replacementKey, flowKeyTolerance, colorSpace, width, height, gid);
            writePixel(tmpStale, gid, tmpActive, gid);
        // Step 4: final round of noise-reduction passes.
        for (int i = 0; i < finalNoiseReductionIterations; i++) {
            applyNoiseReduction(tmpStale, tmpActive, input, replacementKey, width, height, gid);
            writePixel(tmpStale, gid, tmpActive, gid);
        // Step 5: copy the finished pixel to the output buffer.
        writePixel(output, gid, tmpStale, gid);

话虽如此,聚合内核的运行速度比拆分后的多个内核快约 1,000 倍(我实现了一个小的帧延迟计数器来测量)。这向我表明,与手头的任务相比,内核排队的开销太大了。

我可以做些什么来优化这个程序?有没有办法高效地将许多小内核排队?有没有办法重构内核,使它们能够并发运行?如果有需要,也请告诉我如何提高这个问题的质量。


每次内核启动都有固定的开销,比方说 1 毫秒。这部分开销一部分源于指令的加载,但主要源于每个内核结束时所有线程的同步。因此,如果您启动许多每个执行时间为 1 毫秒的小内核,那么总时间的一半将损失在开销上。如果将许多小内核聚合成一个运行 9 毫秒的内核,那么开销仅占 10%。

因此,在数据移动允许、且不会引入竞争条件(race condition)的前提下,应将尽可能多的小内核聚合为一个。还要确保每个内核的范围(range)尽可能大。

或者,您可以并行使用多个队列。范围小的内核无法使 GPU 饱和,因此 GPU 在任一时刻都有一部分处于空闲状态。如果您有多个并发队列,这些队列中的内核可以并发运行,共同使硬件饱和。然而,每次内核启动的开销损失依然存在。