ios metal:一个命令缓冲区中的多个内核调用
ios metal: multiple kernel calls in one command buffer
我在 Metal 中结合 Swift 实现多个内核函数时遇到问题。
我的目标是对图像实施逐块 DCT 变换。 DCT是用两个矩阵乘法实现的。
J = H * I * H^-1
以下代码显示了内核函数本身以及在 swift 代码中使用的调用。如果我 运行 每个内核函数单独工作,但我无法设法将写缓冲区从第一个内核函数移交给第二个函数。因此,第二个函数总是 returns 一个仅填充 0 的缓冲区。
所有图像输入和输出缓冲区都是 400x400 大的 RGB(每个分量为 16 位整数)。矩阵是 8x8 16 位整数。
是否需要特殊命令来同步不同内核函数的缓冲区读写访问?还是我做错了什么?
感谢您的帮助
shaders.metal
struct Image3D16{
short data[400][400][3];
};
struct Matrix{
short data[8 * 8];
};
kernel void dct1(device Image3D16 *inputImage [[buffer(0)]],
device Image3D16 *outputImage [[buffer(1)]],
device Matrix *mult [[buffer(2)]],
uint2 gid [[thread_position_in_grid]],
uint2 tid [[thread_position_in_threadgroup]]){
int red = 0, green = 0, blue = 0;
for(int x=0;x<8;x++){
short r = inputImage->data[gid.x-tid.x + x][gid.y][0];
short g = inputImage->data[gid.x-tid.x + x][gid.y][1];
short b = inputImage->data[gid.x-tid.x + x][gid.y][2];
red += r * mult->data[tid.x*8 + x];
green += g * mult->data[tid.x*8 + x];
blue += b * mult->data[tid.x*8 + x];
}
outputImage->data[gid.x][gid.y][0] = red;
outputImage->data[gid.x][gid.y][1] = green;
outputImage->data[gid.x][gid.y][2] = blue;
}
kernel void dct2(device Image3D16 *inputImage [[buffer(0)]],
device Image3D16 *outputImage [[buffer(1)]],
device Matrix *mult [[buffer(2)]],
uint2 gid [[thread_position_in_grid]],
uint2 tid [[thread_position_in_threadgroup]]){
int red = 0, green = 0, blue = 0;
for(int y=0;y<8;y++){
short r = inputImage->data[gid.x][gid.y-tid.y + y][0];
short g = inputImage->data[gid.x][gid.y-tid.y + y][1];
short b = inputImage->data[gid.x][gid.y-tid.y + y][2];
red += r * mult->data[tid.y*8 + y];
green += g * mult->data[tid.y*8 + y];
blue += b * mult->data[tid.y*8 + y];
}
outputImage->data[gid.x][gid.y][0] = red;
outputImage->data[gid.x][gid.y][1] = green;
outputImage->data[gid.x][gid.y][2] = blue;
}
ViewController.swift
...
let commandBuffer = commandQueue.commandBuffer()
let computeEncoder1 = commandBuffer.computeCommandEncoder()
computeEncoder1.setComputePipelineState(computeDCT1)
computeEncoder1.setBuffer(input, offset: 0, atIndex: 0)
computeEncoder1.setBuffer(tmpBuffer3D1, offset: 0, atIndex: 1)
computeEncoder1.setBuffer(dctMatrix1, offset: 0, atIndex: 2)
computeEncoder1.dispatchThreadgroups(blocks, threadsPerThreadgroup: dctSize)
computeEncoder1.endEncoding()
let computeEncoder2 = commandBuffer.computeCommandEncoder()
computeEncoder2.setComputePipelineState(computeDCT2)
computeEncoder2.setBuffer(tmpBuffer3D1, offset: 0, atIndex: 0)
computeEncoder2.setBuffer(output, offset: 0, atIndex: 1)
computeEncoder2.setBuffer(dctMatrix2, offset: 0, atIndex: 2)
computeEncoder2.dispatchThreadgroups(blocks, threadsPerThreadgroup: dctSize)
computeEncoder2.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
我发现了错误。我的内核函数试图读取其分配的内存之外的内容。金属接口的反应是停止执行命令缓冲区中的所有后续命令。因此,输出始终为零,因为计算从未完成。应用程序的GPU使用率下降,可用于检测错误。
我在 Metal 中结合 Swift 实现多个内核函数时遇到问题。
我的目标是对图像实施逐块 DCT 变换。 DCT是用两个矩阵乘法实现的。
J = H * I * H^-1
以下代码显示了内核函数本身以及在 swift 代码中使用的调用。如果我 运行 每个内核函数单独工作,但我无法设法将写缓冲区从第一个内核函数移交给第二个函数。因此,第二个函数总是 returns 一个仅填充 0 的缓冲区。
所有图像输入和输出缓冲区都是 400x400 大的 RGB(每个分量为 16 位整数)。矩阵是 8x8 16 位整数。
是否需要特殊命令来同步不同内核函数的缓冲区读写访问?还是我做错了什么?
感谢您的帮助
shaders.metal
struct Image3D16{
short data[400][400][3];
};
struct Matrix{
short data[8 * 8];
};
kernel void dct1(device Image3D16 *inputImage [[buffer(0)]],
device Image3D16 *outputImage [[buffer(1)]],
device Matrix *mult [[buffer(2)]],
uint2 gid [[thread_position_in_grid]],
uint2 tid [[thread_position_in_threadgroup]]){
int red = 0, green = 0, blue = 0;
for(int x=0;x<8;x++){
short r = inputImage->data[gid.x-tid.x + x][gid.y][0];
short g = inputImage->data[gid.x-tid.x + x][gid.y][1];
short b = inputImage->data[gid.x-tid.x + x][gid.y][2];
red += r * mult->data[tid.x*8 + x];
green += g * mult->data[tid.x*8 + x];
blue += b * mult->data[tid.x*8 + x];
}
outputImage->data[gid.x][gid.y][0] = red;
outputImage->data[gid.x][gid.y][1] = green;
outputImage->data[gid.x][gid.y][2] = blue;
}
kernel void dct2(device Image3D16 *inputImage [[buffer(0)]],
device Image3D16 *outputImage [[buffer(1)]],
device Matrix *mult [[buffer(2)]],
uint2 gid [[thread_position_in_grid]],
uint2 tid [[thread_position_in_threadgroup]]){
int red = 0, green = 0, blue = 0;
for(int y=0;y<8;y++){
short r = inputImage->data[gid.x][gid.y-tid.y + y][0];
short g = inputImage->data[gid.x][gid.y-tid.y + y][1];
short b = inputImage->data[gid.x][gid.y-tid.y + y][2];
red += r * mult->data[tid.y*8 + y];
green += g * mult->data[tid.y*8 + y];
blue += b * mult->data[tid.y*8 + y];
}
outputImage->data[gid.x][gid.y][0] = red;
outputImage->data[gid.x][gid.y][1] = green;
outputImage->data[gid.x][gid.y][2] = blue;
}
ViewController.swift
...
let commandBuffer = commandQueue.commandBuffer()
let computeEncoder1 = commandBuffer.computeCommandEncoder()
computeEncoder1.setComputePipelineState(computeDCT1)
computeEncoder1.setBuffer(input, offset: 0, atIndex: 0)
computeEncoder1.setBuffer(tmpBuffer3D1, offset: 0, atIndex: 1)
computeEncoder1.setBuffer(dctMatrix1, offset: 0, atIndex: 2)
computeEncoder1.dispatchThreadgroups(blocks, threadsPerThreadgroup: dctSize)
computeEncoder1.endEncoding()
let computeEncoder2 = commandBuffer.computeCommandEncoder()
computeEncoder2.setComputePipelineState(computeDCT2)
computeEncoder2.setBuffer(tmpBuffer3D1, offset: 0, atIndex: 0)
computeEncoder2.setBuffer(output, offset: 0, atIndex: 1)
computeEncoder2.setBuffer(dctMatrix2, offset: 0, atIndex: 2)
computeEncoder2.dispatchThreadgroups(blocks, threadsPerThreadgroup: dctSize)
computeEncoder2.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
我发现了错误。我的内核函数试图读取其分配的内存之外的内容。金属接口的反应是停止执行命令缓冲区中的所有后续命令。因此,输出始终为零,因为计算从未完成。应用程序的GPU使用率下降,可用于检测错误。