Swift iOS 上数组的金属并行求和计算

Question

基于@Kametrixom ，我做了一些用于并行计算数组和的测试应用程序。

我的测试应用程序如下所示：

import UIKit
import Metal

class ViewController: UIViewController {

// Data type, has to be the same as in the shader
typealias DataType = CInt

override func viewDidLoad() {
    super.viewDidLoad()

    let data = (0..<10000000).map{ _ in DataType(200) } // Our data, randomly generated


    var start, end : UInt64


    var result:DataType = 0
    start = mach_absolute_time()
    data.withUnsafeBufferPointer { buffer in
        for elem in buffer {
            result += elem
        }
    }
    end = mach_absolute_time()

    print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")

    result = 0


    start = mach_absolute_time()
    result = sumParallel4(data)
    end = mach_absolute_time()

    print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")


    result = 0

    start = mach_absolute_time()
    result = sumParralel(data)
    end = mach_absolute_time()

    print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")

    result = 0

    start = mach_absolute_time()
    result = sumParallel3(data)
    end = mach_absolute_time()

    print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")





}

func sumParralel(data : Array<DataType>) -> DataType {

    let count = data.count
    let elementsPerSum: Int = Int(sqrt(Double(count)))

    let device = MTLCreateSystemDefaultDevice()!
    let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
    let pipeline = try! device.newComputePipelineStateWithFunction(parsum)


    var dataCount = CUnsignedInt(count)
    var elementsPerSumC = CUnsignedInt(elementsPerSum)
    let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)


    let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
    let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
    let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later

    let queue = device.newCommandQueue()
    let cmds = queue.commandBuffer()
    let encoder = cmds.computeCommandEncoder()

    encoder.setComputePipelineState(pipeline)

    encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
    encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
    encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
    encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

    // We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
    let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

    // Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
    let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)

    encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
    encoder.endEncoding()


    var result : DataType = 0


    cmds.commit()
    cmds.waitUntilCompleted()
    for elem in results {
        result += elem
    }


    return result
}



func sumParralel1(data : Array<DataType>) -> UnsafeBufferPointer<DataType> {

    let count = data.count
    let elementsPerSum: Int = Int(sqrt(Double(count)))

    let device = MTLCreateSystemDefaultDevice()!
    let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
    let pipeline = try! device.newComputePipelineStateWithFunction(parsum)


    var dataCount = CUnsignedInt(count)
    var elementsPerSumC = CUnsignedInt(elementsPerSum)
    let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)

    let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
    let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
    let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later

    let queue = device.newCommandQueue()
    let cmds = queue.commandBuffer()
    let encoder = cmds.computeCommandEncoder()

    encoder.setComputePipelineState(pipeline)

    encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
    encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
    encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
    encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

    // We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
    let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

    // Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
    let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)

    encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
    encoder.endEncoding()


    cmds.commit()
    cmds.waitUntilCompleted()



    return results
}

func sumParallel3(data : Array<DataType>) -> DataType {

    var results = sumParralel1(data)

    repeat {
        results = sumParralel1(Array(results))
    } while results.count >= 100

    var result : DataType = 0

    for elem in results {
        result += elem
    }


    return result
}

func sumParallel4(data : Array<DataType>) -> DataType {

    let queue = NSOperationQueue()
    queue.maxConcurrentOperationCount = 4

    var a0 : DataType = 0
    var a1 : DataType = 0
    var a2 : DataType = 0
    var a3 : DataType = 0

    let op0 = NSBlockOperation( block : {

        for i in 0..<(data.count/4) {
            a0 = a0 + data[i]
        }

    })

    let op1 = NSBlockOperation( block : {
        for i in (data.count/4)..<(data.count/2) {
            a1 = a1 + data[i]
        }
    })

    let op2 = NSBlockOperation( block : {
        for i in (data.count/2)..<(3 * data.count/4) {
            a2 = a2 + data[i]
        }
    })

    let op3 = NSBlockOperation( block : {
        for i in (3 * data.count/4)..<(data.count) {
            a3 = a3 + data[i]
        }
    })



    queue.addOperation(op0)
    queue.addOperation(op1)
    queue.addOperation(op2)
    queue.addOperation(op3)

    queue.suspended = false
    queue.waitUntilAllOperationsAreFinished()

    let aaa: DataType = a0 + a1 + a2 + a3

    return aaa
 }
}

我有一个看起来像这样的着色器：

kernel void parsum(const device DataType* data [[ buffer(0) ]],
               const device uint& dataLength [[ buffer(1) ]],
               device DataType* sums [[ buffer(2) ]],
               const device uint& elementsPerSum [[ buffer(3) ]],

               const uint tgPos [[ threadgroup_position_in_grid ]],
               const uint tPerTg [[ threads_per_threadgroup ]],
               const uint tPos [[ thread_position_in_threadgroup ]]) {

    uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    for (; dataIndex < endIndex; dataIndex++)
        sums[resultIndex] += data[dataIndex];
}

我的惊喜功能 sumParallel4 是最快的，我认为它不应该。我注意到当我调用函数 sumParralel 和 sumParallel3 时，第一个函数总是比较慢，即使我改变了函数的顺序。（因此，如果我先调用 sumParralel，则速度较慢，如果我调用 sumParallel3，则速度较慢。）

这是为什么？为什么 sumParallel3 并不比 sumParallel 快很多？为什么 sumParallel4 是最快的，虽然它是在 CPU?

上计算的

如何使用 posix_memalign 更新我的 GPU 功能？我知道它应该工作得更快，因为它会在 GPU 和 CPU 之间共享内存，但我不知道数组应该以这种方式分配（数据或结果）以及我如何分配数据posix_memalign data 是否是函数传入的参数？

Answer 1

在运行对 iPhone 6 进行这些测试时，我发现 Metal 版本运行比天真的 CPU 总和慢 3 倍，快 2 倍。通过我在下面描述的修改，它始终更快。

我发现运行使用 Metal 版本的很多成本不仅可以归因于缓冲区的分配，尽管这很重要，而且还可以归因于第一次创建设备和计算管道状态。这些是您通常会在应用程序初始化时执行一次的操作，因此将它们包含在时间中并不完全公平。

还应注意，如果您在启用 Metal 验证层和 GPU 帧捕获的情况下运行通过 Xcode 进行这些测试，那么运行-时间成本，并且会偏向 CPU 的结果。

考虑到这些注意事项，下面是您可以如何使用 posix_memalign 分配可用于备份 MTLBuffer 的内存。诀窍是确保您请求的内存实际上是页面对齐的（即它的地址是 getpagesize() 的倍数），这可能需要将超出您实际需要存储数据的内存量四舍五入:

let dataCount = 1_000_000
let dataSize = dataCount * strideof(DataType)
let pageSize = Int(getpagesize())
let pageCount = (dataSize + (pageSize - 1)) / pageSize
var dataPointer: UnsafeMutablePointer<Void> = nil
posix_memalign(&dataPointer, pageSize, pageCount * pageSize)
let data = UnsafeMutableBufferPointer(start: UnsafeMutablePointer<DataType>(dataPointer),
                                      count: (pageCount * pageSize) / strideof(DataType))

for i in 0..<dataCount {
    data[i] = 200
}

这确实需要使 data 成为 UnsafeMutableBufferPointer<DataType>，而不是 [DataType]，因为 Swift 的 Array 分配了自己的后备存储。您还需要传递要操作的数据项的计数，因为可变缓冲区指针的 count 已四舍五入以使缓冲区页对齐。

要使用此数据实际创建 MTLBuffer，请使用 newBufferWithBytesNoCopy(_:length:options:deallocator:) API。同样重要的是，您提供的长度是页面大小的倍数；否则这个方法 returns nil:

let roundedUpDataSize = strideof(DataType) * data.count
let dataBuffer = device.newBufferWithBytesNoCopy(data.baseAddress, length: roundedUpDataSize, options: [], deallocator: nil)

在这里，我们不提供释放器，但您应该在使用完内存后释放内存，方法是将缓冲区指针的 baseAddress 传递给 free()。

Swift iOS 上数组的金属并行求和计算

Swift metal parallel sum calculation of array on iOS

arrays

posix-api

ios

swift

metal