Swift、macOS、mac 使用 2 个 GPU,矩阵运算在一个 GPU 上运行,而不在另一个 GPU 上运行

Swift, macOS, mac with 2 GPUs, matrix operations work on one GPU, not on the other

我在我的 MacBook Pro(Retina,15 英寸,2015 年中)上运行 macOS,根据 Apple 菜单中的 "About this Mac",它有两个 GPU。一个 GPU 是 AMD Radeon R9 M370X 2 GB,另一个是 Intel Iris Pro 1536 MB——我猜这是标准配置吧?这些都是我购买时机器自带的芯片,我没有额外加装任何东西。

我正在使用 Swift MPS 库进行矩阵计算;它在 Intel GPU 上运行良好,但是当我选择 Radeon 时,每个操作都只返回零,而且没有任何错误报告。我四处寻找相关文档,但什么也找不到。到目前为止,我唯一的线索是 Radeon 报告为 "not integrated"(至少根据 Finding GPUs on macOS 处的示例代码我是这么理解的——那页文档和 Apple 的其他文档一样"有用",也就是说帮助不大)。如果我没看错那一页,下面就是我的两个 GPU 告诉我的信息:

Device Intel Iris Pro Graphics; caps: headful, not discrete, integrated, not external

Device AMD Radeon R9 M370X; caps: headful, discrete, not integrated, not external

我找不到任何能提示我哪里做错了的文档。我翻遍了 Apple 的 MPS 文档,但无济于事。正如我所说,这段代码在 Intel GPU 上运行良好,所以我原以为它在 Radeon 上也能运行。我还运行了一些可下载的诊断工具来检查 Radeon,但它根本没有出现在这些工具的菜单中。所以我甚至不知道这是我代码里的问题,还是芯片本身坏了。

下面是代码,您可以通过将其粘贴到 main.swift 来将其构建为控制台应用程序。找到以下行:

let device = MTLCopyAllDevices()[1]

我对 Intel 使用 [0],对 Radeon 使用 [1],你可以看到两者的输出是不同的,即 Radeon 全为零。当然,具体结果可能因你的机器而异。欢迎任何意见,干杯。

import MetalPerformanceShaders

// Element type used for all matrix data, on both the CPU and GPU sides.
typealias MPSNumber = Float32

// Size in bytes of one element, for computing byte offsets into MTLBuffers.
let MPSNumberSize = MemoryLayout<MPSNumber>.size
// The matching MPS element type used in every MPSMatrixDescriptor below.
let MPSNumberTypeInGPU = MPSDataType.float32

// Minimal MPS test case: encodes one MPSMatrixSum kernel that adds two 1x3
// matrices on the GPU selected below, then prints the device capabilities
// and the computed result from the command buffer's completion handler.
class MPSNet {
    let commandBuffer: MTLCommandBuffer
    let commandQueue: MTLCommandQueue
    // Hard-coded device index: [0] selects one GPU and [1] the other, in the
    // order returned by MTLCopyAllDevices() (Intel vs. Radeon on this machine).
    let device = MTLCopyAllDevices()[1]
    var neuronsInMatrix1: MPSMatrix?
    var neuronsInMatrix2: MPSMatrix?
    var neuronsOutMatrix: MPSMatrix?

    init() {
        guard let cq = device.makeCommandQueue() else { fatalError() }
        guard let cb = cq.makeCommandBuffer() else { fatalError() }

        commandQueue = cq
        commandBuffer = cb

        // Two 1x3 source matrices summed element-wise into one 1x3 result.
        let cMatrices = 2
        let cRows = 1
        let cColumns = 3

        let sensoryInputs1: [MPSNumber] = [1, 2, 3]
        let sensoryInputs2: [MPSNumber] = [4, 5, 6]

        neuronsInMatrix1 = makeMatrix(device, sensoryInputs1)
        neuronsInMatrix2 = makeMatrix(device, sensoryInputs2)

        // Let MPS choose the properly aligned row stride for the output.
        let rowStride = MPSMatrixDescriptor.rowBytes(fromColumns: cColumns, dataType: MPSNumberTypeInGPU)
        neuronsOutMatrix = makeMatrix(device, cRows, cColumnsOut: cColumns, rowStride: rowStride)

        // Kernel that sums `count` source matrices into the result matrix.
        let adder = MPSMatrixSum(
            device: device, count: cMatrices, rows: cRows, columns: cColumns, transpose: false
        )

        adder.encode(
            to: commandBuffer,
            sourceMatrices: [neuronsInMatrix1!, neuronsInMatrix2!],
            resultMatrix: neuronsOutMatrix!, scale: nil, offsetVector: nil,
            biasVector: nil, start: 0
        )

        // Reads the result back on the CPU once the GPU work completes.
        // NOTE(review): the matrices use managed storage (see makeMatrix) and
        // no synchronize(on:)/didModifyRange(_:) calls are encoded, so on a
        // discrete GPU this CPU-side read can observe stale (all-zero) data —
        // this is the bug discussed in the answers below.
        commandBuffer.addCompletedHandler { _ in
            let motorOutputs = self.getComputeOutput(self.neuronsOutMatrix!)

            // "discrete" is inferred as: neither low-power nor removable.
            let discrete = !self.device.isLowPower && !self.device.isRemovable
            let caps = "\(self.device.isHeadless ? " headless" : " headful")" +
                       "\(discrete ? ", discrete" : ", not discrete")" +
                       "\(self.device.isLowPower ? ", integrated" : ", not integrated")" +
                       "\(self.device.isRemovable ? ", external" : ", not external")"

            print("Device \(self.device.name); caps:\(caps); motor outputs \(motorOutputs)")
        }
    }

    // Submits the encoded work and blocks until the GPU has finished, which
    // also guarantees the completion handler above has run before returning.
    func compute() {
        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()
    }
}

extension MPSNet {
    /// Copies the first row of `matrix` out of its backing MTLBuffer,
    /// widening each Float32 element to Double for printing.
    /// Reads only `columns` elements, so it assumes a single-row matrix.
    func getComputeOutput(_ matrix: MPSMatrix) -> [Double] {
        let rc = matrix.data.contents()
        return stride(from: 0, to: matrix.columns * MPSNumberSize, by: MPSNumberSize).map {
            offset in

            let rr = rc.load(fromByteOffset: offset, as: MPSNumber.self)

            return Double(rr)
        }
    }

    /// Writes `rawValues` element-by-element into the start of `data`
    /// (CPU-side write into the buffer's contents pointer).
    func loadMatrix(_ data: MTLBuffer, _ rawValues: [MPSNumber]) {
        let dContents = data.contents()

        zip(stride(from: 0, to: rawValues.count * MPSNumberSize, by: MPSNumberSize), rawValues).forEach { z in
            let (byteOffset, rawValue) = (z.0, MPSNumber(z.1))

            dContents.storeBytes(of: rawValue, toByteOffset: byteOffset, as: MPSNumber.self)
        }
    }

    /// Builds a 1-row input matrix initialized with `rawValues`.
    /// Note the buffer is created with managed storage, which requires
    /// explicit CPU/GPU synchronization on discrete devices.
    func makeMatrix(_ device: MTLDevice, _ rawValues: [MPSNumber]) -> MPSMatrix {
        let rowStride = MPSMatrixDescriptor.rowBytes(
            fromColumns: rawValues.count, dataType: MPSNumberTypeInGPU
        )

        let descriptor = MPSMatrixDescriptor(
            dimensions: 1, columns: rawValues.count, rowBytes: rowStride,
            dataType: MPSNumberTypeInGPU
        )

        guard let inputBuffer = device.makeBuffer(
            length: descriptor.matrixBytes, options: MTLResourceOptions.storageModeManaged
        ) else { fatalError() }

        loadMatrix(inputBuffer, rawValues)

        return MPSMatrix(buffer: inputBuffer, descriptor: descriptor)
    }

    /// Builds an uninitialized output matrix of the given dimensions,
    /// letting MPSMatrix allocate its own buffer on `device`.
    func makeMatrix(_ device: MTLDevice, _ cRowsOut: Int, cColumnsOut: Int, rowStride: Int) -> MPSMatrix {
        let matrixDescriptor = MPSMatrixDescriptor(
            dimensions: cRowsOut, columns: cColumnsOut,
            rowBytes: rowStride, dataType: MPSNumberTypeInGPU
        )

        return MPSMatrix(device: device, descriptor: matrixDescriptor)
    }
}

// Entry point: construct the net (encodes the GPU work) and run it.
let net = MPSNet()
net.compute()

您似乎没有调用 -[MPSMatrix synchronizeOnCommandBuffer:]。在独立(discrete)设备上,在把数据从 GPU 读回之前,需要进行一些显式的同步。

问题出在你的矩阵缓冲区的存储模式上。您正在使用 MTLResourceOptions.storageModeManaged,它告诉 Metal 您想要管理 CPU 和 GPU 之间共享的内存的同步。正如此处另一个答案中所述,在尝试使用 CPU 读取数据之前,必须在 GPU 操作之后使用 MPSMatrix.synchronize(on: MTLCommandBuffer)。但是您还必须在另一个方向同步,即在 CPU 操作之后,在您将命令提交给 GPU 之前,使用 MTLBuffer.didModifyRange(_: Range).

或者,您可以使用共享存储模式,MTLResourceOptions.storageModeShared,它会为您处理同步。

有关详细信息,请参阅 Apple 文档中的 Synchronizing a Managed Resource

下面是您的示例的可工作版本,它沿用了您原来的托管(managed)存储模式。注意函数 MPSNet.compute() 中的差异。如果您的应用程序可以使用共享存储模式,那么可以省去这些同步调用,只需在为矩阵创建 MTLBuffer 时更改存储模式即可。

import MetalPerformanceShaders

// Element type used for all matrix data, on both the CPU and GPU sides.
typealias MPSNumber = Float32

// Size in bytes of one element, for computing byte offsets into MTLBuffers.
let MPSNumberSize = MemoryLayout<MPSNumber>.size
// The matching MPS element type used in every MPSMatrixDescriptor below.
let MPSNumberTypeInGPU = MPSDataType.float32

// Fixed version of the example: same MPSMatrixSum setup, but compute() now
// performs the CPU/GPU synchronization that managed storage requires on a
// discrete GPU, and waits for the command buffer so the completion handler
// runs before the process exits.
class MPSNet {
    let commandBuffer: MTLCommandBuffer
    let commandQueue: MTLCommandQueue
    // Hard-coded device index: [0]/[1] select between the GPUs returned by
    // MTLCopyAllDevices() (Intel vs. Radeon on this machine).
    let device = MTLCopyAllDevices()[1]
    var neuronsInMatrix1: MPSMatrix?
    var neuronsInMatrix2: MPSMatrix?
    var neuronsOutMatrix: MPSMatrix?

    init() {
        guard let cq = device.makeCommandQueue() else { fatalError() }
        guard let cb = cq.makeCommandBuffer() else { fatalError() }

        commandQueue = cq
        commandBuffer = cb

        // Two 1x3 source matrices summed element-wise into one 1x3 result.
        let cMatrices = 2
        let cRows = 1
        let cColumns = 3

        let sensoryInputs1: [MPSNumber] = [1, 2, 3]
        let sensoryInputs2: [MPSNumber] = [4, 5, 6]

        neuronsInMatrix1 = makeMatrix(device, sensoryInputs1)
        neuronsInMatrix2 = makeMatrix(device, sensoryInputs2)

        // Let MPS choose the properly aligned row stride for the output.
        let rowStride = MPSMatrixDescriptor.rowBytes(fromColumns: cColumns, dataType: MPSNumberTypeInGPU)
        neuronsOutMatrix = makeMatrix(device, cRows, cColumnsOut: cColumns, rowStride: rowStride)

        // Kernel that sums `count` source matrices into the result matrix.
        let adder = MPSMatrixSum(
            device: device, count: cMatrices, rows: cRows, columns: cColumns, transpose: false
        )

        adder.encode(
            to: commandBuffer,
            sourceMatrices: [neuronsInMatrix1!, neuronsInMatrix2!],
            resultMatrix: neuronsOutMatrix!, scale: nil, offsetVector: nil,
            biasVector: nil, start: 0
        )

        // Reads the result back on the CPU once the GPU work (including the
        // synchronization blits encoded in compute()) has completed.
        commandBuffer.addCompletedHandler { _ in
            let motorOutputs = self.getComputeOutput(self.neuronsOutMatrix!)

            // "discrete" is inferred as: neither low-power nor removable.
            let discrete = !self.device.isLowPower && !self.device.isRemovable
            let caps = "\(self.device.isHeadless ? " headless" : " headful")" +
                       "\(discrete ? ", discrete" : ", not discrete")" +
                       "\(self.device.isLowPower ? ", integrated" : ", not integrated")" +
                       "\(self.device.isRemovable ? ", external" : ", not external")"

            print("Device \(self.device.name); caps:\(caps); motor outputs \(motorOutputs)")
        }
    }

    /// Flushes the CPU-side writes to the managed buffers, encodes the
    /// GPU-to-CPU synchronization blits, then submits the command buffer and
    /// blocks until the GPU has finished.
    func compute() {
        for matrix in [neuronsInMatrix1!, neuronsInMatrix2!, neuronsOutMatrix!] {
            let matrixData = matrix.data

            // CPU -> GPU: tell Metal which byte range the CPU mutated so the
            // managed buffer's GPU copy is updated before the kernel runs.
            matrixData.didModifyRange(0..<matrixData.length)

            // GPU -> CPU: encode a blit (after the MPSMatrixSum encoded in
            // init) that copies the GPU's contents back to CPU-visible memory.
            matrix.synchronize(on: commandBuffer)
        }

        commandBuffer.commit()

        // Bug fix: wait for completion. Without this, a console app can exit
        // before the GPU finishes and the completed handler never prints.
        commandBuffer.waitUntilCompleted()
    }
}

extension MPSNet {
    /// Copies the first row of `matrix` out of its backing MTLBuffer,
    /// widening each Float32 element to Double for printing.
    /// Reads only `columns` elements, so it assumes a single-row matrix.
    func getComputeOutput(_ matrix: MPSMatrix) -> [Double] {
        let rc = matrix.data.contents()
        return stride(from: 0, to: matrix.columns * MPSNumberSize, by: MPSNumberSize).map {
            offset in

            let rr = rc.load(fromByteOffset: offset, as: MPSNumber.self)

            return Double(rr)
        }
    }

    /// Writes `rawValues` element-by-element into the start of `data`
    /// (CPU-side write; compute() later flushes it with didModifyRange).
    func loadMatrix(_ data: MTLBuffer, _ rawValues: [MPSNumber]) {
        let dContents = data.contents()

        zip(stride(from: 0, to: rawValues.count * MPSNumberSize, by: MPSNumberSize), rawValues).forEach { z in
            let (byteOffset, rawValue) = (z.0, MPSNumber(z.1))

            dContents.storeBytes(of: rawValue, toByteOffset: byteOffset, as: MPSNumber.self)
        }
    }

    /// Builds a 1-row input matrix initialized with `rawValues`, backed by a
    /// managed-storage buffer (hence the explicit synchronization elsewhere).
    func makeMatrix(_ device: MTLDevice, _ rawValues: [MPSNumber]) -> MPSMatrix {
        let rowStride = MPSMatrixDescriptor.rowBytes(
            fromColumns: rawValues.count, dataType: MPSNumberTypeInGPU
        )

        let descriptor = MPSMatrixDescriptor(
            dimensions: 1, columns: rawValues.count, rowBytes: rowStride,
            dataType: MPSNumberTypeInGPU
        )

        guard let inputBuffer = device.makeBuffer(
            length: descriptor.matrixBytes, options: MTLResourceOptions.storageModeManaged
        ) else { fatalError() }

        loadMatrix(inputBuffer, rawValues)

        return MPSMatrix(buffer: inputBuffer, descriptor: descriptor)
    }

    /// Builds an uninitialized output matrix of the given dimensions,
    /// letting MPSMatrix allocate its own buffer on `device`.
    func makeMatrix(_ device: MTLDevice, _ cRowsOut: Int, cColumnsOut: Int, rowStride: Int) -> MPSMatrix {
        let matrixDescriptor = MPSMatrixDescriptor(
            dimensions: cRowsOut, columns: cColumnsOut,
            rowBytes: rowStride, dataType: MPSNumberTypeInGPU
        )

        return MPSMatrix(device: device, descriptor: matrixDescriptor)
    }
}

// Entry point: construct the net (encodes the GPU work) and run it.
let net = MPSNet()
net.compute()