Swift、macOS、mac 使用 2 个 GPU,矩阵运算在一个 GPU 上运行,而不在另一个 GPU 上运行
Swift, macOS, mac with 2 GPUs, matrix operations work on one GPU, not on the other
我在我的 MacBook Pro(Retina,15 英寸,2015 年中)上运行 macOS,根据 Apple 菜单中的 "About this Mac",它有两个 GPU。一个 GPU 是 AMD Radeon R9 M370X 2 GB,另一个是 Intel Iris Pro 1536 MB——我猜是标准配置吧?这是我买的时候机器里自带的芯片,我自己什么都没有加装。
我正在使用 Swift MPS 库进行矩阵计算;它在 Intel GPU 上运行良好,但是当我 select Radeon 时,我只能从每个操作中返回零,没有错误报告。我四处寻找有关它的文档,但找不到任何东西。到目前为止,我唯一的线索是 Radeon 报告 "not integrated"(或者至少,我认为它是,基于 Finding GPUs on macOS 处的示例代码,这与 Apple 的文档一样有用,意思是不太)。如果我没看错那一页,这就是我的两个 GPU 告诉我的。
Device Intel Iris Pro Graphics; caps: headful, not discrete, integrated, not external
Device AMD Radeon R9 M370X; caps: headful, discrete, not integrated, not external
我找不到任何可以提示我做错了什么的文档。我遍历了 Apple 的 MPS 文档,但无济于事。正如我所说,该代码在 Intel GPU 上运行良好,所以我认为它在 Radeon 上也能运行。我运行过一些可下载的诊断工具来检查 Radeon,但它没有出现在这些工具的菜单中。所以我甚至不知道这是我在代码中做错了什么,还是芯片本身坏了。
下面是代码,您可以通过将其粘贴到 main.swift
来将其构建为控制台应用程序。找到以下行:
let device = MTLCopyAllDevices()[1]
我对Intel使用[0]
,对Radeon使用[1]
,你可以看到输出是不同的,即Radeon全为零。我想你的里程可能会因你的机器而异。欢迎任何意见,干杯
import MetalPerformanceShaders
typealias MPSNumber = Float32
let MPSNumberSize = MemoryLayout<MPSNumber>.size
let MPSNumberTypeInGPU = MPSDataType.float32
class MPSNet {
let commandBuffer: MTLCommandBuffer
let commandQueue: MTLCommandQueue
let device = MTLCopyAllDevices()[1]
var neuronsInMatrix1: MPSMatrix?
var neuronsInMatrix2: MPSMatrix?
var neuronsOutMatrix: MPSMatrix?
init() {
guard let cq = device.makeCommandQueue() else { fatalError() }
guard let cb = cq.makeCommandBuffer() else { fatalError() }
commandQueue = cq
commandBuffer = cb
let cMatrices = 2
let cRows = 1
let cColumns = 3
let sensoryInputs1: [MPSNumber] = [1, 2, 3]
let sensoryInputs2: [MPSNumber] = [4, 5, 6]
neuronsInMatrix1 = makeMatrix(device, sensoryInputs1)
neuronsInMatrix2 = makeMatrix(device, sensoryInputs2)
let rowStride = MPSMatrixDescriptor.rowBytes(fromColumns: cColumns, dataType: MPSNumberTypeInGPU)
neuronsOutMatrix = makeMatrix(device, cRows, cColumnsOut: cColumns, rowStride: rowStride)
let adder = MPSMatrixSum(
device: device, count: cMatrices, rows: cRows, columns: cColumns, transpose: false
)
adder.encode(
to: commandBuffer,
sourceMatrices: [neuronsInMatrix1!, neuronsInMatrix2!],
resultMatrix: neuronsOutMatrix!, scale: nil, offsetVector: nil,
biasVector: nil, start: 0
)
commandBuffer.addCompletedHandler { _ in
let motorOutputs = self.getComputeOutput(self.neuronsOutMatrix!)
let discrete = !self.device.isLowPower && !self.device.isRemovable
let caps = "\(self.device.isHeadless ? " headless" : " headful")" +
"\(discrete ? ", discrete" : ", not discrete")" +
"\(self.device.isLowPower ? ", integrated" : ", not integrated")" +
"\(self.device.isRemovable ? ", external" : ", not external")"
print("Device \(self.device.name); caps:\(caps); motor outputs \(motorOutputs)")
}
}
func compute() {
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
}
}
extension MPSNet {
func getComputeOutput(_ matrix: MPSMatrix) -> [Double] {
let rc = matrix.data.contents()
return stride(from: 0, to: matrix.columns * MPSNumberSize, by: MPSNumberSize).map {
offset in
let rr = rc.load(fromByteOffset: offset, as: MPSNumber.self)
return Double(rr)
}
}
func loadMatrix(_ data: MTLBuffer, _ rawValues: [MPSNumber]) {
let dContents = data.contents()
zip(stride(from: 0, to: rawValues.count * MPSNumberSize, by: MPSNumberSize), rawValues).forEach { z in
let (byteOffset, rawValue) = (z.0, MPSNumber(z.1))
dContents.storeBytes(of: rawValue, toByteOffset: byteOffset, as: MPSNumber.self)
}
}
func makeMatrix(_ device: MTLDevice, _ rawValues: [MPSNumber]) -> MPSMatrix {
let rowStride = MPSMatrixDescriptor.rowBytes(
fromColumns: rawValues.count, dataType: MPSNumberTypeInGPU
)
let descriptor = MPSMatrixDescriptor(
dimensions: 1, columns: rawValues.count, rowBytes: rowStride,
dataType: MPSNumberTypeInGPU
)
guard let inputBuffer = device.makeBuffer(
length: descriptor.matrixBytes, options: MTLResourceOptions.storageModeManaged
) else { fatalError() }
loadMatrix(inputBuffer, rawValues)
return MPSMatrix(buffer: inputBuffer, descriptor: descriptor)
}
func makeMatrix(_ device: MTLDevice, _ cRowsOut: Int, cColumnsOut: Int, rowStride: Int) -> MPSMatrix {
let matrixDescriptor = MPSMatrixDescriptor(
dimensions: cRowsOut, columns: cColumnsOut,
rowBytes: rowStride, dataType: MPSNumberTypeInGPU
)
return MPSMatrix(device: device, descriptor: matrixDescriptor)
}
}
let net = MPSNet()
net.compute()
您似乎未能使用-[MPSMatrix synchronizeOnCommandBuffer:]。在离散设备上,在数据从 GPU 返回之前需要进行一些显式同步。
问题出在你的矩阵缓冲区的存储模式上。您正在使用 MTLResourceOptions.storageModeManaged
,它告诉 Metal 您想要管理 CPU 和 GPU 之间共享的内存的同步。正如此处另一个答案中所述,在尝试使用 CPU 读取数据之前,必须在 GPU 操作之后使用 MPSMatrix.synchronize(on: MTLCommandBuffer)
。但是您还必须在另一个方向同步,即在 CPU 操作之后,在您将命令提交给 GPU 之前,使用 MTLBuffer.didModifyRange(_: Range)
.
或者,您可以使用共享存储模式,MTLResourceOptions.storageModeShared
,它会为您处理同步。
有关详细信息,请参阅 Apple 文档中的 Synchronizing a Managed Resource。
下面是您的示例的工作版本,它使用您所拥有的托管存储模式。注意函数 MPSNet.compute()
的差异。如果您的应用程序可以使用共享存储模式,则可以在为矩阵创建 MTLBuffer
时将这些东西放在一边,只更改存储模式。
import MetalPerformanceShaders
typealias MPSNumber = Float32
let MPSNumberSize = MemoryLayout<MPSNumber>.size
let MPSNumberTypeInGPU = MPSDataType.float32
class MPSNet {
let commandBuffer: MTLCommandBuffer
let commandQueue: MTLCommandQueue
let device = MTLCopyAllDevices()[1]
var neuronsInMatrix1: MPSMatrix?
var neuronsInMatrix2: MPSMatrix?
var neuronsOutMatrix: MPSMatrix?
init() {
guard let cq = device.makeCommandQueue() else { fatalError() }
guard let cb = cq.makeCommandBuffer() else { fatalError() }
commandQueue = cq
commandBuffer = cb
let cMatrices = 2
let cRows = 1
let cColumns = 3
let sensoryInputs1: [MPSNumber] = [1, 2, 3]
let sensoryInputs2: [MPSNumber] = [4, 5, 6]
neuronsInMatrix1 = makeMatrix(device, sensoryInputs1)
neuronsInMatrix2 = makeMatrix(device, sensoryInputs2)
let rowStride = MPSMatrixDescriptor.rowBytes(fromColumns: cColumns, dataType: MPSNumberTypeInGPU)
neuronsOutMatrix = makeMatrix(device, cRows, cColumnsOut: cColumns, rowStride: rowStride)
let adder = MPSMatrixSum(
device: device, count: cMatrices, rows: cRows, columns: cColumns, transpose: false
)
adder.encode(
to: commandBuffer,
sourceMatrices: [neuronsInMatrix1!, neuronsInMatrix2!],
resultMatrix: neuronsOutMatrix!, scale: nil, offsetVector: nil,
biasVector: nil, start: 0
)
commandBuffer.addCompletedHandler { _ in
let motorOutputs = self.getComputeOutput(self.neuronsOutMatrix!)
let discrete = !self.device.isLowPower && !self.device.isRemovable
let caps = "\(self.device.isHeadless ? " headless" : " headful")" +
"\(discrete ? ", discrete" : ", not discrete")" +
"\(self.device.isLowPower ? ", integrated" : ", not integrated")" +
"\(self.device.isRemovable ? ", external" : ", not external")"
print("Device \(self.device.name); caps:\(caps); motor outputs \(motorOutputs)")
}
}
func compute() {
for matrix in [neuronsInMatrix1!, neuronsInMatrix2!, neuronsOutMatrix!] {
let matrixData = matrix.data
matrixData.didModifyRange(0..<matrixData.length)
matrix.synchronize(on: commandBuffer)
}
commandBuffer.commit()
}
}
extension MPSNet {
func getComputeOutput(_ matrix: MPSMatrix) -> [Double] {
let rc = matrix.data.contents()
return stride(from: 0, to: matrix.columns * MPSNumberSize, by: MPSNumberSize).map {
offset in
let rr = rc.load(fromByteOffset: offset, as: MPSNumber.self)
return Double(rr)
}
}
func loadMatrix(_ data: MTLBuffer, _ rawValues: [MPSNumber]) {
let dContents = data.contents()
zip(stride(from: 0, to: rawValues.count * MPSNumberSize, by: MPSNumberSize), rawValues).forEach { z in
let (byteOffset, rawValue) = (z.0, MPSNumber(z.1))
dContents.storeBytes(of: rawValue, toByteOffset: byteOffset, as: MPSNumber.self)
}
}
func makeMatrix(_ device: MTLDevice, _ rawValues: [MPSNumber]) -> MPSMatrix {
let rowStride = MPSMatrixDescriptor.rowBytes(
fromColumns: rawValues.count, dataType: MPSNumberTypeInGPU
)
let descriptor = MPSMatrixDescriptor(
dimensions: 1, columns: rawValues.count, rowBytes: rowStride,
dataType: MPSNumberTypeInGPU
)
guard let inputBuffer = device.makeBuffer(
length: descriptor.matrixBytes, options: MTLResourceOptions.storageModeManaged
) else { fatalError() }
loadMatrix(inputBuffer, rawValues)
return MPSMatrix(buffer: inputBuffer, descriptor: descriptor)
}
func makeMatrix(_ device: MTLDevice, _ cRowsOut: Int, cColumnsOut: Int, rowStride: Int) -> MPSMatrix {
let matrixDescriptor = MPSMatrixDescriptor(
dimensions: cRowsOut, columns: cColumnsOut,
rowBytes: rowStride, dataType: MPSNumberTypeInGPU
)
return MPSMatrix(device: device, descriptor: matrixDescriptor)
}
}
let net = MPSNet()
net.compute()
我在我的 MacBook Pro(Retina,15 英寸,2015 年中)上运行 macOS,根据 Apple 菜单中的 "About this Mac",它有两个 GPU。一个 GPU 是 AMD Radeon R9 M370X 2 GB,另一个是 Intel Iris Pro 1536 MB——我猜是标准配置吧?这是我买的时候机器里自带的芯片,我自己什么都没有加装。
我正在使用 Swift MPS 库进行矩阵计算;它在 Intel GPU 上运行良好,但是当我 select Radeon 时,我只能从每个操作中返回零,没有错误报告。我四处寻找有关它的文档,但找不到任何东西。到目前为止,我唯一的线索是 Radeon 报告 "not integrated"(或者至少,我认为它是,基于 Finding GPUs on macOS 处的示例代码,这与 Apple 的文档一样有用,意思是不太)。如果我没看错那一页,这就是我的两个 GPU 告诉我的。
Device Intel Iris Pro Graphics; caps: headful, not discrete, integrated, not external
Device AMD Radeon R9 M370X; caps: headful, discrete, not integrated, not external
我找不到任何可以提示我做错了什么的文档。我遍历了 Apple 的 MPS 文档,但无济于事。正如我所说,该代码在 Intel GPU 上运行良好,所以我认为它在 Radeon 上也能运行。我运行过一些可下载的诊断工具来检查 Radeon,但它没有出现在这些工具的菜单中。所以我甚至不知道这是我在代码中做错了什么,还是芯片本身坏了。
下面是代码,您可以通过将其粘贴到 main.swift
来将其构建为控制台应用程序。找到以下行:
let device = MTLCopyAllDevices()[1]
我对Intel使用[0]
,对Radeon使用[1]
,你可以看到输出是不同的,即Radeon全为零。我想你的里程可能会因你的机器而异。欢迎任何意见,干杯
import MetalPerformanceShaders
typealias MPSNumber = Float32
let MPSNumberSize = MemoryLayout<MPSNumber>.size
let MPSNumberTypeInGPU = MPSDataType.float32
class MPSNet {
let commandBuffer: MTLCommandBuffer
let commandQueue: MTLCommandQueue
let device = MTLCopyAllDevices()[1]
var neuronsInMatrix1: MPSMatrix?
var neuronsInMatrix2: MPSMatrix?
var neuronsOutMatrix: MPSMatrix?
init() {
guard let cq = device.makeCommandQueue() else { fatalError() }
guard let cb = cq.makeCommandBuffer() else { fatalError() }
commandQueue = cq
commandBuffer = cb
let cMatrices = 2
let cRows = 1
let cColumns = 3
let sensoryInputs1: [MPSNumber] = [1, 2, 3]
let sensoryInputs2: [MPSNumber] = [4, 5, 6]
neuronsInMatrix1 = makeMatrix(device, sensoryInputs1)
neuronsInMatrix2 = makeMatrix(device, sensoryInputs2)
let rowStride = MPSMatrixDescriptor.rowBytes(fromColumns: cColumns, dataType: MPSNumberTypeInGPU)
neuronsOutMatrix = makeMatrix(device, cRows, cColumnsOut: cColumns, rowStride: rowStride)
let adder = MPSMatrixSum(
device: device, count: cMatrices, rows: cRows, columns: cColumns, transpose: false
)
adder.encode(
to: commandBuffer,
sourceMatrices: [neuronsInMatrix1!, neuronsInMatrix2!],
resultMatrix: neuronsOutMatrix!, scale: nil, offsetVector: nil,
biasVector: nil, start: 0
)
commandBuffer.addCompletedHandler { _ in
let motorOutputs = self.getComputeOutput(self.neuronsOutMatrix!)
let discrete = !self.device.isLowPower && !self.device.isRemovable
let caps = "\(self.device.isHeadless ? " headless" : " headful")" +
"\(discrete ? ", discrete" : ", not discrete")" +
"\(self.device.isLowPower ? ", integrated" : ", not integrated")" +
"\(self.device.isRemovable ? ", external" : ", not external")"
print("Device \(self.device.name); caps:\(caps); motor outputs \(motorOutputs)")
}
}
func compute() {
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
}
}
extension MPSNet {
func getComputeOutput(_ matrix: MPSMatrix) -> [Double] {
let rc = matrix.data.contents()
return stride(from: 0, to: matrix.columns * MPSNumberSize, by: MPSNumberSize).map {
offset in
let rr = rc.load(fromByteOffset: offset, as: MPSNumber.self)
return Double(rr)
}
}
func loadMatrix(_ data: MTLBuffer, _ rawValues: [MPSNumber]) {
let dContents = data.contents()
zip(stride(from: 0, to: rawValues.count * MPSNumberSize, by: MPSNumberSize), rawValues).forEach { z in
let (byteOffset, rawValue) = (z.0, MPSNumber(z.1))
dContents.storeBytes(of: rawValue, toByteOffset: byteOffset, as: MPSNumber.self)
}
}
func makeMatrix(_ device: MTLDevice, _ rawValues: [MPSNumber]) -> MPSMatrix {
let rowStride = MPSMatrixDescriptor.rowBytes(
fromColumns: rawValues.count, dataType: MPSNumberTypeInGPU
)
let descriptor = MPSMatrixDescriptor(
dimensions: 1, columns: rawValues.count, rowBytes: rowStride,
dataType: MPSNumberTypeInGPU
)
guard let inputBuffer = device.makeBuffer(
length: descriptor.matrixBytes, options: MTLResourceOptions.storageModeManaged
) else { fatalError() }
loadMatrix(inputBuffer, rawValues)
return MPSMatrix(buffer: inputBuffer, descriptor: descriptor)
}
func makeMatrix(_ device: MTLDevice, _ cRowsOut: Int, cColumnsOut: Int, rowStride: Int) -> MPSMatrix {
let matrixDescriptor = MPSMatrixDescriptor(
dimensions: cRowsOut, columns: cColumnsOut,
rowBytes: rowStride, dataType: MPSNumberTypeInGPU
)
return MPSMatrix(device: device, descriptor: matrixDescriptor)
}
}
let net = MPSNet()
net.compute()
您似乎未能使用-[MPSMatrix synchronizeOnCommandBuffer:]。在离散设备上,在数据从 GPU 返回之前需要进行一些显式同步。
问题出在你的矩阵缓冲区的存储模式上。您正在使用 MTLResourceOptions.storageModeManaged
,它告诉 Metal 您想要管理 CPU 和 GPU 之间共享的内存的同步。正如此处另一个答案中所述,在尝试使用 CPU 读取数据之前,必须在 GPU 操作之后使用 MPSMatrix.synchronize(on: MTLCommandBuffer)
。但是您还必须在另一个方向同步,即在 CPU 操作之后,在您将命令提交给 GPU 之前,使用 MTLBuffer.didModifyRange(_: Range)
.
或者,您可以使用共享存储模式,MTLResourceOptions.storageModeShared
,它会为您处理同步。
有关详细信息,请参阅 Apple 文档中的 Synchronizing a Managed Resource。
下面是您的示例的工作版本,它使用您所拥有的托管存储模式。注意函数 MPSNet.compute()
的差异。如果您的应用程序可以使用共享存储模式,则可以在为矩阵创建 MTLBuffer
时将这些东西放在一边,只更改存储模式。
import MetalPerformanceShaders
typealias MPSNumber = Float32
let MPSNumberSize = MemoryLayout<MPSNumber>.size
let MPSNumberTypeInGPU = MPSDataType.float32
class MPSNet {
let commandBuffer: MTLCommandBuffer
let commandQueue: MTLCommandQueue
let device = MTLCopyAllDevices()[1]
var neuronsInMatrix1: MPSMatrix?
var neuronsInMatrix2: MPSMatrix?
var neuronsOutMatrix: MPSMatrix?
init() {
guard let cq = device.makeCommandQueue() else { fatalError() }
guard let cb = cq.makeCommandBuffer() else { fatalError() }
commandQueue = cq
commandBuffer = cb
let cMatrices = 2
let cRows = 1
let cColumns = 3
let sensoryInputs1: [MPSNumber] = [1, 2, 3]
let sensoryInputs2: [MPSNumber] = [4, 5, 6]
neuronsInMatrix1 = makeMatrix(device, sensoryInputs1)
neuronsInMatrix2 = makeMatrix(device, sensoryInputs2)
let rowStride = MPSMatrixDescriptor.rowBytes(fromColumns: cColumns, dataType: MPSNumberTypeInGPU)
neuronsOutMatrix = makeMatrix(device, cRows, cColumnsOut: cColumns, rowStride: rowStride)
let adder = MPSMatrixSum(
device: device, count: cMatrices, rows: cRows, columns: cColumns, transpose: false
)
adder.encode(
to: commandBuffer,
sourceMatrices: [neuronsInMatrix1!, neuronsInMatrix2!],
resultMatrix: neuronsOutMatrix!, scale: nil, offsetVector: nil,
biasVector: nil, start: 0
)
commandBuffer.addCompletedHandler { _ in
let motorOutputs = self.getComputeOutput(self.neuronsOutMatrix!)
let discrete = !self.device.isLowPower && !self.device.isRemovable
let caps = "\(self.device.isHeadless ? " headless" : " headful")" +
"\(discrete ? ", discrete" : ", not discrete")" +
"\(self.device.isLowPower ? ", integrated" : ", not integrated")" +
"\(self.device.isRemovable ? ", external" : ", not external")"
print("Device \(self.device.name); caps:\(caps); motor outputs \(motorOutputs)")
}
}
func compute() {
for matrix in [neuronsInMatrix1!, neuronsInMatrix2!, neuronsOutMatrix!] {
let matrixData = matrix.data
matrixData.didModifyRange(0..<matrixData.length)
matrix.synchronize(on: commandBuffer)
}
commandBuffer.commit()
}
}
extension MPSNet {
func getComputeOutput(_ matrix: MPSMatrix) -> [Double] {
let rc = matrix.data.contents()
return stride(from: 0, to: matrix.columns * MPSNumberSize, by: MPSNumberSize).map {
offset in
let rr = rc.load(fromByteOffset: offset, as: MPSNumber.self)
return Double(rr)
}
}
func loadMatrix(_ data: MTLBuffer, _ rawValues: [MPSNumber]) {
let dContents = data.contents()
zip(stride(from: 0, to: rawValues.count * MPSNumberSize, by: MPSNumberSize), rawValues).forEach { z in
let (byteOffset, rawValue) = (z.0, MPSNumber(z.1))
dContents.storeBytes(of: rawValue, toByteOffset: byteOffset, as: MPSNumber.self)
}
}
func makeMatrix(_ device: MTLDevice, _ rawValues: [MPSNumber]) -> MPSMatrix {
let rowStride = MPSMatrixDescriptor.rowBytes(
fromColumns: rawValues.count, dataType: MPSNumberTypeInGPU
)
let descriptor = MPSMatrixDescriptor(
dimensions: 1, columns: rawValues.count, rowBytes: rowStride,
dataType: MPSNumberTypeInGPU
)
guard let inputBuffer = device.makeBuffer(
length: descriptor.matrixBytes, options: MTLResourceOptions.storageModeManaged
) else { fatalError() }
loadMatrix(inputBuffer, rawValues)
return MPSMatrix(buffer: inputBuffer, descriptor: descriptor)
}
func makeMatrix(_ device: MTLDevice, _ cRowsOut: Int, cColumnsOut: Int, rowStride: Int) -> MPSMatrix {
let matrixDescriptor = MPSMatrixDescriptor(
dimensions: cRowsOut, columns: cColumnsOut,
rowBytes: rowStride, dataType: MPSNumberTypeInGPU
)
return MPSMatrix(device: device, descriptor: matrixDescriptor)
}
}
let net = MPSNet()
net.compute()