Minimum matrix sizes to benefit from matrix multiplication on GPU
I'm particularly interested in matrix multiplication using Metal Performance Shaders, but answers about other frameworks are welcome too.
Matrix multiplication is, in theory, a highly parallelizable operation. I need to multiply many matrices with themselves, i.e. compute A'A (where the apostrophe denotes transposition). The matrices A are roughly 4000 x 300 in size. Given these sizes, I am wondering whether it is worth porting the multiplication code to the GPU. As I understand it, multiplying on the GPU also involves copying the data from main memory into GPU memory (I am using an eGPU, so memory is not shared). The extra work of copying data back and forth therefore has to be weighed against the faster computation. So my question is: at (roughly) what matrix sizes would I start to see a benefit from doing this on the GPU?
P.S. There is also this paper, which essentially says don't bother because GPUs don't help here, their memory caches being too slow (on GPUs in general): https://graphics.stanford.edu/papers/gpumatrixmult/gpumatrixmult.pdf
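For a rough sense of scale, here is a back-of-envelope count of the data and arithmetic involved, derived only from the 4000 x 300 float32 size above (illustrative only; it is not from the original question or answers and says nothing about actual transfer or compute speed):

let rows = 4000, cols = 300                 // approximate size of A from the question
let bytesToGPU   = rows * cols * 4          // A itself, float32: about 4.8 MB per multiply
let bytesFromGPU = cols * cols * 4          // the 300 x 300 result A'A: about 0.36 MB
let flops        = 2 * rows * cols * cols   // multiply-adds for A'A: about 720 MFLOP
print(bytesToGPU, bytesFromGPU, flops)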
I would suggest looking at the vDSP part of Apple's Accelerate framework. It has very fast SIMD functions for matrix multiplication and transposition. They have also recently added some Swift-friendly APIs.
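A minimal sketch of what that looks like for the A'A case from the question, assuming the ~4000 x 300 size and placeholder matrix contents (vDSP_mtrans builds the explicit transpose, vDSP_mmul does the multiply):

import Accelerate

let rows = 4000, cols = 300
let A  = [Float](repeating: 1, count: rows * cols)   // placeholder data for A (rows x cols)
var AT = [Float](repeating: 0, count: cols * rows)   // A' (cols x rows)
var C  = [Float](repeating: 0, count: cols * cols)   // result A'A (cols x cols)

// Transpose A into AT, then compute C = AT (cols x rows) * A (rows x cols).
vDSP_mtrans(A, 1, &AT, 1, vDSP_Length(cols), vDSP_Length(rows))
vDSP_mmul(AT, 1, A, 1, &C, 1,
          vDSP_Length(cols), vDSP_Length(cols), vDSP_Length(rows))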
I ran a test, and for my case the GPU was significantly faster (8-9x), even including all the memory copies from the CPU to the GPU and back. I am comparing float32 matrix multiplication performance, since Metal does not support float64.
import Foundation
import Accelerate
import Metal
import MetalPerformanceShaders

// Timing helper used by both benchmarks below.
func printTimeElapsedWhenRunningCode(title: String, operation: () -> Void) {
    let start = CFAbsoluteTimeGetCurrent()
    operation()
    print("Time elapsed for \(title): \(CFAbsoluteTimeGetCurrent() - start) s.")
}

// Pick a Metal device; on a multi-GPU system you may need to select the eGPU
// from MTLCopyAllDevices() instead of taking the system default.
guard let device = MTLCreateSystemDefaultDevice() else {
    fatalError("Could not make Metal device")
}
print(device.name)

let count = 100
let N = 7005
let K = 700
// Pad both dimensions up to the next multiple of DIV so the padded row stride
// (used by the Metal buffers and descriptors below) stays a multiple of DIV floats.
let DIV = 8
let K2 = (K / DIV) * DIV + (K % DIV > 0 ? 1 : 0) * DIV
let N2 = (N / DIV) * DIV + (N % DIV > 0 ? 1 : 0) * DIV
print(N2)
print(K2)
// CPU baseline: C (K x K) = A' (K x N) * A (N x K) with vDSP, repeated `count` times.
printTimeElapsedWhenRunningCode(title: "vDSP(f)") {
    let ATf = [Float](repeating: 1, count: N * K)   // A', already transposed (all ones here)
    let Af  = [Float](repeating: 1, count: N * K)   // A
    var C   = [Float](repeating: 0, count: K * K)
    for _ in 0..<count {
        vDSP_mmul(ATf, 1,
                  Af, 1,
                  &C, 1,
                  vDSP_Length(K),
                  vDSP_Length(K),
                  vDSP_Length(N))
    }
}
// Metal buffers for the padded input (N2 x K2) and result (K2 x K2).
// .storageModeManaged keeps separate CPU/GPU copies, which is what a discrete eGPU needs.
guard let bufferA = device.makeBuffer(length: K2 * N2 * MemoryLayout<Float>.stride,
                                      options: [.storageModeManaged]) else {
    fatalError("Could not make buffer A")
}
guard let bufferC = device.makeBuffer(length: K2 * K2 * MemoryLayout<Float>.stride,
                                      options: [.storageModeManaged]) else {
    fatalError("Could not make buffer C")
}
let descA = MPSMatrixDescriptor(dimensions: N2,
                                columns: K2,
                                rowBytes: K2 * MemoryLayout<Float>.stride,
                                dataType: .float32)
let descC = MPSMatrixDescriptor(dimensions: K2,
                                columns: K2,
                                rowBytes: K2 * MemoryLayout<Float>.stride,
                                dataType: .float32)
let matrixA = MPSMatrix(buffer: bufferA, descriptor: descA)
let matrixC = MPSMatrix(buffer: bufferC, descriptor: descC)
// With transposeLeft the kernel computes C = A' * A: both operands are matrixA,
// the result is K2 x K2 and the interior (summed) dimension is N2.
let matrixMultiplication = MPSMatrixMultiplication(device: device,
                                                   transposeLeft: true,
                                                   transposeRight: false,
                                                   resultRows: K2,
                                                   resultColumns: K2,
                                                   interiorColumns: N2,
                                                   alpha: 1,
                                                   beta: 0)
guard let commandQueue = device.makeCommandQueue() else {
    fatalError("Could not make command queue")
}
// GPU benchmark: copy A into the padded buffer, run A' * A on the GPU, repeated `count` times.
printTimeElapsedWhenRunningCode(title: "Metal") {
    let Af = [Float](repeating: 1, count: N * K)
    let zeros = [Float](repeating: 0, count: K2)
    let floatSize = MemoryLayout<Float>.stride
    for i in 0..<count {
        // Copy A row by row into bufferA, zero-padding each row from K to K2 columns.
        var dest = bufferA.contents()
        Af.withUnsafeBufferPointer { pA in
            var from = pA.baseAddress!
            for _ in 0..<N {
                dest.copyMemory(from: from, byteCount: K * floatSize)
                dest += K * floatSize
                if K2 > K {
                    dest.copyMemory(from: zeros, byteCount: (K2 - K) * floatSize)
                    dest += (K2 - K) * floatSize
                }
                from += K
            }
        }
        // Zero the extra padding rows from N to N2.
        for _ in 0..<(N2 - N) {
            dest.copyMemory(from: zeros, byteCount: K2 * floatSize)
            dest += K2 * floatSize
        }
        // Tell Metal the CPU side of the managed buffer changed so it gets uploaded to the GPU.
        bufferA.didModifyRange(0..<N2 * K2 * floatSize)

        let commandBuffer = commandQueue.makeCommandBuffer()!
        matrixMultiplication.encode(commandBuffer: commandBuffer,
                                    leftMatrix: matrixA,
                                    rightMatrix: matrixA,
                                    resultMatrix: matrixC)
        // Synchronize the managed result buffer so it can be read back on the CPU.
        let blitEncoder = commandBuffer.makeBlitCommandEncoder()!
        blitEncoder.synchronize(resource: bufferC)
        blitEncoder.endEncoding()
        commandBuffer.commit()
        // Only block on the last command buffer; earlier iterations are left in flight.
        if i == count - 1 {
            commandBuffer.waitUntilCompleted()
        }
    }
}
print("DONE.")
Output:
AMD Radeon RX 5700 XT
7008
704
Time elapsed for vDSP(f): 5.156805992126465 s.
Time elapsed for Metal: 0.6834449768066406 s.
DONE.