CUDA out of memory error when doing matrix multiplication using Numba

I need to multiply a matrix by its transpose, and I am running out of GPU memory with the error message numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY.

I expect the matrix to be on the order of 10k rows by 100k columns, so multiplying it by its transpose will give a square matrix of 10k rows by 10k columns. The matrix only contains 0s and 1s.
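
A rough back-of-the-envelope sizing check (illustrative numbers only; it assumes np.random.randint's default int64 dtype and cuda.device_array's default float64 dtype):

rows, cols = 10_000, 100_000
a_bytes = rows * cols * 8    # A as int64: ~8 GB copied to the device
c_bytes = rows * rows * 8    # C as float64: ~0.8 GB more
print(a_bytes / 1e9, c_bytes / 1e9)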

Here is the script I am running.

from numba import cuda, uint16
import numba
import numpy
import math
import time

TPB = 16

@cuda.jit()
def matmul_shared_mem(A, B, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint16)
    sB = cuda.shared.array((TPB, TPB), dtype=uint16)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] and y >= C.shape[1]:
        return
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()
    C[x, y] = tmp

A = numpy.random.randint(2, size=(TPB * 625, 50000))

B = A.transpose()

C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))

threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

start_gpu_shared_memory = time.time()
matmul_shared_mem[blocks_per_grid, threads_per_block](A, B, C_shared_mem)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))

Update 1:

Following your suggestions I made some changes, but I am still running out of memory.

import numpy as np
import numba as nb
colm = int(200000/8)
rows = 100000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
A = np.empty((rows,colm), dtype=np.uint8)

@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def compute(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

compute(A, AU)

from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time

TPB = 8
TPB1 = 9

@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    if bx >= by:
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp

C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))

Any idea how I can fix this?

The following method should reduce the amount of device memory needed to compute A x AT. We will use the following ideas:

  • Since the input array (A) only takes on the values 0 and 1, we will reduce the storage for that array to the minimum convenient size, int8, i.e. one byte per element.
  • Since the B array is just the transpose of the A array, there is no need to handle it explicitly. We can derive it from the A array, somewhat similarly to a related approach, although that one performs AT x A.
  • The matrix multiplication A x AT amounts to taking dot products of the rows of matrix A, as indicated here (a small illustration follows this list).
  • We will provide the transposed version of A in the sB array using adjusted indexing.
  • There are a range of other changes to your code, to fix various errors and to improve load/store efficiency, such as a general reversal of the usage of the x,y indices.
  • I have also fixed the usage of syncthreads and modified the code to allow arbitrary values for the row and column dimensions.
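
As a small CPU-only illustration of the row dot-product idea (my own sketch, not part of the GPU code): every element of C = A x AT is the dot product of two rows of A, so the transposed B never has to be materialized.

import numpy as np

A = np.random.randint(2, size=(4, 7), dtype=np.int8)
C = np.empty((4, 4), dtype=np.int32)
for i in range(C.shape[0]):
    for j in range(C.shape[1]):
        # C[i, j] is the dot product of row i and row j of A
        C[i, j] = int(np.dot(A[i, :].astype(np.int32), A[j, :].astype(np.int32)))
assert np.array_equal(C, np.matmul(A.astype(np.int32), A.T))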

Here is a worked example:

$ cat t62.py
from numba import cuda, int32, int8
import numba
import numpy as np
import math
import time

TPB = 32
TPB1 = TPB+1
@cuda.jit()
def byte_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB),  dtype=int8)
    sB = cuda.shared.array((TPB, TPB1), dtype=int8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
# uncomment and indent remainder of kernel to only do the "symmetric half" of calculation
#    if bx >= by:
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1)// TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp += int32(sA[ty,j]) * int32(sB[tx, j])
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
rows = 1041
cols = 1043
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (rows*cols+rows*rows*4)//1048576, 'MB')
A = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AT = A.transpose()
CU = np.matmul(A,AT, dtype = np.int32)
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
$ python t62.py
host mem:  10 MB device mem:  5 MB
GPU time(shared memory):0.019593000411987305
True
$

Notes:

  • Most of the runtime of the code above will be spent in Python (in the np.matmul() operation, which is really only used to verify the result and should not be necessary in an actual implementation), not in the GPU portion. As the matrices are made larger, the code will run much more slowly.
  • As noted in the comments, the result of A x AT is a symmetric matrix. My code does not take advantage of this, but we could crudely take advantage of it by uncommenting the if test at the beginning of the kernel and then indenting the remainder of the kernel. However, that will of course cause the host-side np.array_equal test to fail (see the sketch after this list).
  • The device memory consumption is calculated in the code. For your largest values from the comments (rows = 30k, columns = 200k), it works out to roughly 10GB, so it will still not run on your 8GB GPU.
  • I have created a version of this code that packs 8 elements of the A matrix into each byte, which reduces memory requirements further, but writing that code to handle an arbitrary column dimension (rather than a multiple of 8) turned out to be rather messy. Even so, for the 30k-row, 200k-column case that code can bring the total device memory consumption down to about 5GB.
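
If the kernel is changed to compute only the bx >= by blocks, one way to make the verification pass again is to rebuild the full symmetric result on the host. A minimal sketch (my own, untested against the half-computed kernel; it assumes the computed blocks cover at least the upper triangle of C, with C and CU as in the t62.py example above):

import numpy as np

# mirror the computed upper triangle into the lower triangle
C_full = np.triu(C) + np.triu(C, 1).T
print(np.array_equal(C_full, CU))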

Here is the one-bit-per-element version; it requires that the number of columns be evenly divisible by 8:

$ cat t61.py
from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
colm = 131
rows = 1041
cols = int(colm*8)
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+rows*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AUT = AU.transpose()
CU = np.matmul(AU,AUT,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
for i in range(A.shape[0]):
    for j in range(A.shape[1]):
        A[i,j] = 0
        for k in range(8):
            if AU[i,(j*8)+k] == 1:
                A[i,j] = A[i,j] | (1<<(7-k))
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
                break
$ python t61.py
host mem:  10 MB device mem:  4 MB
GPU time(shared memory):0.009343624114990234
True
$
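
As an aside, the host-side packing loop in t61.py (and the bitpack routine in the update) can likely be replaced by NumPy's built-in np.packbits, which also packs 8 values per byte, most significant bit first. A small sketch (my own, assuming the number of columns is a multiple of 8):

import numpy as np

AU = np.random.randint(2, size=(1041, 131 * 8), dtype=np.int8)
A_packed = np.packbits(AU.astype(np.uint8), axis=1)  # shape (1041, 131), dtype uint8

This should produce the same packed array as the explicit Python loop, and will generally be much faster for large inputs.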

EDIT: Responding to some questions in the comments and in the update, this now takes into account that the A matrix may have significantly more than 30k rows, which makes the C matrix grow as well. If the A matrix fits in GPU memory, we can reduce the memory demand for the C matrix by computing it in pieces. Each piece is a group of rows computed together, which I will refer to as a row_slice of the C matrix. The following code demonstrates that this can be achieved with relatively minor changes to the code above:

$ cat t63.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y+row_offset, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp

@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def bitpack(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

colm = 131
rows = 1535
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
CU = np.matmul(AU,AU.T,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
bitpack(A, AU)
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice, CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t63.py
host mem:  21 MB device mem:  3 MB
True
True
True
GPU time(shared memory):0.010116815567016602
$

This means that, as suggested, for the case of rows = 100k and columns = 200k given in the latest update to the question, we should be able to break the C matrix into chunks of 5k rows. The memory usage for the A matrix would be 2.5GB, and since we only compute a 5k-row slice of C at a time, the device storage needed for C is 100k*5k*4 bytes, i.e. 2GB for this example.
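
For concreteness, the arithmetic behind those numbers (my own check, using 1 GB = 1e9 bytes):

rows, cols, row_slice = 100_000, 200_000, 5_000
a_dev_bytes = rows * (cols // 8)      # bit-packed A: 2.5e9 bytes ~= 2.5 GB
c_slice_bytes = row_slice * rows * 4  # one int32 row slice of C: 2e9 bytes ~= 2 GB
print(a_dev_bytes / 1e9, c_slice_bytes / 1e9)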

With further study, we can speed up the host-code matmul by switching from the int8 datatype to the float32 datatype. That makes the host operation considerably faster, but the GPU code still appears to be about 4x faster than it:

$ cat t64.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y+row_offset, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp

@nb.njit('void(uint8[:,:],float32[:,:])', parallel=True)
def bitpack(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = int(AU[i,offset]) << 7
            res |= int(AU[i,offset+1]) << 6
            res |= int(AU[i,offset+2]) << 5
            res |= int(AU[i,offset+3]) << 4
            res |= int(AU[i,offset+4]) << 3
            res |= int(AU[i,offset+5]) << 2
            res |= int(AU[i,offset+6]) << 1
            res |= int(AU[i,offset+7])
            A[i,j] = res

colm = 1000
rows = 6000
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*4+rows*colm+rows*rows*4+rows*row_slice*4)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
t1 = time.time()
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AU = AU.astype(np.float32)
print("randint:" + str(time.time()-t1))
t1 = time.time()
#CU = np.empty((rows, rows), dtype=np.int32)
CU = np.matmul(AU,AU.T,dtype=np.float32)
print("matmul:" + str(time.time()-t1))
t1 = time.time()
A = np.empty((rows,colm), dtype=np.uint8)
print("np.empty:" + str(time.time()-t1))
t1 = time.time()
bitpack(A, AU)
print("bitpack:" + str(time.time()-t1))
t1 = time.time()
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice, CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t64.py
host mem:  337 MB device mem:  17 MB
randint:0.1817936897277832
matmul:3.498671531677246
np.empty:7.62939453125e-05
bitpack:0.03707313537597656
True
True
True
True
True
True
True
True
True
True
True
True
GPU time(shared memory):0.8318064212799072
$

I have not tested this code thoroughly. There may be defects. Use at your own risk.

For attribution, the numba bit-packing code appears to have come from here.