Python Numba Cuda slower than JIT
I am currently working on speeding up some numerical processing by offloading it to the GPU. I have some demo code below (the actual code will be more complex). I am taking a NumPy array and counting how many of its values fall within a range.
Hardware-wise, I am running an AMD 3600X (6 cores, 12 threads) and an RTX 2060 Super (2176 CUDA cores).
Example code:
import time
import numpy as np
from numba import cuda
from numba import jit

width = 1024
height = 1024
size = width * height
print(f'Number of records {size}')

array_of_random = np.random.rand(size)
output_array = np.zeros(size, dtype=bool)
device_array = cuda.to_device(array_of_random)
device_output_array = cuda.device_array_like(output_array)

def count_array_standard(array, pivot_point, local_output_array):
    for i in range(array.shape[0]):
        if (pivot_point - 0.05) < array[i] < (pivot_point + 0.05):
            local_output_array[i] = True
        else:
            local_output_array[i] = False

@jit('(f8,b1[:])')
def count_array_jit(pivot_point, local_output_array):
    global array_of_random
    for i in range(len(array_of_random)):
        if (pivot_point - 0.05) < array_of_random[i] < (pivot_point + 0.05):
            local_output_array[i] = True
        else:
            local_output_array[i] = False

@cuda.jit()
def count_array_cuda(local_device_array, pivot_point, local_device_output_array):
    tx = cuda.threadIdx.x
    ty = cuda.blockIdx.x
    bw = cuda.blockDim.x
    pos = tx + ty * bw
    for i in range(pos, pos + bw):
        if i < local_device_output_array.size:
            if (pivot_point - 0.05) < local_device_array[i] < (pivot_point + 0.05):
                local_device_output_array[i] = True
            else:
                local_device_output_array[i] = False

print("")
print("Standard")
for x in range(3):
    start = time.perf_counter()
    count_array_standard(array_of_random, 0.5, output_array)
    result = np.sum(output_array)
    print(f'Run: {x} Result: {result} Time: {time.perf_counter() - start}')

print("")
print("Jit")
for x in range(3):
    start = time.perf_counter()
    count_array_jit(0.5, output_array)
    result = np.sum(output_array)
    print(f'Run: {x} Result: {result} Time: {time.perf_counter() - start}')

print("")
print("Cuda Jit")
threads_per_block = 16
blocks_per_grid = (array_of_random.size + (threads_per_block - 1)) // threads_per_block
for x in range(3):
    start = time.perf_counter()
    count_array_cuda[blocks_per_grid, threads_per_block](device_array, .5, device_output_array)
    result = np.sum(device_output_array.copy_to_host())
    print(f'Run: {x} Result: {result} Time: {time.perf_counter() - start}')
This gives me the following set of results:
Number of records 1048576
Standard
Run: 0 Result: 104778 Time: 0.35327580000000003
Run: 1 Result: 104778 Time: 0.3521047999999999
Run: 2 Result: 104778 Time: 0.35452510000000004
Jit
Run: 0 Result: 104778 Time: 0.0020474000000001435
Run: 1 Result: 104778 Time: 0.001856599999999986
Run: 2 Result: 104778 Time: 0.0018399000000000054
Cuda Jit
Run: 0 Result: 104778 Time: 0.10867309999999986
Run: 1 Result: 104778 Time: 0.0023599000000000814
Run: 2 Result: 104778 Time: 0.002314700000000114
Both Numba's basic jit and the cuda jit are faster than the standard code, and while I did expect the initial jit run to take longer, the subsequent jit runs are faster than the cuda runs. I am also seeing the best cuda results with around 16 threads per block, whereas I expected a higher thread count to be needed.
Since I am new to CUDA coding, I wonder if I am missing something fundamental. Any guidance is gratefully received.
I see two issues.
1. The amount of work per data item in the input array is too small to be interesting on the GPU.
2. Your chosen thread organization, combined with the for-loop in the cuda.jit routine, appears to be doing redundant work.
To address item 1, you would probably want to do more work per item than simply comparing it against limits and writing out the result of the comparison. Alternatively, if you really are interested in this benchmark, you can time the kernel by itself, separating out the data movement, to see what the compute cost is; a rough sketch of that follows below.
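Purely as an illustration (this is not from the original post), one possible way to time only the kernel is with Numba's CUDA event API. The sketch below reuses the device_array, device_output_array, count_array_cuda, threads_per_block and blocks_per_grid names defined in the question's code:

start_evt = cuda.event(timing=True)   # CUDA events for device-side timing
stop_evt = cuda.event(timing=True)

start_evt.record()                    # mark the point just before the kernel launch
count_array_cuda[blocks_per_grid, threads_per_block](device_array, .5, device_output_array)
stop_evt.record()                     # mark the point just after the launch
stop_evt.synchronize()                # wait for the kernel (and the event) to complete

# Elapsed kernel time in milliseconds, excluding host<->device copies
print(cuda.event_elapsed_time(start_evt, stop_evt))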
For a simple way to address item 2, I would get rid of the for-loop in the cuda.jit kernel and have each thread process one element of the input array. Here is an example that does that (converted to Python 2.x because that is the machine setup I use to play with numba):
$ cat t58.py
import time
import numpy as np
from numba import cuda
from numba import jit

width = 1024
height = 1024
size = width * height
print("Number of records")
print(size)

array_of_random = np.random.rand(size)
output_array = np.zeros(size, dtype=bool)
device_array = cuda.to_device(array_of_random)
device_output_array = cuda.device_array_like(output_array)

def count_array_standard(array, pivot_point, local_output_array):
    for i in range(array.shape[0]):
        if (pivot_point - 0.05) < array[i] < (pivot_point + 0.05):
            local_output_array[i] = True
        else:
            local_output_array[i] = False

@jit('(f8,b1[:])')
def count_array_jit(pivot_point, local_output_array):
    global array_of_random
    for i in range(len(array_of_random)):
        if (pivot_point - 0.05) < array_of_random[i] < (pivot_point + 0.05):
            local_output_array[i] = True
        else:
            local_output_array[i] = False

@cuda.jit()
def count_array_cuda(local_device_array, pivot_point, local_device_output_array):
    tx = cuda.threadIdx.x
    ty = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + ty * bw
    if i < local_device_output_array.size:
        if (pivot_point - 0.05) < local_device_array[i] < (pivot_point + 0.05):
            local_device_output_array[i] = True
        else:
            local_device_output_array[i] = False

print("")
print("Standard")
for x in range(3):
    start = time.clock()
    count_array_standard(array_of_random, 0.5, output_array)
    result = np.sum(output_array)
    print(x)
    print(result)
    print(time.clock() - start)

print("")
print("Jit")
for x in range(3):
    start = time.clock()
    count_array_jit(0.5, output_array)
    result = np.sum(output_array)
    print(x)
    print(result)
    print(time.clock() - start)

print("")
print("Cuda Jit")
threads_per_block = 128
blocks_per_grid = (array_of_random.size + (threads_per_block - 1)) // threads_per_block
for x in range(3):
    start = time.clock()
    count_array_cuda[blocks_per_grid, threads_per_block](device_array, .5, device_output_array)
    cuda.synchronize()
    stop = time.clock()
    result = np.sum(device_output_array.copy_to_host())
    print(x)
    print(result)
    print(stop - start)
$ python t58.py
Number of records
1048576
Standard
0
104891
0.53704
1
104891
0.528287
2
104891
0.515948
Jit
0
104891
0.002993
1
104891
0.002635
2
104891
0.002595
Cuda Jit
0
104891
0.146518
1
104891
0.000832
2
104891
0.000813
$
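As a side note, not part of the answer above: Numba also provides cuda.grid(1), which computes the same global thread index as the threadIdx/blockIdx/blockDim arithmetic, so the one-element-per-thread kernel can be written a little more compactly. A minimal sketch (the name count_array_cuda_grid is just for illustration):

@cuda.jit
def count_array_cuda_grid(local_device_array, pivot_point, local_device_output_array):
    # cuda.grid(1) is shorthand for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    i = cuda.grid(1)
    if i < local_device_output_array.size:
        if (pivot_point - 0.05) < local_device_array[i] < (pivot_point + 0.05):
            local_device_output_array[i] = True
        else:
            local_device_output_array[i] = False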