Cuda Parallelize Kernel
I'm trying to parallelize a simple update loop of a simulation on the GPU. Basically there are a bunch of "creatures" represented by circles; in each update loop they move, and then there is a check of whether any of them intersect. radii holds the radius of each type of creature.
import numpy as np
import math
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)')
def update(p_x, p_y, radii, types, velocities, max_velocities, acceleration, num_creatures, cycles):
    for c in range(cycles):
        for i in range(num_creatures):
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        for i in range(num_creatures):
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                sum_of_radii = radii[types[i]] + radii[types[i]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass
acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3
max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1
p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)
velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1
device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = 16
update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, device_velocities, device_max_velocities,
                                       acceleration, num_creatures, cycles)
print(device_p_x.copy_to_host())
The 1.0 in math.cos and math.sin is just a placeholder for the individual creatures' directions.
There are multiple threads right now, but they all execute the same code.
What changes do I have to make to the kernel to parallelize it?
The most obvious dimension for parallelization, to me, seems to be the loop in i in the kernel, that is, the loop over num_creatures. So I'll describe how to do that.
Our goal will be to remove the loop over num_creatures and instead let each iteration of that loop be handled by a separate CUDA thread. This is possible because the work done in each loop iteration is (mostly) independent: it does not depend on the results of other loop iterations (but see the second point below).
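In other words, the body of a per-creature iteration stays the same; only the loop index is replaced by the thread's global index, plus a bounds check. A minimal sketch of that pattern (the names mirror the kernels below):

# sketch of the loop-to-thread mapping: each CUDA thread handles what used to be
# one iteration of "for i in range(num_creatures)"
i = cuda.grid(1)           # global 1D thread index
if i < num_creatures:      # thread-check so "extra" threads do nothing
    ...                    # body of the original loop iteration, indexed by i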
A challenge we will run into is that the second for-loop in i over num_creatures presumably depends on the results of the first. If we leave everything as serial code running in a single thread, that dependency is taken care of by the nature of serial code execution. But we want to parallelize it, so we need a global sync between the first for-loop over num_creatures and the second. A simple, convenient global sync in CUDA is the kernel launch, so we'll break the kernel code into two kernel functions. We'll call these update1 and update2.
That then presents the challenge of what to do about the overarching loop over cycles. We can't simply replicate that loop in both kernels, because that would change the functional behavior: for example, we would compute cycles updates to p_x before a single calculation of delta_x, which is presumably not what is wanted. So, for simplicity, we'll hoist this loop out of the kernel code and back into host code. The host code will then call the update1 and update2 kernels for cycles iterations.
We also want to make the kernel processing adaptable to different sizes of num_creatures. So we'll pick a hard-coded size for threadsperblock, but make the number of blocks launched variable, based on the size of num_creatures. To facilitate this, we need a thread-check (the initial if-statement) in each of our kernels, so that "extra" threads don't do anything.
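(As a side note, and not what the code below does: the block count can equivalently be computed with a ceiling division, which avoids launching one extra, fully idle block when num_creatures happens to be an exact multiple of threadsperblock.)

# alternative grid sizing (a variant, not what the posted code uses)
threadsperblock = 64
blockspergrid = (num_creatures + threadsperblock - 1) // threadsperblock  # ceiling division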
With that description, we end up with something like this:
$ cat t11.py
import numpy as np
import math
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32, uint32)')
def update1(p_x, p_y, velocities, max_velocities, acceleration, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
        velocities[i] = velocities[i] + acceleration
        if velocities[i] > max_velocities[i]:
            velocities[i] = max_velocities[i]
        p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
        p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])

@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], uint32)')
def update2(p_x, p_y, radii, types, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
        for j in range(i, num_creatures):
            delta_x = p_x[j] - p_x[i]
            delta_y = p_y[j] - p_y[i]
            distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
            sum_of_radii = radii[types[i]] + radii[types[i]]
            if distance_squared < sum_of_radii * sum_of_radii:
                pass
acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3
max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1
p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)
velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1
device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = (num_creatures // threadsperblock) + 1
for i in range(cycles):
    update1[blockspergrid, threadsperblock](device_p_x, device_p_y, device_velocities, device_max_velocities, acceleration, num_creatures)
    update2[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, num_creatures)
print(device_p_x.copy_to_host())
$ python t11.py
[ 20.05402946 20.05402946 20.05402946 ..., 0. 0. 0. ]
$
which produces the same output as the original posted version (the original code was running 16 blocks of 64 threads each, all doing exactly the same thing and stepping on each other as they wrote to the same data; so I'm referring to the original posted version, but run with one block of one thread).
Note that there are certainly other ways to parallelize, and probably other possible optimizations, but this should give you a sensible starting point for CUDA work.
As mentioned, the second kernel here really isn't doing anything useful, but I assume that is just a placeholder for future work. I'm also not sure your usage of radii here will give you what you want, but that's not the focus of this question either.
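(For what it's worth, if the intent was to use the radius of each creature in the pair, the second term presumably needs to index type j rather than i. That is a guess at the intended form, not something stated in the question:)

# guessed intent: radius of creature i plus radius of creature j
sum_of_radii = radii[types[i]] + radii[types[j]]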
So what is the effect of all this, performance-wise? We'll compare the original posted version (t12.py, below), run with just one block of one thread (not 16 blocks of 64 threads, which would only be worse anyway), against this version, which happens to be running 18 blocks of 64 threads each (t11.py, below):
$ nvprof --print-gpu-trace python t11.py
==3551== NVPROF is profiling process 3551, command: python t11.py
[ 20.05402946 20.05402946 20.05402946 ..., 0. 0. 0. ]
==3551== Profiling application: python t11.py
==3551== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
446.77ms 1.8240us - - - - - 7.8125KB 4.0847GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
446.97ms 1.7600us - - - - - 7.8125KB 4.2333GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
447.35ms 1.2160us - - - - - 12B 9.4113MB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
447.74ms 1.3440us - - - - - 1.9531KB 1.3859GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
447.93ms 1.5040us - - - - - 5.8594KB 3.7154GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
448.13ms 1.5360us - - - - - 5.8594KB 3.6380GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
448.57ms 5.4720us (18 1 1) (64 1 1) 36 0B 0B - - - - Quadro K2000 (0 1 7 cudapy::__main__::update11(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, float, unsigned int) [49]
448.82ms 1.1200us (18 1 1) (64 1 1) 8 0B 0B - - - - Quadro K2000 (0 1 7 cudapy::__main__::update22(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<unsigned char, int=1, A, mutable, aligned>, unsigned int) [50]
448.90ms 2.1120us - - - - - 7.8125KB 3.5277GB/s Device Pageable Quadro K2000 (0 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$ python t12.py
[ 20.05402946 20.05402946 20.05402946 ..., 0. 0. 0. ]
$ nvprof --print-gpu-trace python t12.py
==3604== NVPROF is profiling process 3604, command: python t12.py
[ 20.05402946 20.05402946 20.05402946 ..., 0. 0. 0. ]
==3604== Profiling application: python t12.py
==3604== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
296.22ms 1.8240us - - - - - 7.8125KB 4.0847GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
296.41ms 1.7920us - - - - - 7.8125KB 4.1577GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
296.79ms 1.2160us - - - - - 12B 9.4113MB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
297.21ms 1.3440us - - - - - 1.9531KB 1.3859GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
297.40ms 1.5040us - - - - - 5.8594KB 3.7154GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
297.60ms 1.5360us - - - - - 5.8594KB 3.6380GB/s Pageable Device Quadro K2000 (0 1 7 [CUDA memcpy HtoD]
298.05ms 1.8453ms (1 1 1) (1 1 1) 36 0B 0B - - - - Quadro K2000 (0 1 7 cudapy::__main__::update1(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<unsigned char, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, float, unsigned int, unsigned int) [38]
299.91ms 2.1120us - - - - - 7.8125KB 3.5277GB/s Device Pageable Quadro K2000 (0 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$
We see that the profiler reports that for the original t12.py version there is a single update kernel running, with 1 block and 1 thread, and it is taking 1.8453 milliseconds. For the modified t11.py version posted in this answer, the profiler reports 18 blocks of 64 threads each for both the update1 and update2 kernels, and the combined execution time of those two kernels is approximately 5.47 + 1.12 = 6.59 microseconds.
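(If you'd rather measure this without nvprof, a rough host-side timing is possible from Python as well. The sketch below is my own addition, not something from the original post; the cuda.synchronize() calls are needed because kernel launches are asynchronous.)

import time

cuda.synchronize()                      # make sure prior copies/compilation have finished
t0 = time.perf_counter()
for i in range(cycles):
    update1[blockspergrid, threadsperblock](device_p_x, device_p_y, device_velocities, device_max_velocities, acceleration, num_creatures)
    update2[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, num_creatures)
cuda.synchronize()                      # wait for the asynchronous kernel launches to complete
print("kernel loop time: %.3f ms" % ((time.perf_counter() - t0) * 1000.0))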
EDIT:
Based on some discussion in the comments, it should be possible to combine both kernels into a single kernel, using a double-buffering scheme on p_x and p_y:
$ cat t11.py
import numpy as np
import math
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32)')
def update(p_x, p_y, p_x_new, p_y_new, radii, types, velocities, max_velocities, acceleration, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
        velocities[i] = velocities[i] + acceleration
        if velocities[i] > max_velocities[i]:
            velocities[i] = max_velocities[i]
        p_x_new[i] = p_x[i] + (math.cos(1.0) * velocities[i])
        p_y_new[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        for j in range(i, num_creatures):
            delta_x = p_x[j] - p_x[i]
            delta_y = p_y[j] - p_y[i]
            distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
            sum_of_radii = radii[types[i]] + radii[types[i]]
            if distance_squared < sum_of_radii * sum_of_radii:
                pass
acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3
max_num_creatures = 1500000
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 2
p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)
velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 80000 // spacing):
    for y in range(1, 6000 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1
device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_p_x_new = cuda.to_device(p_x)
device_p_y_new = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = (num_creatures // threadsperblock) + 1
for i in range(cycles):
    if i % 2 == 0:
        update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_p_x_new, device_p_y_new, device_radii, device_types, device_velocities, device_max_velocities, acceleration, num_creatures)
    else:
        update[blockspergrid, threadsperblock](device_p_x_new, device_p_y_new, device_p_x, device_p_y, device_radii, device_types, device_velocities, device_max_velocities, acceleration, num_creatures)
print(device_p_x_new.copy_to_host())
print(device_p_x.copy_to_host())
$ python t11.py
[ 20.05402946 20.05402946 20.05402946 ..., 0. 0. 0. ]
[ 20.1620903 20.1620903 20.1620903 ..., 0. 0. 0. ]
$
It is still necessary to keep the kernel-calling loop over cycles in host code, since we still need the global sync provided by the kernel call. But for a given number of cycles, this reduces the contribution of kernel-call overhead.
When using this technique, care must be taken in choosing cycles, as well as in which buffer (p_x or p_x_new) the data is read from, to get coherent results.
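Concretely (a sketch of that bookkeeping, not part of the original code): with the launch loop above, even-numbered iterations write into the *_new buffers and odd-numbered iterations write back into the original buffers, so the freshest positions are in device_p_x_new after an odd number of cycles and in device_p_x after an even number:

# pick the buffer that received the last write in the ping-pong loop above
if cycles % 2 == 1:
    latest_p_x = device_p_x_new.copy_to_host()
else:
    latest_p_x = device_p_x.copy_to_host()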