Parallelizing a CUDA Kernel

I am trying to parallelize the simple update loop of a simulation on the GPU. Basically there are a bunch of "creatures", represented by circles, that move in each update cycle, after which a check is made for whether any of them intersect. radii is the array of radii for the different creature types.

import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)')
def update(p_x, p_y, radii, types, velocities, max_velocities, acceleration, num_creatures, cycles):
    for c in range(cycles):
        for i in range(num_creatures):
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        for i in range(num_creatures):
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                sum_of_radii = radii[types[i]] + radii[types[i]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = 16
update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, device_velocities, device_max_velocities,
        acceleration, num_creatures, cycles)
print(device_p_x.copy_to_host())

The 1.0 inside math.cos and math.sin is just a placeholder for each individual creature's direction.

Right now there are multiple threads, but they all execute the same code. What changes do I have to make to the kernel in order to actually parallelize it?

To me, the most obvious dimension along which to parallelize seems to be the loop over i in the kernel, i.e. the iteration over num_creatures. So I'll describe how to do that.

  1. Our goal will be to remove the loop over num_creatures and instead let each iteration of that loop be handled by a separate CUDA thread. This is possible because the work done in each loop iteration is (mostly) independent - it does not depend on the results of other loop iterations (but see 2 below).

  2. One challenge we will run into is that the second i for-loop over num_creatures presumably depends on the results of the first one. If we leave everything as serial code running in a single thread, the nature of serial code execution takes care of that dependency. But we want to parallelize it. Therefore we need a global sync between the first for-loop over num_creatures and the second one. A simple, convenient global sync in CUDA is the kernel launch, so we'll break the kernel code into two kernel functions. We'll call them update1 and update2.

  3. This then raises the challenge of what to do about the overarching loop over cycles. We cannot simply replicate that loop in both kernels, because that would change the functional behavior - for example, we would compute cycles updates to p_x before a single calculation of delta_x is performed. That is presumably not what we want. So, for simplicity, we'll hoist this loop out of the kernel code and back into host code. The host code will then call the update1 and update2 kernels for cycles iterations.

  4. We also want to make the kernel processing adaptable to different sizes of num_creatures. So we'll pick a hard-coded size for threadsperblock, but make the number of launched blocks variable, based on the size of num_creatures. To facilitate this, we need a thread-check (the initial if-statement) in each of our kernels, so that "extra" threads don't do anything (a minimal sketch of this pattern follows right after this list).
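
As a minimal sketch of the indexing pattern from points 1 and 4 (the complete listings follow below), each thread computes its global 1D index with cuda.grid(1) and only does work if that index falls within num_creatures; the kernel name move_sketch and the toy sizes here are hypothetical, for illustration only:

import math
import numpy as np
from numba import cuda


@cuda.jit
def move_sketch(p_x, velocities, acceleration, num_creatures):
    i = cuda.grid(1)              # global 1D thread index
    if i < num_creatures:         # thread-check: "extra" threads do nothing
        velocities[i] = velocities[i] + acceleration
        p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])


num_creatures = 1100
device_p_x = cuda.to_device(np.zeros(num_creatures, dtype=np.float32))
device_velocities = cuda.to_device(np.zeros(num_creatures, dtype=np.float32))
threadsperblock = 64
# enough blocks to cover num_creatures; the thread-check absorbs the excess
blockspergrid = (num_creatures // threadsperblock) + 1
move_sketch[blockspergrid, threadsperblock](device_p_x, device_velocities, np.float32(.1), num_creatures)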

With that description, we end up with something like this:

$ cat t11.py
import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32, uint32)')
def update1(p_x, p_y, velocities, max_velocities, acceleration, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
        velocities[i] = velocities[i] + acceleration
        if velocities[i] > max_velocities[i]:
            velocities[i] = max_velocities[i]
        p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
        p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])

@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], uint32)')
def update2(p_x, p_y, radii, types, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
        for j in range(i, num_creatures):
            delta_x = p_x[j] - p_x[i]
            delta_y = p_y[j] - p_y[i]
            distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
            sum_of_radii = radii[types[i]] + radii[types[i]]
            if distance_squared < sum_of_radii * sum_of_radii:
                pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = (num_creatures // threadsperblock) + 1
for i in range(cycles):
    update1[blockspergrid, threadsperblock](device_p_x, device_p_y, device_velocities, device_max_velocities, acceleration, num_creatures)
    update2[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, num_creatures)
print(device_p_x.copy_to_host())
$ python t11.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
$

which produces the same output as the original posted version (the original code was running 16 blocks of 64 threads each, all doing exactly the same thing and stepping on each other as they wrote to the same data; so by the original posted version I mean the original code running one block of one thread).

Note that there are certainly other ways to parallelize, and probably other possible optimizations, but this should give you a sensible starting point for CUDA work.

As mentioned previously, the second kernel here really isn't doing anything useful, but I assume that is just a placeholder for future work. I'm also not sure your use of radii here will get you what you want, but that's not the focus of this question either.

So what is the effect of all this, performance-wise? We will again compare the original posted version (t12.py, below) running just one block of one thread (not 16 blocks of 64 threads, which would only be worse anyway) against this version, which happens to be running 18 blocks of 64 threads each (t11.py, below):

$ nvprof --print-gpu-trace python t11.py
==3551== NVPROF is profiling process 3551, command: python t11.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
==3551== Profiling application: python t11.py
==3551== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
446.77ms  1.8240us                    -               -         -         -         -  7.8125KB  4.0847GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
446.97ms  1.7600us                    -               -         -         -         -  7.8125KB  4.2333GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
447.35ms  1.2160us                    -               -         -         -         -       12B  9.4113MB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
447.74ms  1.3440us                    -               -         -         -         -  1.9531KB  1.3859GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
447.93ms  1.5040us                    -               -         -         -         -  5.8594KB  3.7154GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
448.13ms  1.5360us                    -               -         -         -         -  5.8594KB  3.6380GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
448.57ms  5.4720us             (18 1 1)        (64 1 1)        36        0B        0B         -           -           -           -  Quadro K2000 (0         1         7  cudapy::__main__::update11(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, float, unsigned int) [49]
448.82ms  1.1200us             (18 1 1)        (64 1 1)         8        0B        0B         -           -           -           -  Quadro K2000 (0         1         7  cudapy::__main__::update22(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<unsigned char, int=1, A, mutable, aligned>, unsigned int) [50]
448.90ms  2.1120us                    -               -         -         -         -  7.8125KB  3.5277GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

$ python t12.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
$ nvprof --print-gpu-trace python t12.py
==3604== NVPROF is profiling process 3604, command: python t12.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
==3604== Profiling application: python t12.py
==3604== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
296.22ms  1.8240us                    -               -         -         -         -  7.8125KB  4.0847GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
296.41ms  1.7920us                    -               -         -         -         -  7.8125KB  4.1577GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
296.79ms  1.2160us                    -               -         -         -         -       12B  9.4113MB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
297.21ms  1.3440us                    -               -         -         -         -  1.9531KB  1.3859GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
297.40ms  1.5040us                    -               -         -         -         -  5.8594KB  3.7154GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
297.60ms  1.5360us                    -               -         -         -         -  5.8594KB  3.6380GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
298.05ms  1.8453ms              (1 1 1)         (1 1 1)        36        0B        0B         -           -           -           -  Quadro K2000 (0         1         7  cudapy::__main__::update1(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<unsigned char, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, float, unsigned int, unsigned int) [38]
299.91ms  2.1120us                    -               -         -         -         -  7.8125KB  3.5277GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$

We see that for the original t12.py version, the profiler reports a single update kernel running, with 1 block and 1 thread, taking 1.8453 milliseconds. For the modified t11.py version posted in this answer, the profiler reports 18 blocks of 64 threads each for both the update1 and update2 kernels, and the combined execution time of these two kernels is approximately 5.47 + 1.12 = 6.59 microseconds.
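
If you don't want to reach for nvprof, a rough version of the same comparison can be made from host code with CUDA events; the following is a sketch assuming the t11.py variables defined above and Numba's event API (the measured time will include launch overhead, unlike the per-kernel durations nvprof reports):

from numba import cuda

start = cuda.event(timing=True)
stop = cuda.event(timing=True)

start.record()
for i in range(cycles):
    update1[blockspergrid, threadsperblock](device_p_x, device_p_y, device_velocities, device_max_velocities, acceleration, num_creatures)
    update2[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, num_creatures)
stop.record()
stop.synchronize()  # wait for all launched kernels to finish

print(cuda.event_elapsed_time(start, stop), "ms")  # elapsed GPU time in milliseconds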

EDIT: Based on some discussion in the comments, it should be possible to combine the two kernels into a single kernel, using a double-buffering scheme on p_x and p_y:

$ cat t11.py
import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32)')
def update(p_x, p_y, p_x_new, p_y_new, radii, types, velocities, max_velocities, acceleration, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
        velocities[i] = velocities[i] + acceleration
        if velocities[i] > max_velocities[i]:
            velocities[i] = max_velocities[i]
        p_x_new[i] = p_x[i] + (math.cos(1.0) * velocities[i])
        p_y_new[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        for j in range(i, num_creatures):
            delta_x = p_x[j] - p_x[i]
            delta_y = p_y[j] - p_y[i]
            distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
            sum_of_radii = radii[types[i]] + radii[types[i]]
            if distance_squared < sum_of_radii * sum_of_radii:
                pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500000
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 2


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 80000 // spacing):
    for y in range(1, 6000 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_p_x_new = cuda.to_device(p_x)
device_p_y_new = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = (num_creatures // threadsperblock) + 1
for i in range(cycles):
    if i % 2 == 0:
        update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_p_x_new, device_p_y_new, device_radii, device_types,  device_velocities, device_max_velocities, acceleration, num_creatures)
    else:
        update[blockspergrid, threadsperblock](device_p_x_new, device_p_y_new, device_p_x, device_p_y, device_radii, device_types,  device_velocities, device_max_velocities, acceleration, num_creatures)

print(device_p_x_new.copy_to_host())
print(device_p_x.copy_to_host())
$ python t11.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
[ 20.1620903  20.1620903  20.1620903 ...,   0.          0.          0.       ]
$

It is still necessary to keep the kernel-calling loop over cycles in the host code, since we still need the global sync provided by the kernel launch. But for a given number of cycles, this reduces the contribution of kernel-call overhead.

With this technique, care must be taken in the choice of cycles, as well as in which buffer (p_x or p_x_new) the data is read from, in order to get coherent results.
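
For example, since each iteration reads from one pair of buffers and writes to the other, the buffer holding the newest positions depends on the parity of cycles; a small hypothetical helper like the one below (not part of the code above) could encapsulate that choice:

def latest_p_x(cycles, device_p_x, device_p_x_new):
    # Iteration 0 writes p_x_new, iteration 1 writes p_x, and so on, so an
    # odd cycle count leaves the newest positions in the "_new" buffer and
    # an even count leaves them in the original buffer.
    if cycles % 2 == 1:
        return device_p_x_new.copy_to_host()
    return device_p_x.copy_to_host()


print(latest_p_x(cycles, device_p_x, device_p_x_new))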