使用 CUDA atomicInc 获取唯一索引

Question

我有 CUDA 内核，基本上每个线程都有一个值，它需要将该值添加到共享内存中的一个或多个列表中。因此，对于这些列表中的每一个，它都需要获取一个索引值（对于该列表是唯一的）来放置值。

真正的代码是不同的，但是有这样的列表：

typedef struct {
    unsigned int numItems;
    float items[MAX_NUM_ITEMS];
} List;
__shared__ List lists[NUM_LISTS];

初始值numItems全部设置为0，然后一个__syncthreads()就搞定了。

要将其值添加到列表中，每个线程都会执行以下操作：

for(int list = 0; list < NUM_LISTS; ++list) {
    if(should_add_to_list(threadIdx, list)) {
        unsigned int index = atomicInc(&lists[list].numItems, 0xffffffff);
        assert(index < MAX_NUM_ITEMS); // always true
        lists[list].items[index] = my_value;
    }
}

这在大多数情况下都有效，但似乎在内核的其他部分进行一些不相关的更改时（例如不检查始终成功的断言），有时两个线程会为一个列表获得相同的索引，或者索引被跳过。然而，numSamples 的最终值总是正确的。

但是，当对 atomicInc_ 使用以下自定义实现时，它似乎可以正常工作：

__device__ static inline uint32_t atomicInc_(uint32_t* ptr) {
    uint32_t value;
    do {
        value = *ptr;
    } while(atomicCAS(ptr, value, value + 1) != value);
    return value;
}

这两个 atomicInc 函数是否等价，以这种方式使用 atomicInc 获取唯一索引是否有效？

根据CUDA programming guide，原子函数并不意味着内存排序约束，不同线程可以同时访问不同列表的numSamples：这会导致它失败吗？

编辑：

真正的内核是这样的：

基本上有一个 spot 块 的列表，其中包含 spots。每个 spot 都有 XY 坐标 (col, row)。内核需要为每个点找到它周围某个window（col/row差异）内的点，并将它们放入共享内存中的列表中。

使用固定数量的 warp 调用内核。一个 CUDA 块对应一组 spot 块。（此处 3）这些被称为 local spot blocks.

首先它从块的 3 个点块中取出点，并将它们复制到共享内存中 (localSpots[])。为此，它为每个斑点块使用一个扭曲，以便可以合并读取斑点。 warp 中的每个线程都是本地 spot 块中的一个 spot。 spot 块索引在这里是硬编码的 (blocks[])。

然后它通过 周围的 点块：这些是所有可能包含与 本地点中的点足够近的点的点块块。周围的 spot 块的索引也在这里硬编码 (sblock[])。

在此示例中，它仅为此使用第一个扭曲，并迭代遍历 sblocks[]。经纱中的每根线都是周围斑点块中的一个斑点。它还遍历所有本地点的列表。如果线程的点与本地点足够近：它会将其插入到本地点的列表中，使用 atomicInc 获取索引。

执行时，printf 显示对于给定的局部点（这里是行=37，列=977 的点），有时会重复或跳过索引。

真正的代码更多complex/optimized，但是这段代码已经有问题了。这里它也只有运行s 一个 CUDA 块。

#include <assert.h>
#include <stdio.h>

#define MAX_NUM_SPOTS_IN_WINDOW 80

__global__ void Kernel(
    const uint16_t* blockNumSpotsBuffer,
    XGPU_SpotProcessingBlockSpotDataBuffers blockSpotsBuffers,
    size_t blockSpotsBuffersElementPitch,
    int2 unused1,
    int2 unused2,
    int unused3 ) {
    typedef unsigned int uint;

    if(blockIdx.x!=30 || blockIdx.y!=1) return;

    int window = 5;

    ASSERT(blockDim.x % WARP_SIZE == 0);
    ASSERT(blockDim.y == 1);

    uint numWarps = blockDim.x / WARP_SIZE;
    uint idxWarp = threadIdx.x / WARP_SIZE;
    int idxThreadInWarp = threadIdx.x % WARP_SIZE;

    struct Spot {
        int16_t row;
        int16_t col;
        volatile unsigned int numSamples;
        float signalSamples[MAX_NUM_SPOTS_IN_WINDOW];
    };

    __shared__ uint numLocalSpots;
    __shared__ Spot localSpots[3 * 32];

    numLocalSpots = 0;

    __syncthreads();

    ASSERT(numWarps >= 3);
    int blocks[3] = {174, 222, 270};
    if(idxWarp < 3) {
        uint spotBlockIdx = blocks[idxWarp];
        ASSERT(spotBlockIdx < numSpotBlocks.x * numSpotBlocks.y);

        uint numSpots = blockNumSpotsBuffer[spotBlockIdx];
        ASSERT(numSpots < WARP_SIZE);

        size_t inOffset = (spotBlockIdx * blockSpotsBuffersElementPitch) + idxThreadInWarp;

        uint outOffset;
        if(idxThreadInWarp == 0) outOffset = atomicAdd(&numLocalSpots, numSpots);
        outOffset = __shfl_sync(0xffffffff, outOffset, 0, 32);

        if(idxThreadInWarp < numSpots) {
            Spot* outSpot = &localSpots[outOffset + idxThreadInWarp];
            outSpot->numSamples = 0;

            uint32_t coord = blockSpotsBuffers.coord[inOffset];
            UnpackCoordinates(coord, &outSpot->row, &outSpot->col);
        }
    }



    __syncthreads();


    int sblocks[] = { 29,30,31,77,78,79,125,126,127,173,174,175,221,222,223,269,270,271,317,318,319,365,366,367,413,414,415 };
    if(idxWarp == 0) for(int block = 0; block < sizeof(sblocks)/sizeof(int); ++block) {
        uint spotBlockIdx = sblocks[block];
        ASSERT(spotBlockIdx < numSpotBlocks.x * numSpotBlocks.y);

        uint numSpots = blockNumSpotsBuffer[spotBlockIdx];
        uint idxThreadInWarp = threadIdx.x % WARP_SIZE;
        if(idxThreadInWarp >= numSpots) continue;

        size_t inOffset = (spotBlockIdx * blockSpotsBuffersElementPitch) + idxThreadInWarp;

        uint32_t coord = blockSpotsBuffers.coord[inOffset];
        if(coord == 0) return; // invalid surrounding spot

        int16_t row, col;
        UnpackCoordinates(coord, &row, &col);

        for(int idxLocalSpot = 0; idxLocalSpot < numLocalSpots; ++idxLocalSpot) {
            Spot* localSpot = &localSpots[idxLocalSpot];

            if(localSpot->row == 0 && localSpot->col == 0) continue;
            if((abs(localSpot->row - row) >= window) && (abs(localSpot->col - col) >= window)) continue;

            int index = atomicInc_block((unsigned int*)&localSpot->numSamples, 0xffffffff);
            if(localSpot->row == 37 && localSpot->col == 977) printf("%02d  ", index); // <-- sometimes indices are skipped or duplicated

            if(index >= MAX_NUM_SPOTS_IN_WINDOW) continue; // index out of bounds, discard value for median calculation
            localSpot->signalSamples[index] = blockSpotsBuffers.signal[inOffset];
        }
    } }

输出如下所示：

00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  23                                                                                                                   
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24                                                                                                                 
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24        
00  01  02  02  03  03  04  05  06  07  08  09  10  11  12  06  13  14  15  16  17  18  19  20  21        
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24        
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24        
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  23        
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24        
00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24

每一行都是一次执行的输出（内核是运行多次）。预计指数会以不同的顺序出现。但是例如在倒数第三行，索引 23 被重复。

使用 atomicCAS 似乎可以解决这个问题。在外部 for 循环的执行之间使用 __syncwarp() 似乎也可以修复它。但不清楚为什么，如果这样总能解决问题。

编辑 2： 这是一个显示问题的完整程序 (main.cu)：

https://pastebin.com/cDqYmjGb

CMakeLists.txt:

https://pastebin.com/iB9mbUJw

必须用-DCMAKE_BUILD_TYPE=Release编译。

它产生这个输出：

00(0:00000221E40003E0)
01(2:00000221E40003E0)
02(7:00000221E40003E0)
03(1:00000221E40003E0)
03(2:00000221E40003E0)
04(3:00000221E40003E0)
04(1:00000221E40003E0)
05(4:00000221E40003E0)
06(6:00000221E40003E0)
07(2:00000221E40003E0)
08(3:00000221E40003E0)
09(6:00000221E40003E0)
10(3:00000221E40003E0)
11(5:00000221E40003E0)
12(0:00000221E40003E0)
13(1:00000221E40003E0)
14(3:00000221E40003E0)
15(1:00000221E40003E0)
16(0:00000221E40003E0)
17(3:00000221E40003E0)
18(0:00000221E40003E0)
19(2:00000221E40003E0)
20(4:00000221E40003E0)
21(4:00000221E40003E0)
22(1:00000221E40003E0)

例如，带有 03 的行表明两个线程（1 和 2）在同一计数器（在 0x00000221E40003E0 处）调用 atomicInc_block 后得到相同的结果 (3)。 =35=]

Answer 1

根据我的测试，此问题已在当前可用的 CUDA 11.4.1 here 和驱动程序 470.52.02 中修复。它也可能在 CUDA 11.4 和 11.3 的一些早期版本中得到修复，但问题出现在 CUDA 11.2 中。

使用 CUDA atomicInc 获取唯一索引

Using CUDA atomicInc to get unique indices

cuda

atomic

memory-barriers