CUDA Race Check Hazard after __syncthreads()

I am trying to process a matrix in parallel in CUDA. For each column of the matrix I compute a result against a given vector; if the result is greater than a certain value, I keep the column, otherwise the column is removed from further computation. To avoid copying and restructuring the matrix, I use column indices to indicate whether a column should be used in further computation.

This process has to be repeated several times, and each time only a subset of all columns needs to be checked, so I created another matrix to store the indices of the columns to process in each pass. For example, if I have a matrix of 10 columns and need to repeat the process 4 times, the column_indices matrix may look like this:

thrust::device_vector<int> column_indices( std::vector<int>( {
    0, 1, -1, -1, -1,   // 2 columns contain useful information
    5, 6, 7, -1, -1,    // 3 columns contain useful information
    9, 8, 7, 6, -1,     // 4 columns contain useful information
    4, 3, 2, 1, 0       // 5 columns contain useful information
} ) );

This is just a simplified example; in the real code I have to deal with a matrix of roughly 500-1000 columns. Because not all columns need to be processed every time, and the number of columns is large, handing every column to its own thread is probably not a good idea, since that could leave half of the threads idle.

So I decided to use dynamic parallelism: a parent kernel checks how many threads are needed, launches a child kernel with exactly that number of threads, and allocates exactly the required amount of shared memory.

Here is my code:

#include <iostream>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

__device__
float calculate( const float* v1, const float* v2, const int length )
{
    // mock calculation resulting in 0.0 for even threads and 0.5 for odd threads
    return threadIdx.x % 2 == 0 ? 0.0f : 0.5f;
}

__global__
void child( float const* input_a, const int nrow, float const* input_b, int* columns, int* counts )
{
    extern __shared__ float results[];

    // input_a is a matrix stored in column-major order, and input_b is a vector
    int thread_column = columns[ threadIdx.x ];
    float const* thread_input = input_a + thread_column * nrow;
    results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
    //--------------Discussion-----------
    //Race condition is gone if I replace the line above with this:
    //atomicExch( results + threadIdx.x, calculate( thread_input, input_b, nrow ) );
    //However, this looks unnecessary to me, as each thread is accessing a different address
    //-----------------------------------
    __syncthreads();

    if ( threadIdx.x == 0 ) {
        // sort the column indices in descending order of results so that all indices to be removed end up at the end of the array
        thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
        // count the number of indices to be removed
        int remove_count = thrust::count( thrust::seq, results, results + blockDim.x, 0.0f );
        *counts -= remove_count;
    }
}

__global__
void parent( float const* inputs, const int nrow, float const* output, int* column_indices, int* column_counts, const int column_size )
{
    int row_per_group = blockDim.x;
    int group_num = blockIdx.x, row_num = threadIdx.x;
    int tid = group_num * row_per_group + row_num;

    int* indices_for_this_block = column_indices + tid * column_size;
    int* count_for_this_block = column_counts + tid;
    // launch child kernels to process the row
    int block_size = *count_for_this_block;
    if ( block_size > 0 ) {
        child<<< 1, block_size, sizeof( float ) * block_size >>>( inputs, nrow, output, indices_for_this_block, count_for_this_block );
        cudaDeviceSynchronize();
    }
}

int main()
{
    thrust::device_vector<int> column_indices( std::vector<int>( {
        0, 1, -1, -1, -1,   // 2 columns contain useful information
        5, 6, 7, -1, -1,    // 3 columns contain useful information
        9, 8, 7, 6, -1,     // 4 columns contain useful information
        4, 3, 2, 1, 0       // 5 columns contain useful information
    } ) );

    thrust::device_vector<int> column_count( std::vector<int>( { 2, 3, 4, 5 } ) );

    // Processing column_indices in two groups and each group process two rows
    // Because we are mocking the correlation results, we don't need real data, so we pass nullptr as the data pointer.
    parent<<< 2, 2 >>>(
        nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
    );
    //--------------Discussion-----------
    // Race condition is also gone if I launch the parent kernel like this:
    //parent<<< 2, 2, sizeof( float ) * 5 >>>(
    //    nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
    //);
    // But when the total number of columns is big, this approach will fail as it exceeds the maximum capacity of shared memory
    // (although only a fraction of the allocation is actually used).
    //-----------------------------------
    cudaDeviceSynchronize();

    std::cout << "Row #0: ";
    std::copy( column_indices.begin(), column_indices.begin() + column_count[ 0 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;

    std::cout << "Row #1: ";
    std::copy( column_indices.begin() + 5, column_indices.begin() + 5 + column_count[ 1 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;

    std::cout << "Row #2: ";
    std::copy( column_indices.begin() + 10, column_indices.begin() + 10 + column_count[ 2 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;

    std::cout << "Row #3: ";
    std::copy( column_indices.begin() + 15, column_indices.begin() + 15 + column_count[ 3 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
}

Running the code above, I get the correct results:

Row #0: 1,
Row #1: 6,
Row #2: 8, 6,
Row #3: 3, 1,

However, cuda-memcheck seems to complain about a potential race condition, like this:

========= WARN:(Warp Level Programming) Potential RAW hazard detected at __shared__ 0x13 in block (0, 0, 0) :
=========     Write Thread (4, 0, 0) at 0x00000070 in /path_to_file/main.cu:23:child(float const *, int, float const *, int*, int*)
=========     Read Thread (0, 0, 0) at 0x00000648 in /usr/local/cuda/include/thrust/system/detail/sequential/insertion_sort.h:109:child(float const *, int, float const *, int*, int*)
=========     Current Value : 0

Line 23 in main.cu is this line:

results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );

The reading thread seems to correspond to this line:

thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );

But why would this happen between two lines that are separated by __syncthreads()? I don't understand why it occurs.

Can anyone tell me what I did wrong? Thank you very much!

At this time (up through CUDA 8.0), the cuda-memcheck racecheck tool does not support dynamic parallelism, so the hazard reported for this code can be treated as a false positive: __syncthreads() does correctly order the per-thread writes to results[] before thread 0 reads them.
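
One way to convince yourself that the kernel itself is race-free is to take dynamic parallelism out of the picture: launch child directly from the host with the same arguments and run the program under cuda-memcheck --tool racecheck. The following is a minimal sketch of my own, not part of the original post; it would be placed at the end of main() in the program above and processes only row #0:

// Hypothetical host-side check: launch the child kernel for row #0 directly,
// bypassing the parent kernel and therefore dynamic parallelism.
int count = column_count[ 0 ];                 // valid columns in row #0 (device-to-host copy)
child<<< 1, count, sizeof( float ) * count >>>(
    nullptr, 0, nullptr,                       // mock inputs, as in the original example
    column_indices.data().get(),               // column indices of row #0
    column_count.data().get() );               // count slot for row #0
cudaDeviceSynchronize();

With the launch done from the host, racecheck can track the block's shared memory accesses, and no hazard should be reported, which supports reading the original warning as an artifact of the tool rather than a bug in the kernel.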