Why do cudaMemcpyAsync and kernel launches block even with an asynchronous stream?
Consider the following program, which enqueues some work on a non-blocking GPU stream:
#include <iostream>
#include <memory>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) {
    *(reinterpret_cast<bool *>(ptr)) = true;
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
The buffer size and the length of the kernel's sleep (in cycles) are large enough that, since they execute in parallel with the CPU thread, enqueuing should finish well before they complete (8 ms + 8 ms for the copies and 20 ms for the kernel).
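(For reference, a rough sanity check of the ~8 ms figure, assuming an effective transfer rate of roughly 1.2 GB/s for pageable host memory on this machine; the bandwidth is an assumption, not a measurement:)

#include <cstdio>

int main() {
    // Hypothetical figures: the 1e7-byte buffer from the program above and an
    // assumed effective throughput of ~1.2 GB/s for pageable transfers.
    const double buffer_bytes       = 1e7;
    const double assumed_bytes_per_s = 1.2e9;
    std::printf("expected time per copy: ~%.1f ms\n",
                1e3 * buffer_bytes / assumed_bytes_per_s);  // prints roughly 8.3 ms
}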
However, looking at the trace below, it seems that the two cudaMemcpyAsync() calls are effectively synchronous, i.e. each blocks until the (non-blocking!) stream has actually finished the copy. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?
The trace (numbered lines, times in microseconds):
1 "Start" "Duration" "Grid X" "Grid Y" "Grid Z" "Block X" "Block Y" "Block Z"
104 14102.830000 59264.347000 "cudaMalloc"
105 73368.351000 19.886000 "cudaStreamCreateWithFlags"
106 73388.850000 8330.257000 "cudaMemcpyAsync"
107 73565.702000 8334.265000 47.683716 5.587311 "Pageable" "Device" "GeForce GTX 650 Ti BOOST (0)" "1"
108 81721.124000 2.394000 "cudaConfigureCall"
109 81723.865000 3.585000 "cudaSetupArgument"
110 81729.332000 30.742000 "cudaLaunch (dummy(__int64) [107])"
111 81760.604000 39589.422000 "cudaMemcpyAsync"
112 81906.303000 20157.648000 128 1 1 128 1 1
113 102073.103000 18736.208000 47.683716 2.485355 "Device" "Pageable" "GeForce GTX 650 Ti BOOST (0)" "1"
114 121351.936000 5.560000 "cudaStreamSynchronize"
As it happens, CUDA memory copies are asynchronous only under strict conditions, as @RobinThoni kindly pointed out. For the code in question, the problem is mainly the use of unpinned (i.e. pageable) host memory.
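A minimal sketch of the usual remedy, assuming the host buffer can simply be allocated as pinned memory with cudaMallocHost() (this is not the asker's code; the size is a placeholder and error handling is omitted):

#include <cuda_runtime.h>

int main() {
    const size_t buffer_size = 1e7;  // placeholder size
    void* host_ptr;
    char* device_ptr;
    cudaStream_t stream;

    cudaMallocHost(&host_ptr, buffer_size);  // pinned (page-locked) host memory
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // With a pinned source buffer, this call only enqueues the transfer and
    // returns; it does not wait for the copy (or any staging) to complete.
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);

    cudaStreamSynchronize(stream);  // now actually wait for the copy
    cudaStreamDestroy(stream);
    cudaFree(device_ptr);
    cudaFreeHost(host_ptr);
}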
To quote from a separate section of the Runtime API documentation (emphasis mine):
2. API synchronization behavior
The API provides memcpy/memset functions in both synchronous and
asynchronous forms, the latter having an "Async" suffix. This is a
misnomer as each function may exhibit synchronous or asynchronous
behavior depending on the arguments passed to the function.
...
Asynchronous
- For transfers from device memory to pageable host memory, the function will return only once the copy has completed.
That's only half of it! It is also true that:
- For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory and then copied to the device; and the function will return only after the staging has taken place.
This seemed strange, so I contacted someone on the CUDA driver team, who confirmed that the documentation is correct. I was also able to confirm it myself:
#include <iostream>
#include <memory>
#include <cstdlib>
#include <ctime>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) {
    *(reinterpret_cast<bool *>(ptr)) = true;
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main(int argc, char* argv[]) {
    cudaFree(0);  // force CUDA's lazy context initialization before timing anything
    struct timespec start, stop;
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;  // 2 GiB
    bool callback_executed = false;
    cudaStream_t stream;
    void* host_ptr;
    if (argc == 1) {
        host_ptr = malloc(buffer_size);             // pageable host memory
    }
    else {
        cudaMallocHost(&host_ptr, buffer_size, 0);  // pinned host memory
    }
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Time only the duration of the API call itself, not the copy.
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
This is basically your code, with a few modifications:
- time measurement
- a switch to allocate from either pageable or pinned host memory
- a buffer size of 2 GiB, to ensure measurable copy times
- cudaFree(0) to force CUDA's lazy initialization up front
Here are the results:
$ nvcc -std=c++11 main.cu -lrt
$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpy HtoD, pageable -> device: async, returns after staging)
Elapsed: 5.20288 # (memcpy DtoH, device -> pageable: synchronous)
$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpy HtoD, pinned -> device: fully async)
Elapsed: 7.127e-06 # (memcpy DtoH, device -> pinned: fully async)
Copying from pageable memory to the device is slower, but it really is asynchronous.
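If the host buffer has to come from an ordinary allocator, a sketch of an alternative, assuming it is acceptable to page-lock the existing allocation after the fact with cudaHostRegister():

#include <cuda_runtime.h>
#include <cstdlib>

int main() {
    const size_t buffer_size = 1e7;             // placeholder size
    void* host_ptr = std::malloc(buffer_size);  // ordinary pageable allocation

    // Page-lock the existing allocation; afterwards, cudaMemcpyAsync() on this
    // buffer behaves like the pinned (cudaMallocHost) case above.
    cudaHostRegister(host_ptr, buffer_size, cudaHostRegisterDefault);

    // ... enqueue cudaMemcpyAsync() transfers on a non-blocking stream here ...

    cudaHostUnregister(host_ptr);               // unregister before freeing
    std::free(host_ptr);
}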
I apologize for my mistake; I deleted my previous comments to avoid confusing people.