为什么cuda内核可以访问主机内存?

why cuda kernel can access host memory?

我在cuda内核中直接访问host mem,没有发现错误,这是为什么?

我试着通过阅读官方文档来弄清楚原因,文档里是这样写的:

Allocates size bytes of host memory that is page-locked and accessible to the device. The driver tracks the virtual memory ranges allocated with this function and automatically accelerates calls to functions such as cudaMemcpy*(). Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc().

为什么很多cuda程序在cudaMallocHost后面加上cudaMemcpy?

#include <stdio.h>
#include <assert.h>

#define N 64

// cuda kernel access host mem a/b
__global__ void gpu(int *a, int *b, int *c_gpu) {
    int r = blockDim.x * blockIdx.x + threadIdx.x;
    int c = blockDim.y * blockIdx.y + threadIdx.y;

    if (r < N && c < N) {
        c_gpu[r * N + c] = a[r * N + c] + b[r * N + c];
    }
}

// cpu function
void cpu(int *a, int *b, int *c_cpu) {
    for (int r = 0; r < N; r++) {
        for (int c = 0; c < N; c++) {
            c_cpu[r * N + c] = a[r * N + c] + b[r * N + c];
        }
    }
}

// Driver: builds a CPU reference sum, launches the kernel with *host* (pinned)
// pointers a/b to demonstrate zero-copy device access, then copies the device
// result back and verifies it against the CPU reference.
//
// Fixes over the posted version: every CUDA call is error-checked (a sticky
// error from a failed allocation would otherwise make all later calls fail
// mysteriously), execution errors are caught after cudaDeviceSynchronize, and
// the result is actually copied back and compared — "ran without a launch
// error" alone proves nothing about correctness.
int main() {
    int *a, *b, *c_cpu, *c_gpu, *c_gpu_cpu;
    size_t size = N * N * sizeof(int);

    // NOTE: on an error path we return immediately without freeing earlier
    // allocations — acceptable for a short-lived demo process.
#define CHECK(call)                                                       \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            printf("CUDA error %s:%d: %s\n", __FILE__, __LINE__,          \
                   cudaGetErrorString(err_));                             \
            return 1;                                                     \
        }                                                                 \
    } while (0)

    CHECK(cudaMallocHost(&a, size));         // page-locked host memory
    CHECK(cudaMallocHost(&b, size));
    CHECK(cudaMallocHost(&c_cpu, size));
    CHECK(cudaMallocHost(&c_gpu_cpu, size));
    CHECK(cudaMalloc(&c_gpu, size));         // device memory for kernel output

    for (int r = 0; r < N; r++) {
        for (int c = 0; c < N; c++) {
            a[r * N + c] = r;
            b[r * N + c] = c;
            c_gpu_cpu[r * N + c] = 0;
            c_cpu[r * N + c] = 0;
        }
    }

    cpu(a, b, c_cpu);

    dim3 threads(16, 16, 1);
    dim3 blocks((N + threads.x - 1) / threads.x,
                (N + threads.y - 1) / threads.y, 1);

    // Deliberately passes host (pinned) pointers a/b to the kernel. This only
    // works on systems where the device can access mapped host memory; on
    // others the kernel faults — which the checks below will now surface.
    gpu<<<blocks, threads>>>(a, b, c_gpu);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaDeviceSynchronize());   // asynchronous in-kernel faults

    // Copy the device result into the (previously unused) host staging buffer
    // and verify against the CPU reference.
    CHECK(cudaMemcpy(c_gpu_cpu, c_gpu, size, cudaMemcpyDeviceToHost));
    int mismatches = 0;
    for (int i = 0; i < N * N; i++) {
        if (c_gpu_cpu[i] != c_cpu[i]) mismatches++;
    }
    if (mismatches == 0) {
        printf("PASS: GPU result matches CPU reference\n");
    } else {
        printf("FAIL: %d mismatched elements\n", mismatches);
    }

    CHECK(cudaFreeHost(a));
    CHECK(cudaFreeHost(b));
    CHECK(cudaFreeHost(c_cpu));
    CHECK(cudaFreeHost(c_gpu_cpu));
    CHECK(cudaFree(c_gpu));
#undef CHECK
    return mismatches == 0 ? 0 : 1;
}

Why do many cuda programs add cudaMemcpy after cudaMallocHost?

因为很多 CUDA 程序是在统一内存/统一寻址出现之前写的,那时 cudaMallocHost 分配的是页锁定(page-locked)内存,而这种页锁定内存仍然需要通过 API 调用(如 cudaMemcpy)复制到设备。文档中的"可以直接由设备访问"指的是:GPU 可以通过 PCI Express 总线使用 DMA 对这块内存进行读写,主机的内存管理器不需要参与,因此比传统的可分页(pageable)主机内存快得多。

随着 GPU、主机架构和操作系统的发展,在某些系统上,借助特定的 GPU 硬件和驱动程序支持,GPU 可以直接访问主机内存,但这并不是普遍可用的。您发布的代码并不能在每个 CUDA 系统上都正确运行——即使它在您的系统上看起来运行正常(您的错误检查有缺陷,它实际上可能根本没有运行)。