Why can a CUDA kernel access host memory?
I access host memory directly from a CUDA kernel and see no error. Why does this work?
I tried to make sense of it from the documentation for cudaMallocHost:
Allocates size bytes of host memory that is page-locked and accessible to the device. The driver tracks the virtual memory ranges allocated with this function and automatically accelerates calls to functions such as cudaMemcpy*(). Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc().
So why do many CUDA programs add a cudaMemcpy after cudaMallocHost? Here is my test program:
#include <stdio.h>
#include <assert.h>

#define N 64

// CUDA kernel: reads a and b (host memory!) and writes c_gpu (device memory)
__global__ void gpu(int *a, int *b, int *c_gpu) {
    int r = blockDim.x * blockIdx.x + threadIdx.x;
    int c = blockDim.y * blockIdx.y + threadIdx.y;
    if (r < N && c < N) {
        c_gpu[r * N + c] = a[r * N + c] + b[r * N + c];
    }
}

// CPU reference implementation
void cpu(int *a, int *b, int *c_cpu) {
    for (int r = 0; r < N; r++) {
        for (int c = 0; c < N; c++) {
            c_cpu[r * N + c] = a[r * N + c] + b[r * N + c];
        }
    }
}

int main() {
    int *a, *b, *c_cpu, *c_gpu, *c_gpu_cpu;
    size_t size = N * N * sizeof(int);

    cudaMallocHost(&a, size);
    cudaMallocHost(&b, size);
    cudaMallocHost(&c_cpu, size);
    cudaMallocHost(&c_gpu_cpu, size);
    cudaMalloc(&c_gpu, size);

    for (int r = 0; r < N; r++) {
        for (int c = 0; c < N; c++) {
            a[r * N + c] = r;
            b[r * N + c] = c;
            c_gpu_cpu[r * N + c] = 0;
            c_cpu[r * N + c] = 0;
        }
    }

    cpu(a, b, c_cpu);

    dim3 threads(16, 16, 1);
    dim3 blocks((N + threads.x - 1) / threads.x, (N + threads.y - 1) / threads.y, 1);
    gpu<<<blocks, threads>>>(a, b, c_gpu); // passes host pointers a/b directly
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();

    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c_cpu);
    cudaFreeHost(c_gpu_cpu);
    cudaFree(c_gpu);
}
Many CUDA programs add a cudaMemcpy after cudaMallocHost because they were written before unified memory systems existed, when cudaMallocHost allocated page-locked memory that still required an API call to be copied to the device. "...accessed directly by the device" means the GPU can read and write that memory with DMA over the PCI Express bus without the host memory manager having to do anything, which is much faster than conventional pageable host memory.
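
For comparison, here is a minimal sketch of that traditional pattern: pinned host buffers plus explicit cudaMemcpy to and from a separate device allocation (the names h_a, d_a, add_one are illustrative, not from your code):

// Sketch of the pre-unified-memory pattern: pinned host memory still
// needs explicit cudaMemcpy calls; pinning just makes the DMA fast.
#include <stdio.h>

__global__ void add_one(int *d_a, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) d_a[i] += 1;
}

int main() {
    const int n = 1024;
    int *h_a, *d_a;
    cudaMallocHost(&h_a, n * sizeof(int)); // page-locked host memory
    cudaMalloc(&d_a, n * sizeof(int));     // separate device memory

    for (int i = 0; i < n; i++) h_a[i] = i;

    // Explicit transfers; pinned memory lets the driver DMA directly.
    cudaMemcpy(d_a, h_a, n * sizeof(int), cudaMemcpyHostToDevice);
    add_one<<<(n + 255) / 256, 256>>>(d_a, n);
    cudaMemcpy(h_a, d_a, n * sizeof(int), cudaMemcpyDeviceToHost);

    printf("h_a[10] = %d\n", h_a[10]); // expect 11
    cudaFreeHost(h_a);
    cudaFree(d_a);
}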
As GPU and host architectures and operating systems have evolved, the GPU can directly access host memory on some systems through a certain amount of GPU hardware and driver magic. But this is not universal. The code you posted will not run correctly on every CUDA system, even if it appears to work on yours (and your error checking is flawed, so the kernel may not actually be running at all).
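
Since cudaGetLastError() right after a launch only reports launch failures, checking the return value of cudaDeviceSynchronize() is what catches a kernel that failed while executing. And if you want a kernel to touch host memory explicitly and portably, mapped pinned memory is the documented route. A minimal sketch, assuming a recent runtime (the CHECK macro and the names h_p, d_p, touch are illustrative, not from your code):

// Sketch: error checking that catches execution failures, plus explicit
// "zero-copy" access through mapped pinned memory.
#include <stdio.h>

#define CHECK(call)                                                \
    do {                                                           \
        cudaError_t e = (call);                                    \
        if (e != cudaSuccess) {                                    \
            printf("CUDA error: %s at line %d\n",                  \
                   cudaGetErrorString(e), __LINE__);               \
            return 1;                                              \
        }                                                          \
    } while (0)

__global__ void touch(int *p) { p[threadIdx.x] = threadIdx.x; }

int main() {
    // Needed on pre-UVA systems before mapped memory can be used;
    // harmless on modern ones when called before any CUDA activity.
    CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    int *h_p, *d_p;
    CHECK(cudaHostAlloc(&h_p, 32 * sizeof(int), cudaHostAllocMapped));
    // Device-side alias of the host allocation; with unified virtual
    // addressing the two pointers are typically identical.
    CHECK(cudaHostGetDevicePointer(&d_p, h_p, 0));

    touch<<<1, 32>>>(d_p);
    CHECK(cudaGetLastError());       // catches launch-time errors only
    CHECK(cudaDeviceSynchronize());  // catches errors during execution

    printf("h_p[5] = %d\n", h_p[5]); // expect 5
    CHECK(cudaFreeHost(h_p));
    return 0;
}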