Does cudaMallocManaged allocate memory on the device?
I am using unified memory to simplify access to data on both the CPU and the GPU. As far as I know, cudaMallocManaged should allocate memory on the device. I wrote a simple program to check:
#define TYPE float
#define BDIMX 16
#define BDIMY 16
#include <cuda.h>
#include <cstdio>
#include <iostream>
__global__ void kernel(TYPE *g_output, TYPE *g_input, const int dimx, const int dimy)
{
__shared__ float s_data[BDIMY][BDIMX];
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
int in_idx = iy * dimx + ix; // index for reading input
int tx = threadIdx.x; // thread's x-index into corresponding shared memory tile
int ty = threadIdx.y; // thread's y-index into corresponding shared memory tile
s_data[ty][tx] = g_input[in_idx];
__syncthreads();
g_output[in_idx] = s_data[ty][tx] * 1.3;
}
int main(){
int size_x = 16, size_y = 16;
dim3 numTB;
numTB.x = (int)ceil((double)(size_x)/(double)BDIMX) ;
numTB.y = (int)ceil((double)(size_y)/(double)BDIMY) ;
dim3 tbSize;
tbSize.x = BDIMX;
tbSize.y = BDIMY;
float* a,* a_out;
cudaMallocManaged((void**)&a, size_x * size_y * sizeof(TYPE));
cudaMallocManaged((void**)&a_out, size_x * size_y * sizeof(TYPE));
kernel <<<numTB, tbSize>>>(a_out, a, size_x, size_y);
cudaDeviceSynchronize();
return 0;
}
So I am not even touching the data on the CPU, in order to avoid any page faults, and the memory should therefore be resident in device memory. However, when I run nvprof on this code, I get the following results:
invocations Metric Name Metric Description Min Max Avg
Device "Tesla K40c (0)"
Kernel: kernel(float*, float*, int, int)
1 local_load_transactions Local Load Transactions 0 0 0
1 local_store_transactions Local Store Transactions 0 0 0
1 shared_load_transactions Shared Load Transactions 8 8 8
1 shared_store_transactions Shared Store Transactions 8 8 8
1 gld_transactions Global Load Transactions 8 8 8
1 gst_transactions Global Store Transactions 8 8 8
1 sysmem_read_transactions System Memory Read Transactions 32 32 32
1 sysmem_write_transactions System Memory Write Transactions 34 34 34
1 tex_cache_transactions Texture Cache Transactions 0 0 0
1 dram_read_transactions Device Memory Read Transactions 0 0 0
1 dram_write_transactions Device Memory Write Transactions 0 0 0
So it appears the arrays are allocated in system memory rather than device memory. What am I missing here?
Managed memory really does allocate physical memory on the GPU. You can confirm this for yourself by doing something like the following with your code:
#include <iostream>
void report_gpu_mem()
{
size_t free, total;
cudaMemGetInfo(&free, &total);
std::cout << "Free = " << free << " Total = " << total <<std::endl;
}
int main()
{
float* a,* a_out;
size_t sz = 1 << 24; // 16MB
report_gpu_mem();
cudaMallocManaged((void**)&a, sz);
report_gpu_mem();
cudaMallocManaged((void**)&a_out, sz);
report_gpu_mem();
cudaFree(a);
report_gpu_mem();
cudaFree(a_out);
report_gpu_mem();
return cudaDeviceReset();
}
This allocates 16MB for each of the two managed allocations and then frees them. No host or device access takes place, so nothing should trigger a transfer or synchronisation. The size is large enough that it should exceed the minimum granularity of the GPU memory manager and produce a visible change in free memory. Compiling and running it does exactly that:
$ nvcc -arch=sm_52 sleepy.cu
$ CUDA_VISIBLE_DEVICES="0" ./a.out
Free = 4211929088 Total = 4294770688
Free = 4194869248 Total = 4294770688
Free = 4178092032 Total = 4294770688
Free = 4194869248 Total = 4294770688
Free = 4211654656 Total = 4294770688
The physical free memory on the GPU is clearly being decremented and incremented by 16MB on each alloc/free.
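If you want to interrogate a pointer directly rather than infer placement from the free-memory counters, the runtime can also report what kind of allocation it refers to. Below is a minimal sketch, assuming a CUDA 10 or newer toolkit where cudaPointerAttributes exposes a type field (older toolkits used the now-deprecated isManaged flag instead):
#include <cstdio>
#include <cuda_runtime.h>
int main()
{
    float *a = nullptr;
    if (cudaMallocManaged((void**)&a, 1 << 24) != cudaSuccess) {
        std::fprintf(stderr, "cudaMallocManaged failed\n");
        return 1;
    }
    cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, a) == cudaSuccess) {
        // attr.type is cudaMemoryTypeManaged for cudaMallocManaged allocations;
        // attr.device reports the device that owns the backing allocation.
        std::printf("managed = %d, device = %d\n",
                    attr.type == cudaMemoryTypeManaged, attr.device);
    }
    cudaFree(a);
    return cudaDeviceReset();
}
This only confirms that the pointer is a managed allocation and which device owns it; it does not show where the pages physically reside at a given moment, so the cudaMemGetInfo approach above remains the more direct evidence that the allocation consumes device memory.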