CUDA 分配/内存:大图像的耗时
CUDA Allocation/Memory Time of Large Image
我正在使用 CUDA 执行图像处理。根据我的计时结果,分配耗时最长。一张大图像需要 0.00908 秒才能完成显存分配并将数据复制到 GPU 内存中。
这是正常的时间吗?我做错了什么吗?
// Times one device allocation plus the host-to-device upload of an image
// buffer and prints the elapsed time in seconds.
// `width`, `height` and `Data` are declared outside this fragment.
// NOTE(review): clock() is a host-side timer; if this is the first CUDA call
// in the process, the measurement presumably also includes one-time runtime
// initialization -- confirm by issuing a warm-up call before timing.
clock_t t = clock();
float * dData;
// Do the size arithmetic in size_t: multiplying width*height in a narrower
// type and storing the result in `unsigned int` can overflow/truncate for
// large images.
size_t nBytes = (size_t)width * height * sizeof(float);
cudaMalloc( (void**)&dData, nBytes );
cudaMemcpy( dData, Data, nBytes, cudaMemcpyHostToDevice );
t = clock()-t;
printf( "Allocation to device: %f\n", ((float)t/CLOCKS_PER_SEC) );
请确保使用 Release(发布)模式而不是 Debug(调试)模式编译。下面输出中的大小单位采用 JEDEC 约定(1 KB = 1024 字节,1 MB = 1024 KB)。
#include <stdio.h>
#include <cuda.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// main routine
//
// Benchmarks cudaMalloc + host-to-device cudaMemcpy for buffers of
// 32, 64, ... , 2^29 floats (doubling each step) and prints the elapsed time
// reported by CUDA events for each size.
// Returns 0 on success, 1 if any CUDA call fails.
int main ()
{
    float time = 0.0f;
    cudaEvent_t start, stop;

    // Create the timing events once and reuse them; creating a new pair on
    // every iteration without destroying them leaks event objects.
    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return 1;
    }
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    int status = 0;
    for (size_t size = 32; size < 1024*1024*1024; size *= 2) {
        const size_t bytes = size * sizeof(float);
        float* d_Data = NULL;
        // Value-initialize so the copy does not read indeterminate data.
        float* h_Data = new float[size]();

        // The timed region covers the allocation and the upload. cudaMemcpy
        // takes a byte count, so pass `bytes`, not the element count.
        const bool ok =
            cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)") &&
            cudaOk(cudaMalloc((void**)&d_Data, bytes), "cudaMalloc") &&
            cudaOk(cudaMemcpy(d_Data, h_Data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy") &&
            cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
            cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)") &&
            cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize") &&
            cudaOk(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");

        if (ok) {
            // The unit is chosen from the element count (not the byte count),
            // as in the original, so the labels of existing output are kept.
            // size_t arguments need %zu; %d is undefined behaviour here.
            if (size > 1024*1024) {
                printf( "Allocation to device: %fms with size %zuMB\n", time, bytes/(1024*1024) );
            } else if (size > 1024) {
                printf( "Allocation to device: %fms with size %zuKB\n", time, bytes/1024 );
            } else {
                printf( "Allocation to device: %fms with size %zuB\n", time, bytes );
            }
        }

        delete[] h_Data;
        cudaFree(d_Data);   // d_Data is NULL if cudaMalloc never succeeded; freeing NULL is a no-op

        if (!ok) {
            status = 1;
            break;
        }
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return status;
}
我得到的时间是:
Allocation to device: 0.017504ms with size 128B
Allocation to device: 0.012608ms with size 256B
Allocation to device: 0.462656ms with size 512B
Allocation to device: 0.386432ms with size 1024B
Allocation to device: 0.492512ms with size 2048B
Allocation to device: 0.409568ms with size 4096B
Allocation to device: 0.419648ms with size 8KB
Allocation to device: 0.402144ms with size 16KB
Allocation to device: 0.562240ms with size 32KB
Allocation to device: 0.460480ms with size 64KB
Allocation to device: 0.409376ms with size 128KB
Allocation to device: 0.492864ms with size 256KB
Allocation to device: 0.611424ms with size 512KB
Allocation to device: 0.577376ms with size 1024KB
Allocation to device: 0.722240ms with size 2048KB
Allocation to device: 1.174336ms with size 4096KB
Allocation to device: 0.995552ms with size 8MB
Allocation to device: 2.030592ms with size 16MB
Allocation to device: 3.876384ms with size 32MB
Allocation to device: 7.414432ms with size 64MB
Allocation to device: 15.325792ms with size 128MB
Allocation to device: 31.763008ms with size 256MB
Allocation to device: 65.624481ms with size 512MB
Allocation to device: 133.767838ms with size 1024MB
Allocation to device: 272.001404ms with size 2048MB
在 K20x 和 8 核 Ivy Bridge Xeon 上
我正在使用 CUDA 执行图像处理。根据我的计时结果,分配耗时最长。一张大图像需要 0.00908 秒才能完成显存分配并将数据复制到 GPU 内存中。
这是正常的时间吗?我做错了什么吗?
// Times one device allocation plus the host-to-device upload of an image
// buffer and prints the elapsed time in seconds.
// `width`, `height` and `Data` are declared outside this fragment.
// NOTE(review): clock() is a host-side timer; if this is the first CUDA call
// in the process, the measurement presumably also includes one-time runtime
// initialization -- confirm by issuing a warm-up call before timing.
clock_t t = clock();
float * dData;
// Do the size arithmetic in size_t: multiplying width*height in a narrower
// type and storing the result in `unsigned int` can overflow/truncate for
// large images.
size_t nBytes = (size_t)width * height * sizeof(float);
cudaMalloc( (void**)&dData, nBytes );
cudaMemcpy( dData, Data, nBytes, cudaMemcpyHostToDevice );
t = clock()-t;
printf( "Allocation to device: %f\n", ((float)t/CLOCKS_PER_SEC) );
请确保使用 Release(发布)模式而不是 Debug(调试)模式编译。下面输出中的大小单位采用 JEDEC 约定(1 KB = 1024 字节,1 MB = 1024 KB)。
#include <stdio.h>
#include <cuda.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// main routine
//
// Benchmarks cudaMalloc + host-to-device cudaMemcpy for buffers of
// 32, 64, ... , 2^29 floats (doubling each step) and prints the elapsed time
// reported by CUDA events for each size.
// Returns 0 on success, 1 if any CUDA call fails.
int main ()
{
    float time = 0.0f;
    cudaEvent_t start, stop;

    // Create the timing events once and reuse them; creating a new pair on
    // every iteration without destroying them leaks event objects.
    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return 1;
    }
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    int status = 0;
    for (size_t size = 32; size < 1024*1024*1024; size *= 2) {
        const size_t bytes = size * sizeof(float);
        float* d_Data = NULL;
        // Value-initialize so the copy does not read indeterminate data.
        float* h_Data = new float[size]();

        // The timed region covers the allocation and the upload. cudaMemcpy
        // takes a byte count, so pass `bytes`, not the element count.
        const bool ok =
            cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)") &&
            cudaOk(cudaMalloc((void**)&d_Data, bytes), "cudaMalloc") &&
            cudaOk(cudaMemcpy(d_Data, h_Data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy") &&
            cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
            cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)") &&
            cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize") &&
            cudaOk(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");

        if (ok) {
            // The unit is chosen from the element count (not the byte count),
            // as in the original, so the labels of existing output are kept.
            // size_t arguments need %zu; %d is undefined behaviour here.
            if (size > 1024*1024) {
                printf( "Allocation to device: %fms with size %zuMB\n", time, bytes/(1024*1024) );
            } else if (size > 1024) {
                printf( "Allocation to device: %fms with size %zuKB\n", time, bytes/1024 );
            } else {
                printf( "Allocation to device: %fms with size %zuB\n", time, bytes );
            }
        }

        delete[] h_Data;
        cudaFree(d_Data);   // d_Data is NULL if cudaMalloc never succeeded; freeing NULL is a no-op

        if (!ok) {
            status = 1;
            break;
        }
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return status;
}
我得到的时间是:
Allocation to device: 0.017504ms with size 128B
Allocation to device: 0.012608ms with size 256B
Allocation to device: 0.462656ms with size 512B
Allocation to device: 0.386432ms with size 1024B
Allocation to device: 0.492512ms with size 2048B
Allocation to device: 0.409568ms with size 4096B
Allocation to device: 0.419648ms with size 8KB
Allocation to device: 0.402144ms with size 16KB
Allocation to device: 0.562240ms with size 32KB
Allocation to device: 0.460480ms with size 64KB
Allocation to device: 0.409376ms with size 128KB
Allocation to device: 0.492864ms with size 256KB
Allocation to device: 0.611424ms with size 512KB
Allocation to device: 0.577376ms with size 1024KB
Allocation to device: 0.722240ms with size 2048KB
Allocation to device: 1.174336ms with size 4096KB
Allocation to device: 0.995552ms with size 8MB
Allocation to device: 2.030592ms with size 16MB
Allocation to device: 3.876384ms with size 32MB
Allocation to device: 7.414432ms with size 64MB
Allocation to device: 15.325792ms with size 128MB
Allocation to device: 31.763008ms with size 256MB
Allocation to device: 65.624481ms with size 512MB
Allocation to device: 133.767838ms with size 1024MB
Allocation to device: 272.001404ms with size 2048MB
在 K20x 和 8 核 Ivy Bridge Xeon 上