Device -> host vs host -> device copy performance in CUDA
I am new to CUDA and my first task is to implement performance metrics.
I noticed that copying data from host to device using a thrust vector takes less time than copying it from device to host. Can anyone explain why?
int dimension = 1000000;
thrust::host_vector<int> host_Table(dimension);
tic2 = get_time();
thrust::device_vector<int> device_Table = host_Table;
toc2 = get_time();
tic3 = get_time();
thrust::host_vector<int> host_TableCopiedFromDevice = device_Table;
toc3 = get_time();
The difference between toc2-tic2 and toc3-tic3 is very large.
Thanks
First of all, instead of using a CPU timer, remember that it is better to use the CUDA Event API for timing measurements. You may also want to consider a warmup call before the timing (see here for more information). I think @Robert Crovella has already answered your question in his comment, pointing out that the vector instantiation is probably the reason for the time difference. But to prove it, I did a simple test in which I measured the device-to-host (D2H) and host-to-device (H2D) transfer times in two cases, with and without the vector allocation. Consider this code, which is basically equal to yours:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
int main(){
int dimension = 1000000;
// Some dummy vector to wake up device
thrust::device_vector<int> dummy_vec (dimension, 1);
// Create a Cuda event
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float elapsed = 0; // time in ms
thrust::host_vector<int> host_Table(dimension);
// H2D:
cudaEventRecord(start);
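// Note: the timed region includes constructing device_Table (a device allocation), not just the copy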
thrust::device_vector<int> device_Table = host_Table;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"H2D elapsed time: " << elapsed << " ms"<< std::endl;
// D2H:
cudaEventRecord(start);
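// Note: the timed region includes constructing host_TableCopiedFromDevice (a pageable host allocation), not just the copy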
thrust::host_vector<int> host_TableCopiedFromDevice = device_Table;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"D2H elapsed time: " << elapsed << " ms"<< std::endl;
}
Running this on a Titan Black (Ubuntu, CUDA 10.1) gives the following timing values:
H2D elapsed time: 1.76941 ms
D2H elapsed time: 3.80643 ms
There you go: the D2H time is almost twice the H2D time. Keep in mind that in this version each timed region also pays for constructing the destination vector. Now, the same code, but with the vectors allocated before the transfers:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
int main(){
int dimension = 1000000;
// Some dummy vector to wake up device
thrust::device_vector<int> dummy_vec (dimension, 1);
// Create a Cuda event
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float elapsed = 0; // time in ms
// Vectors allocated and initialized up front, so the timed regions measure only the copies
thrust::host_vector<int> h_vec(dimension, 1);
thrust::device_vector<int> d_vec(dimension);
thrust::host_vector<int> h_vec_2(dimension);
// H2D:
cudaEventRecord(start);
d_vec = h_vec;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"H2D elapsed time: " << elapsed << " ms"<< std::endl;
// D2H:
cudaEventRecord(start);
h_vec_2 = d_vec;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"D2H elapsed time: " << elapsed << " ms"<< std::endl;
}
This gives:
H2D elapsed time: 1.7777 ms
D2H elapsed time: 1.54707 ms
If we exclude other factors, this confirms that the H2D and D2H memory transfers are actually about the same. Another investigation that may give you some hints is to change dimension to smaller/larger values and see how that changes the time difference; a sketch of such a sweep follows.
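To follow up on that suggestion, here is a minimal sketch of such a sweep. It reuses the pre-allocated setup from the second test above; the time_copies helper and the particular sizes are my own additions, not part of the original tests:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <initializer_list>
#include <iostream>
// Time one H2D and one D2H copy of `dimension` ints, with all vectors
// allocated before the timed regions (same approach as the second test)
void time_copies(int dimension){
    thrust::host_vector<int> h_vec(dimension, 1);
    thrust::device_vector<int> d_vec(dimension);
    thrust::host_vector<int> h_vec_2(dimension);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float h2d = 0, d2h = 0; // times in ms
    // H2D copy only; the destination is already allocated
    cudaEventRecord(start);
    d_vec = h_vec;
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&h2d, start, stop);
    // D2H copy only
    cudaEventRecord(start);
    h_vec_2 = d_vec;
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&d2h, start, stop);
    std::cout << "dimension " << dimension << ": H2D " << h2d
              << " ms, D2H " << d2h << " ms" << std::endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
int main(){
    // Some dummy vector to wake up device
    thrust::device_vector<int> dummy_vec(1000000, 1);
    for (int dimension : {1000, 100000, 1000000, 10000000})
        time_copies(dimension);
}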