CUDA: slower thrust::reduce after calling thrust::for_each_n
I am trying to sum numbers with thrust on a GK107 [GeForce GTX 650]. I was puzzled to observe that the execution time of thrust::reduce increases significantly after initializing a device_vector<curandState> in memory.
Sample code follows:
#include <iostream>
#include <stack>
#include <ctime>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <curand.h>
#include <curand_kernel.h>
struct tic_toc{
    std::stack<clock_t> tictoc_stack;
    inline void tic() { tictoc_stack.push(clock()); }
    inline void toc() {
        std::cout << "Time elapsed: "
                  << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC << "s"
                  << std::endl;
        tictoc_stack.pop();
    }
};
struct curand_setup{
    using init_tuple = thrust::tuple<int, curandState &>;
    const unsigned long long seed;
    curand_setup(unsigned long long _seed) : seed(_seed) {}
    __device__ void operator()(init_tuple t){
        curandState s;
        int id = thrust::get<0>(t);
        curand_init(seed, id, 0, &s);
        thrust::get<1>(t) = s;
    }
};
int main(int argc, char** argv){
    int N = 1<<18;
    std::cout << "N " << N << std::endl;
    tic_toc tt;
    thrust::device_vector<float> val(N,1);
    tt.tic();
    float mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();
    thrust::device_vector<curandState> rand_state(N);
    auto rand_init_it = thrust::make_zip_iterator(
        thrust::make_tuple(thrust::counting_iterator<int>(0),rand_state.begin()));
    thrust::for_each_n(rand_init_it, N, curand_setup(0));
    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();
    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();
    return 0;
}
The output is:
Time elapsed: 0.000594s
Time elapsed: 5.60026s
Time elapsed: 0.001098s
Writing my own kernel for the summation, or copying the data to a thrust::host_vector and reducing it there, did not change the situation.
Why is thrust::reduce so slow right after initializing the thrust::device_vector<curandState>, and is there any way to avoid this problem? Any help would be appreciated.
My system is Linux Mint 18.3 with kernel 4.15.0-23-generic.
Output of nvcc --version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
Why is thrust::reduce so slow just after initializing thrust::device_vector<curandState>?
It isn't. The source of your confusion is your time measurement, which is incorrect.
In general, Thrust API calls that operate on the device are asynchronous on the host. The only exceptions are calls that return a value (thrust::reduce is one of them). As a result, the middle timed call in your code measures not only the execution time of thrust::reduce, but also that of the preceding thrust::for_each_n call, which is far slower.
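(As an aside, not part of the original answer: one way to time only the device-side duration of a Thrust call, independent of any still-pending asynchronous work, is with CUDA events recorded on the same stream. A minimal sketch, assuming everything runs on the default stream and reusing the question's mean, val and N:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);                  // queued behind any pending work, e.g. the for_each_n
mean = thrust::reduce(val.begin(), val.end(), 0.f, thrust::plus<float>()) / N;
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait until the stop event has been reached

float ms = 0.f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed device time in milliseconds
std::cout << "reduce took " << ms << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);

Because the start event only receives its timestamp once the preceding work on the stream has finished, the measured interval covers essentially just the reduce itself.)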
You can confirm this for yourself in two ways. If you modify your Thrust code like this:
tt.tic();
float mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
tt.toc();
thrust::device_vector<curandState> rand_state(N);
auto rand_init_it = thrust::make_zip_iterator(
thrust::make_tuple(thrust::counting_iterator<int>(0),rand_state.begin()));
thrust::for_each_n(rand_init_it, N, curand_setup(0));
cudaDeviceSynchronize(); // wait until for_each is complete
tt.tic();
mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
tt.toc();
tt.tic();
mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
tt.toc();
you should get something like this:
$ nvcc -arch=sm_52 -std=c++11 -o slow_thrust slow_thrust.cu
$ ./slow_thrust
N 262144
Time elapsed: 0.000471s
Time elapsed: 0.000621s
Time elapsed: 0.000448s
i.e. when you use cudaDeviceSynchronize() to capture the run time of the preceding call, all of the reduce calls take roughly the same time. Alternatively, you can run a profiling tool on your original code, for example:
$ nvprof --print-gpu-trace ./slow_thrust
N 262144
==7870== NVPROF is profiling process 7870, command: ./slow_thrust
Time elapsed: 0.000521s
Time elapsed: 0.06983s
Time elapsed: 0.000538s
==7870== Profiling application: ./slow_thrust
==7870== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
214.30ms 7.6800us (512 1 1) (256 1 1) 8 0B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>(thrust::device_ptr<float>, float) [109]
214.56ms 5.8550us (52 1 1) (256 1 1) 29 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [128]
214.58ms 2.7200us (1 1 1) (256 1 1) 27 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [136]
214.60ms 1.1840us - - - - - 4B 3.2219MB/s Device Pageable GeForce GTX 970 1 7 [CUDA memcpy DtoH]
214.98ms 221.27us (512 1 1) (256 1 1) 20 0B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>(thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW) [151]
219.51ms 69.492ms (512 1 1) (256 1 1) 108 0B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>, thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>(thrust::use_default, thrust::use_default) [160]
289.00ms 9.5360us (52 1 1) (256 1 1) 29 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [179]
289.01ms 3.4880us (1 1 1) (256 1 1) 27 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [187]
289.07ms 1.3120us - - - - - 4B 2.9075MB/s Device Pageable GeForce GTX 970 1 7 [CUDA memcpy DtoH]
289.66ms 9.9520us (52 1 1) (256 1 1) 29 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [211]
289.68ms 3.3280us (1 1 1) (256 1 1) 27 44B 0B - - - - GeForce GTX 970 1 7 void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [219]
289.69ms 1.3120us - - - - - 4B 2.9075MB/s Device Pageable GeForce GTX 970 1 7 [CUDA memcpy DtoH]
There you can see that the three calls which make up one reduce operation take a cumulative 8-13 microseconds each time, whereas the for_each_n takes 69 milliseconds to complete.
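(Also not part of the original answer: if you prefer to keep a host-side timer, a hypothetical variant of the question's tic_toc helper, called tic_toc_sync here, that synchronizes the device around each measurement avoids charging pending asynchronous work to the wrong timed region. A minimal sketch, using the same headers as the question's program:

struct tic_toc_sync{
    std::stack<clock_t> tictoc_stack;
    inline void tic() {
        cudaDeviceSynchronize();   // drain any pending device work first
        tictoc_stack.push(clock());
    }
    inline void toc() {
        cudaDeviceSynchronize();   // make sure the timed work has actually finished
        std::cout << "Time elapsed: "
                  << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC << "s"
                  << std::endl;
        tictoc_stack.pop();
    }
};

With this helper, the middle measurement in the original program would report only the reduce, and the cost of the for_each_n would show up where it actually occurs.)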