Using thrust is slower than my own kernel?
EDIT
I changed the code as Robert suggested, but thrust is still much slower.
The data I use come from two .dat files, so they are omitted from the code.
Original question
I have two complex vectors on a Tesla M6 GPU. I want to compute the element-wise product of the two vectors, i.e. [x1*y1,...,xN*yN].
The length of both vectors is N = 720,896.
Code snippet (modified)
I solve this problem in two ways.
One is using thrust, with a type conversion and a dedicated functor:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sys/time.h>
#include "cuda_runtime.h"
#include "cuComplex.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/complex.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
using namespace std;

typedef thrust::complex<float> comThr;

// ---- functor for thrust ---- //
struct Complex_Mul_Complex : public thrust::binary_function<comThr, comThr, comThr>
{
    __host__ __device__
    comThr operator() (comThr a, comThr b) const {
        return a * b;
    }
};

// ---- my kernel function ---- //
__global__ void HardamarProductOnDeviceCC(cuComplex *Result, cuComplex *A, cuComplex *B, int N)
{
    unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;
    if (index >= N)
        return;
    Result[index].x = cuCmulf(A[index], B[index]).x;
    Result[index].y = cuCmulf(A[index], B[index]).y;
}

// ---- timing function ---- //
double cpuSecond()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

int main()
{
    int N = 720896;
    cuComplex *d_Data1, *d_Data2;
    cudaMalloc(&d_Data1, N * sizeof(d_Data1[0]));
    cudaMalloc(&d_Data2, N * sizeof(d_Data2[0]));

    /************************************
     * Version 1: type conversion twice *
     ************************************/
    // step 1: type convert (cuComplex -> thrust)
    comThr *thr_temp1 = reinterpret_cast<comThr*>(d_Data1);
    thrust::device_ptr<comThr> thr_d_Data1 = thrust::device_pointer_cast(thr_temp1);
    comThr *thr_temp2 = reinterpret_cast<comThr*>(d_Data2);
    thrust::device_ptr<comThr> thr_d_Data2 = thrust::device_pointer_cast(thr_temp2);

    // step 2: product and timing
    Complex_Mul_Complex op_dot;
    double iStart = cpuSecond();          // timing function
    for (int i = 0; i < 1000; i++) {      // loop 1000 times to get accurate time consumption
        thrust::transform(thrust::device, thr_d_Data1, thr_d_Data1 + N,
                          thr_d_Data2, thr_d_Data1, op_dot);
    }
    cudaDeviceSynchronize();
    double iElapse = cpuSecond() - iStart;
    cout << "thrust time consume: " << iElapse << endl;

    /************************************************
     * Version 2: dot product using kernel function *
     ************************************************/
    int blockSize;
    int minGridSize;
    int gridSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, HardamarProductOnDeviceCC, 0, 0);
    gridSize = (N + blockSize - 1) / blockSize;
    dim3 grid(gridSize);
    dim3 block(blockSize);

    iStart = cpuSecond();
    for (int i = 0; i < 1000; i++) {
        HardamarProductOnDeviceCC<<<grid, block>>>(d_Data1, d_Data1, d_Data2, N);
    }
    cudaDeviceSynchronize();
    iElapse = cpuSecond() - iStart;
    cout << "kernel time consume: " << iElapse << endl;
}
Result:
thrust time consume: 25.6063
kernel time consume: 2.87929
My question
Even after adding cudaDeviceSynchronize(), the thrust version still appears to be much slower than the kernel version.
There is a "golden rule" to use libraries instead of writing your own kernel functions, but I would like to know why the thrust version is slower in this case.
CUDA kernel launches are asynchronous. This means that control is returned to the host thread, so that it can proceed with the next line of code after the kernel launch, before the kernel has even begun to execute.
This is covered in numerous questions under the cuda tag. It is a common mistake when timing CUDA code, and it affects the way you are timing the thrust code as well as the way you are timing the ordinary CUDA code. The usual solution is to insert a cudaDeviceSynchronize() call before closing the timing region. This guarantees that all CUDA activity has completed when you finish your timing measurement.
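To make the point concrete, here is a minimal sketch of the wrong and the right way to close a host-timer region. It is only illustrative and reuses the kernel, the device pointers, the launch configuration, and cpuSecond() from the listing above:
// Wrong: the timer is read right after the (asynchronous) launches return,
// so it mostly measures launch overhead, not kernel execution time.
double t0 = cpuSecond();
for (int i = 0; i < 1000; i++)
    HardamarProductOnDeviceCC<<<grid, block>>>(d_Data1, d_Data1, d_Data2, N);
double tWrong = cpuSecond() - t0;

// Right: synchronize before closing the timing region, so that all queued
// GPU work has finished when the timer is read.
t0 = cpuSecond();
for (int i = 0; i < 1000; i++)
    HardamarProductOnDeviceCC<<<grid, block>>>(d_Data1, d_Data1, d_Data2, N);
cudaDeviceSynchronize();
double tRight = cpuSecond() - t0;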
When I turn what you have shown into a complete code, with proper timing methodology, the thrust code is actually faster. Your kernel design is inefficient: it evaluates the complex product twice per element, once for the real part and once for the imaginary part, instead of computing it once and storing the whole value. Here is my version of your code, run with CUDA 10 on a Tesla P100, showing that the timing between the two cases is nearly identical:
$ cat t469.cu
#include <thrust/transform.h>
#include <thrust/complex.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <cuComplex.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>

#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

typedef thrust::complex<float> comThr;

struct Complex_Mul_Complex : public thrust::binary_function<comThr, comThr, comThr>
{
    __host__ __device__
    comThr operator() (comThr a, comThr b) const{
        return a*b;
    }
};

double cpuSecond(){
    long long dt = dtime_usec(0);
    return dt/(double)USECPSEC;
}

__global__ void HardamarProductOnDeviceCC(cuComplex *Result, cuComplex *A, cuComplex *B, int N)
{
    unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;
    if(index < N)
        Result[index] = cuCmulf(A[index],B[index]);
}

int main(){
    int N = 720896;
    cuComplex *d_Data1, *d_Data2;
    cudaMalloc(&d_Data1, N*sizeof(d_Data1[0]));
    cudaMalloc(&d_Data2, N*sizeof(d_Data2[0]));
    // step 1: type convert (cuComplex->thrust)
    comThr *thr_temp1 = reinterpret_cast<comThr*>(d_Data1);
    thrust::device_ptr<comThr> thr_d_Data1 = thrust::device_pointer_cast(thr_temp1);
    comThr *thr_temp2 = reinterpret_cast<comThr*>(d_Data2);
    thrust::device_ptr<comThr> thr_d_Data2 = thrust::device_pointer_cast(thr_temp2);
    // step 2: product and timing
    Complex_Mul_Complex op_dot;
    double iStart = cpuSecond(); // timing function
    for(int i=0;i<1000;i++){ // loop 1000 times to get accurate time consumption
        thrust::transform(thrust::device,thr_d_Data1,thr_d_Data1+N,
                          thr_d_Data2,thr_d_Data1,op_dot);
    }
    cudaDeviceSynchronize();
    double iElapse = cpuSecond() - iStart;
    std::cout << "thrust time consume: " << iElapse << std::endl;
    int blockSize;
    int minGridSize;
    int gridSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, HardamarProductOnDeviceCC, 0, 0);
    gridSize = (N+blockSize-1)/blockSize;
    std::cout << "gridsize: " << gridSize << " blocksize: " << blockSize << std::endl;
    dim3 grid(gridSize);
    dim3 block(blockSize);
    iStart = cpuSecond();
    for(int i=0;i<1000;i++){
        HardamarProductOnDeviceCC<<<grid,block>>>(d_Data1,d_Data1,d_Data2,N);
    }
    cudaDeviceSynchronize();
    iElapse = cpuSecond() - iStart;
    std::cout << "kernel time consume: " << iElapse << std::endl;
}
$ nvcc -o t469 t469.cu
$ ./t469
thrust time consume: 0.033937
gridsize: 704 blocksize: 1024
kernel time consume: 0.0337021
$
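As a cross-check of the host-timer measurements above, the same kernel loop can also be timed with CUDA events. This is only a minimal sketch; it reuses d_Data1, d_Data2, grid, block, and N from the listing above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for (int i = 0; i < 1000; i++){
    HardamarProductOnDeviceCC<<<grid, block>>>(d_Data1, d_Data1, d_Data2, N);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait until all recorded work has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
std::cout << "kernel time (events): " << ms / 1000.0f << " s" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);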
Note: providing a complete code was important for me in order to demonstrate the correctness of my answer. If you want help from others, my suggestion is to provide a complete code, rather than bits and pieces that have to be assembled and then turned into a complete code by adding includes and so on. You are welcome to do as you wish, of course, but if you make it easier for others to help you, you may find that you get help more easily.