推力执行策略将内核发布到默认流
Thrust execution policy issues kernel to default stream
我目前正在设计一个简短的教程,展示 Thrust 模板库的各个方面和功能。
不幸的是,我为展示如何使用 cuda 流来使用 copy/compute 并发而编写的代码似乎存在问题。
我的代码可以在这里找到,在 asynchronousLaunch 目录中:
https://github.com/gnthibault/Cuda_Thrust_Introduction/tree/master/AsynchronousLaunch
这里是生成问题的代码摘要:
//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <vector>
#include <functional>
//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
//Cuda
#include <cuda_runtime.h>
//Local
#include "AsynchronousLaunch.cu.h"
int main( int argc, char* argv[] )
{
const size_t fullSize = 1024*1024*64;
const size_t halfSize = fullSize/2;
//Declare one host std::vector and initialize it with random values
std::vector<float> hostVector( fullSize );
std::generate(hostVector.begin(), hostVector.end(), normalRandomFunctor<float>(0.f,1.f) );
//And two device vector of Half size
thrust::device_vector<float> deviceVector0( halfSize );
thrust::device_vector<float> deviceVector1( halfSize );
//Declare and initialize also two cuda stream
cudaStream_t stream0, stream1;
cudaStreamCreate( &stream0 );
cudaStreamCreate( &stream1 );
//Now, we would like to perform an alternate scheme copy/compute
for( int i = 0; i < 10; i++ )
{
//Wait for the end of the copy to host before starting to copy back to device
cudaStreamSynchronize(stream0);
//Warning: thrust::copy does not handle asynchronous behaviour for host/device copy, you must use cudaMemcpyAsync to do so
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector0.data()), thrust::raw_pointer_cast(hostVector.data()), halfSize*sizeof(float), cudaMemcpyHostToDevice, stream0);
cudaStreamSynchronize(stream1);
//second copy is most likely to occur sequentially after the first one
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector1.data()), thrust::raw_pointer_cast(hostVector.data())+halfSize, halfSize*sizeof(float), cudaMemcpyHostToDevice, stream1);
//Compute on device, here inclusive scan, for histogram equalization for instance
thrust::transform( thrust::cuda::par.on(stream0), deviceVector0.begin(), deviceVector0.end(), deviceVector0.begin(), computeFunctor<float>() );
thrust::transform( thrust::cuda::par.on(stream1), deviceVector1.begin(), deviceVector1.end(), deviceVector1.begin(), computeFunctor<float>() );
//Copy back to host
cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data()), thrust::raw_pointer_cast(deviceVector0.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data())+halfSize, thrust::raw_pointer_cast(deviceVector1.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream1);
}
//Full Synchronize before exit
cudaDeviceSynchronize();
cudaStreamDestroy( stream0 );
cudaStreamDestroy( stream1 );
return EXIT_SUCCESS;
}
以下是通过 nvidia 视觉配置文件观察到的程序实例的结果:
如您所见,cudamemcopy(棕色)都发布到流 13 和 14,但是 Thrust 从 thrust::transform 生成的内核发布到默认流(捕获中的蓝色)
顺便说一句,我使用的是 cuda 工具包版本 7.0.28,GTX680 和 gcc 4.8.2。
如果有人能告诉我我的代码有什么问题,我将不胜感激。
提前致谢
编辑:这是我认为是解决方案的代码:
//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <functional>
#include <vector>
//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
//Cuda
#include <cuda_runtime.h>
//Local definitions
template<typename T>
struct computeFunctor
{
__host__ __device__
computeFunctor() {}
__host__ __device__
T operator()( T in )
{
//Naive functor that generates expensive but useless instructions
T a = cos(in);
for(int i = 0; i < 350; i++ )
{
a+=cos(in);
}
return a;
}
};
int main( int argc, char* argv[] )
{
const size_t fullSize = 1024*1024*2;
const size_t nbOfStrip = 4;
const size_t stripSize = fullSize/nbOfStrip;
//Allocate host pinned memory in order to use asynchronous api and initialize it with random values
float* hostVector;
cudaMallocHost(&hostVector,fullSize*sizeof(float));
std::fill(hostVector, hostVector+fullSize, 1.0f );
//And one device vector of the same size
thrust::device_vector<float> deviceVector( fullSize );
//Declare and initialize also two cuda stream
std::vector<cudaStream_t> vStream(nbOfStrip);
for( auto it = vStream.begin(); it != vStream.end(); it++ )
{
cudaStreamCreate( &(*it) );
}
//Now, we would like to perform an alternate scheme copy/compute in a loop using the copyToDevice/Compute/CopyToHost for each stream scheme:
for( int i = 0; i < 5; i++ )
{
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
size_t nextOffset = stripSize*(j+1);
cudaStreamSynchronize(vStream.at(j));
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector.data())+offset, hostVector+offset, stripSize*sizeof(float), cudaMemcpyHostToDevice, vStream.at(j));
thrust::transform( thrust::cuda::par.on(vStream.at(j)), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
cudaMemcpyAsync(hostVector+offset, thrust::raw_pointer_cast(deviceVector.data())+offset, stripSize*sizeof(float), cudaMemcpyDeviceToHost, vStream.at(j));
}
}
//On devices that do not possess multiple queues copy engine capability, this solution serializes all command even if they have been issued to different streams
//Why ? Because in the point of view of the copy engine, which is a single ressource in this case, there is a time dependency between HtoD(n) and DtoH(n) which is ok, but there is also
// a false dependency between DtoH(n) and HtoD(n+1), that preclude any copy/compute overlap
//Full Synchronize before testing second solution
cudaDeviceSynchronize();
//Now, we would like to perform an alternate scheme copy/compute in a loop using the copyToDevice for each stream /Compute for each stream /CopyToHost for each stream scheme:
for( int i = 0; i < 5; i++ )
{
for( int j=0; j!=nbOfStrip; j++)
{
cudaStreamSynchronize(vStream.at(j));
}
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector.data())+offset, hostVector+offset, stripSize*sizeof(float), cudaMemcpyHostToDevice, vStream.at(j));
}
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
size_t nextOffset = stripSize*(j+1);
thrust::transform( thrust::cuda::par.on(vStream.at(j)), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
}
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
cudaMemcpyAsync(hostVector+offset, thrust::raw_pointer_cast(deviceVector.data())+offset, stripSize*sizeof(float), cudaMemcpyDeviceToHost, vStream.at(j));
}
}
//On device that do not possess multiple queues in the copy engine, this solution yield better results, on other, it should show nearly identic results
//Full Synchronize before exit
cudaDeviceSynchronize();
for( auto it = vStream.begin(); it != vStream.end(); it++ )
{
cudaStreamDestroy( *it );
}
cudaFreeHost( hostVector );
return EXIT_SUCCESS;
}
使用nvcc编译./test.cu -o ./test.exe -std=c++11
我要指出两点。 this related question/answer 中(现在)引用了这两者,您可能希望参考。
在这种情况下thrust未能将底层内核发布到非默认流似乎与this issue. It can be rectified (as covered in the comments to the question) by updating to the latest thrust version有关。未来的 CUDA 版本(超过 7)可能也会包含固定推力。这可能是本题讨论的中心问题。
这个问题似乎也表明目标之一是复制和计算的重叠:
in order to show how to use copy/compute concurrency using cuda streams
但我认为,使用当前编写的代码无法实现,即使上面的第 1 项已修复。复制与计算操作的重叠需要在复制操作中正确使用 cuda 流 (cudaMemcpyAsync
) as well as a pinned host allocation. The code proposed in the question is lacking any use of a pinned host allocation (std::vector
does not use a pinned allocator by default, AFAIK), and so I would not expect the cudaMemcpyAsync
operation to overlap with any kernel activity, even if it should be otherwise possible. To rectify this, a pinned allocator should be used, and one such example is given here.
为了完整起见,问题缺少 MCVE, which is expected for questions of this type。这使得其他人更难尝试测试您的问题,并且明确地是 SO 的一个紧密原因。是的,您向外部 github 存储库提供了一个 link,但这种行为是不受欢迎的。 MCVE 要求明确指出,必要的部分应包含在问题本身(而不是外部参考)中。由于唯一缺少的部分 AFAICT 是 "AsynchronousLaunch.cu.h",因此包含这个似乎相对简单您问题中的另一部分。外部 links 的问题在于,当它们在未来崩溃时,这个问题对未来的读者来说就变得没那么有用了。 (而且,在我看来,强迫其他人浏览外部 github 存储库以查找特定文件不利于获得帮助。)
我目前正在设计一个简短的教程,展示 Thrust 模板库的各个方面和功能。
不幸的是,我为展示如何使用 cuda 流来使用 copy/compute 并发而编写的代码似乎存在问题。
我的代码可以在这里找到,在 asynchronousLaunch 目录中: https://github.com/gnthibault/Cuda_Thrust_Introduction/tree/master/AsynchronousLaunch
这里是生成问题的代码摘要:
//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <vector>
#include <functional>
//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
//Cuda
#include <cuda_runtime.h>
//Local
#include "AsynchronousLaunch.cu.h"
int main( int argc, char* argv[] )
{
const size_t fullSize = 1024*1024*64;
const size_t halfSize = fullSize/2;
//Declare one host std::vector and initialize it with random values
std::vector<float> hostVector( fullSize );
std::generate(hostVector.begin(), hostVector.end(), normalRandomFunctor<float>(0.f,1.f) );
//And two device vector of Half size
thrust::device_vector<float> deviceVector0( halfSize );
thrust::device_vector<float> deviceVector1( halfSize );
//Declare and initialize also two cuda stream
cudaStream_t stream0, stream1;
cudaStreamCreate( &stream0 );
cudaStreamCreate( &stream1 );
//Now, we would like to perform an alternate scheme copy/compute
for( int i = 0; i < 10; i++ )
{
//Wait for the end of the copy to host before starting to copy back to device
cudaStreamSynchronize(stream0);
//Warning: thrust::copy does not handle asynchronous behaviour for host/device copy, you must use cudaMemcpyAsync to do so
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector0.data()), thrust::raw_pointer_cast(hostVector.data()), halfSize*sizeof(float), cudaMemcpyHostToDevice, stream0);
cudaStreamSynchronize(stream1);
//second copy is most likely to occur sequentially after the first one
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector1.data()), thrust::raw_pointer_cast(hostVector.data())+halfSize, halfSize*sizeof(float), cudaMemcpyHostToDevice, stream1);
//Compute on device, here inclusive scan, for histogram equalization for instance
thrust::transform( thrust::cuda::par.on(stream0), deviceVector0.begin(), deviceVector0.end(), deviceVector0.begin(), computeFunctor<float>() );
thrust::transform( thrust::cuda::par.on(stream1), deviceVector1.begin(), deviceVector1.end(), deviceVector1.begin(), computeFunctor<float>() );
//Copy back to host
cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data()), thrust::raw_pointer_cast(deviceVector0.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data())+halfSize, thrust::raw_pointer_cast(deviceVector1.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream1);
}
//Full Synchronize before exit
cudaDeviceSynchronize();
cudaStreamDestroy( stream0 );
cudaStreamDestroy( stream1 );
return EXIT_SUCCESS;
}
以下是通过 nvidia 视觉配置文件观察到的程序实例的结果:
如您所见,cudamemcopy(棕色)都发布到流 13 和 14,但是 Thrust 从 thrust::transform 生成的内核发布到默认流(捕获中的蓝色)
顺便说一句,我使用的是 cuda 工具包版本 7.0.28,GTX680 和 gcc 4.8.2。
如果有人能告诉我我的代码有什么问题,我将不胜感激。
提前致谢
编辑:这是我认为是解决方案的代码:
//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <functional>
#include <vector>
//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
//Cuda
#include <cuda_runtime.h>
//Local definitions
template<typename T>
struct computeFunctor
{
__host__ __device__
computeFunctor() {}
__host__ __device__
T operator()( T in )
{
//Naive functor that generates expensive but useless instructions
T a = cos(in);
for(int i = 0; i < 350; i++ )
{
a+=cos(in);
}
return a;
}
};
int main( int argc, char* argv[] )
{
const size_t fullSize = 1024*1024*2;
const size_t nbOfStrip = 4;
const size_t stripSize = fullSize/nbOfStrip;
//Allocate host pinned memory in order to use asynchronous api and initialize it with random values
float* hostVector;
cudaMallocHost(&hostVector,fullSize*sizeof(float));
std::fill(hostVector, hostVector+fullSize, 1.0f );
//And one device vector of the same size
thrust::device_vector<float> deviceVector( fullSize );
//Declare and initialize also two cuda stream
std::vector<cudaStream_t> vStream(nbOfStrip);
for( auto it = vStream.begin(); it != vStream.end(); it++ )
{
cudaStreamCreate( &(*it) );
}
//Now, we would like to perform an alternate scheme copy/compute in a loop using the copyToDevice/Compute/CopyToHost for each stream scheme:
for( int i = 0; i < 5; i++ )
{
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
size_t nextOffset = stripSize*(j+1);
cudaStreamSynchronize(vStream.at(j));
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector.data())+offset, hostVector+offset, stripSize*sizeof(float), cudaMemcpyHostToDevice, vStream.at(j));
thrust::transform( thrust::cuda::par.on(vStream.at(j)), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
cudaMemcpyAsync(hostVector+offset, thrust::raw_pointer_cast(deviceVector.data())+offset, stripSize*sizeof(float), cudaMemcpyDeviceToHost, vStream.at(j));
}
}
//On devices that do not possess multiple queues copy engine capability, this solution serializes all command even if they have been issued to different streams
//Why ? Because in the point of view of the copy engine, which is a single ressource in this case, there is a time dependency between HtoD(n) and DtoH(n) which is ok, but there is also
// a false dependency between DtoH(n) and HtoD(n+1), that preclude any copy/compute overlap
//Full Synchronize before testing second solution
cudaDeviceSynchronize();
//Now, we would like to perform an alternate scheme copy/compute in a loop using the copyToDevice for each stream /Compute for each stream /CopyToHost for each stream scheme:
for( int i = 0; i < 5; i++ )
{
for( int j=0; j!=nbOfStrip; j++)
{
cudaStreamSynchronize(vStream.at(j));
}
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector.data())+offset, hostVector+offset, stripSize*sizeof(float), cudaMemcpyHostToDevice, vStream.at(j));
}
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
size_t nextOffset = stripSize*(j+1);
thrust::transform( thrust::cuda::par.on(vStream.at(j)), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
}
for( int j=0; j!=nbOfStrip; j++)
{
size_t offset = stripSize*j;
cudaMemcpyAsync(hostVector+offset, thrust::raw_pointer_cast(deviceVector.data())+offset, stripSize*sizeof(float), cudaMemcpyDeviceToHost, vStream.at(j));
}
}
//On device that do not possess multiple queues in the copy engine, this solution yield better results, on other, it should show nearly identic results
//Full Synchronize before exit
cudaDeviceSynchronize();
for( auto it = vStream.begin(); it != vStream.end(); it++ )
{
cudaStreamDestroy( *it );
}
cudaFreeHost( hostVector );
return EXIT_SUCCESS;
}
使用nvcc编译./test.cu -o ./test.exe -std=c++11
我要指出两点。 this related question/answer 中(现在)引用了这两者,您可能希望参考。
在这种情况下thrust未能将底层内核发布到非默认流似乎与this issue. It can be rectified (as covered in the comments to the question) by updating to the latest thrust version有关。未来的 CUDA 版本(超过 7)可能也会包含固定推力。这可能是本题讨论的中心问题。
这个问题似乎也表明目标之一是复制和计算的重叠:
in order to show how to use copy/compute concurrency using cuda streams
但我认为,使用当前编写的代码无法实现,即使上面的第 1 项已修复。复制与计算操作的重叠需要在复制操作中正确使用 cuda 流 (
cudaMemcpyAsync
) as well as a pinned host allocation. The code proposed in the question is lacking any use of a pinned host allocation (std::vector
does not use a pinned allocator by default, AFAIK), and so I would not expect thecudaMemcpyAsync
operation to overlap with any kernel activity, even if it should be otherwise possible. To rectify this, a pinned allocator should be used, and one such example is given here.
为了完整起见,问题缺少 MCVE, which is expected for questions of this type。这使得其他人更难尝试测试您的问题,并且明确地是 SO 的一个紧密原因。是的,您向外部 github 存储库提供了一个 link,但这种行为是不受欢迎的。 MCVE 要求明确指出,必要的部分应包含在问题本身(而不是外部参考)中。由于唯一缺少的部分 AFAICT 是 "AsynchronousLaunch.cu.h",因此包含这个似乎相对简单您问题中的另一部分。外部 links 的问题在于,当它们在未来崩溃时,这个问题对未来的读者来说就变得没那么有用了。 (而且,在我看来,强迫其他人浏览外部 github 存储库以查找特定文件不利于获得帮助。)