使用 thrust::for_each() 调用实现 CUDA 流时访问结构的结果

Question

我是 C++ 和 CUDA 编码的新手，并且编写了一个我希望并行化的程序，因为根据 NSIGHT 分析器，它目前仅使用 25% 的 GPU。

下面，我编写了一个玩具程序来尝试使用 thrust::for_each() 实现 cuda 流，但我似乎无法修改数组。我习惯于 thrust::transform() ，其中在调用中提供了 return 数组。

我找到的所有示例要么只是使用 for_each 调用来打印，要么不对修改后的数组执行任何操作。

当我运行这个程序时，它只是 return 一个零数组。

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <iterator>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/tuple.h>
#include <thrust/count.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <ctime>
#include <cstdio>
#include <cassert>

using namespace std;

//define typedef for iterators for shorthand
typedef thrust::device_vector<float>::iterator normIter;
typedef thrust::device_vector<float>::iterator deltaIter;
typedef thrust::device_vector<float>::iterator gammaskIter;
typedef thrust::device_vector<float>::iterator zetmaskIter;
typedef thrust::device_vector<float>::iterator zetvalIter;
//typedef thrust::zip_iterator<tpl2intiter>  idxzip;

//typedef a tuple of these iterators
typedef thrust::tuple<normIter, deltaIter, gammaskIter, zetmaskIter,     zetvalIter> IteratorTuple;

//typedef the zip_iterator for this tuple
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;

//structure that takes takes the absolute value of a given number
template<typename T>
struct my_function
{
    cudaStream_t s;

    my_function(cudaStream_t s) : s(s) {}
    __host__ __device__ float operator()(thrust::tuple<float, float> x)
    {
        float y = thrust::get<0>(x);
        return thrust::get<1>(x) = y + 5;
    }
};

int main() {
    clock_t start;
    double duration;
    start = clock();


    thrust::device_vector<float> d_fraction(5);
    d_fraction[0] = 1;
    d_fraction[1] = 5;
    d_fraction[2] = 3;
    d_fraction[3] = 2;
    d_fraction[4] = 4;

    thrust::device_vector<float> d_fraction2(5);
    d_fraction2[0] = 0.00;
    d_fraction2[1] = 0.04;
    d_fraction2[2] = 0.08;
    d_fraction2[3] = 0.12;
    d_fraction2[4] = 0.16;

    cout << "original" << endl;
    int f = 0;
    while (f < 5){
        cout << d_fraction[f] << endl;
        f++;
    }
    cout << "original" << endl;
    int y = 0;
    while (y < 5){
        cout << d_fraction2[y] << endl;
        y++;
    }


    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    thrust::device_vector<float> result1(5);
    thrust::device_vector<float> result2(5);

    thrust::for_each(thrust::cuda::par.on(s1), thrust::make_zip_iterator(thrust::make_tuple(d_fraction.begin(), result1.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_fraction.end(), result1.end())), my_function<float>(s1));

    thrust::for_each(thrust::cuda::par.on(s2), thrust::make_zip_iterator(thrust::make_tuple(d_fraction2.begin(), result2.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(d_fraction2.end(), result2.end())), my_function<float>(s2));


    cudaStreamSynchronize(s1);
    cudaStreamSynchronize(s2);

    cout << "norm" << endl;
    int i = 0;
    while (i < 5){
        cout << result1[i] << endl;
        i++;
    }
    cout << "dut" << endl;
    int a = 0;
    while (a < 5){
        cout << result2[a] << endl;
        a++;
    }
    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);

    duration = (clock() - start) / (double)CLOCKS_PER_SEC;
    cout << "time " << duration << endl;
    cin.get();
    return 0;
}

Answer 1

对于 thrust::transform，仿函数运算符的 return 值是分配给输出迭代器的值。正如您所发现的那样，使用 thrust::for_each （仅采用输入迭代器），情况并非如此。

因此，当我们想使用 thrust::for_each 修改向量的元素时（而不是仅仅打印一些东西），我们必须使用提供的元组（函子运算符的输入）作为我们的路径所以。

您可能一直在尝试这样做：

    return thrust::get<1>(x) = y + 5;

但这不起作用，因为仿函数运算符的 return 值（在本例中为 float，因此也不匹配迭代器解引用类型，它是一个元组）不用于这个目的，并且输入 x 元组的修改没有达到预期的效果，因为您的公式将元组传递给仿函数 by value:

__host__ __device__ float operator()(thrust::tuple<float, float> x)
                                                                 ^
                                                 tuple passed by value

在 C（或 C++）中，当我们按值传递参数时，对该参数所做的修改不会显示在调用环境中。通常的解决方案是通过引用传递，以便在仿函数中所做的修改将显示在调用环境中（即在输入向量中，对于 thrust::for_each）。

如果我们尝试这样做：

__host__ __device__ float operator()(thrust::tuple<float, float> &x)

我们得到了一些具有指导意义的编译错误：

$ nvcc t1188.cu -o t1188
/usr/local/cuda/bin/..//include/thrust/detail/function.h(60): error: function "my_function<T>::operator() [with T=float]" cannot be called with the given argument list
        argument types are: (thrust::detail::tuple_of_iterator_references<float &, float &, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>)
                                                                     NOTE ^^^^^^^^^^^^^^^^

并希望将我们引向这个公式：

__host__ __device__ float operator()(thrust::tuple<float &, float &> x)

有效并达到预期效果：

$ ./t1188
original
1
5
3
2
4
original
0
0.04
0.08
0.12
0.16
norm
6
10
8
7
9
dut
5
5.04
5.08
5.12
5.16
time 0.488422

我注意到的其他一些事情：

我不确定你试图通过将流参数传递给函子来完成什么。在仿函数中您无法用它做任何事情，因此没有必要。
你的 my_function 仿函数前面有一个模板定义，它没有任何用处（模板类型 T 没有在我能看到的仿函数的任何地方使用） .也许这只是您的编码工作遗留下来的。有时可以通过为函子模板化输入参数的类型来避免必须记住输入参数上的这个引用元组配置，但你似乎并没有这样做。

使用 thrust::for_each() 调用实现 CUDA 流时访问结构的结果

Accessing the result of a structure when using thrust::for_each() call to implement CUDA streams

c++

parallel-processing

cuda

thrust