推力:inplace exclusive_scan 一名成员

Thrust: inplace exclusive_scan of one member

我有一个向量MyElement,定义如下:

struct MyElement {
    int count;
    int prefixSum;
}

我想执行 count 的就地 exclusive_scan,但结果使用 prefixSum,而不更改 count。可以使用推力吗?

例如,对于以下输入(prefixSum 初始化为零):

{ (0, 0), (0, 0), (1, 0), (2, 0), (0, 0), (1, 0), (0, 0), (1, 0), (0, 0), (1, 0) }

正确的输出是:

{ (0, 0), (0, 0), (1, 0), (2, 1), (0, 3), (1, 3), (0, 4), (1, 4), (0, 5), (1, 5) }

count不变,prefixSum包含count.

的唯一前缀和

这是我迄今为止尝试过的推力:

#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

struct MyElement {
    int count;
    int prefixSum;

    // Needed by thrust
    __host__ __device__ MyElement() {
        count = 0;
        prefixSum = 0;
    }

    __host__ __device__ MyElement(int a) {
        count = 0;
        prefixSum = 0;
    }

    // Used for initialization
    __host__ MyElement(int count, int prefixSum) {
        this->count = count;
        this->prefixSum = prefixSum;
    }

    __host__ __device__ friend MyElement operator +(const MyElement& a, const MyElement& b) {
        return MyElement(0, a.count + b.count + a.prefixSum + b.prefixSum);
    }

    //__host__ __device__ MyElement& operator=(const MyElement& other) {
    //    // check for self-assignment
    //    if (&other == this) {
    //        return *this;
    //    }

    //    count = other.count;
    //    prefixSum = other.prefixSum;
    //    return *this;
    //}
};

int main(int argc, char* argv[]) {
    thrust::device_vector<MyElement> d_vector;
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(2, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));

    thrust::exclusive_scan(d_vector.data(), d_vector.data() + 10, d_vector.data());
    
    // Copy vector from device to host
    thrust::host_vector<MyElement> h_vector = d_vector;

    // Print
    for (const MyElement& element : h_vector) {
        printf("{ %d, %d }\n", element.count, element.prefixSum);
    }

    return 0;
}

使用上面的代码,可以计算出 prefixSum 的正确值,但是 count 会丢失(设置为零)。我尝试了求和运算符和赋值运算符的多种变体,但找不到正确的解决方案。

这是一个可能的方法。我们将结构数组重新解释为整数数组可能有点难看。

基本思想是将要进行前缀求和的数据复制到所需位置,移动 1,然后在那里执行包含扫描。

$ cat t13.cu
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>

using namespace thrust::placeholders;

struct MyElement {
    int count;
    int prefixSum;

    // Used for initialization
    __host__ __device__ MyElement(int count, int prefixSum) {
        this->count = count;
        this->prefixSum = prefixSum;
    }

};

int main(int argc, char* argv[]) {
    thrust::device_vector<MyElement> d_vector;
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(2, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));

    thrust::device_ptr<int> dvec(reinterpret_cast<int *>(thrust::raw_pointer_cast(d_vector.data())));
    auto pi_count = thrust::make_permutation_iterator(dvec, thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1*2));
    auto pi_prefix = thrust::make_permutation_iterator(dvec, thrust::make_transform_iterator(thrust::counting_iterator<int>(0), (_1*2)+1));

    thrust::copy_n(pi_count, d_vector.size()-1, pi_prefix+1);
    thrust::inclusive_scan(pi_prefix+1, pi_prefix + d_vector.size(), pi_prefix+1);

    // Copy vector from device to host
    thrust::host_vector<MyElement> h_vector = d_vector;

    // Print
    for (const MyElement& element : h_vector) {
        printf("{ %d, %d }\n", element.count, element.prefixSum);
    }

    return 0;
}
$ nvcc -o t13 t13.cu -std=c++11
$ cuda-memcheck ./t13
========= CUDA-MEMCHECK
{ 0, 0 }
{ 0, 0 }
{ 1, 0 }
{ 2, 1 }
{ 0, 3 }
{ 1, 3 }
{ 0, 4 }
{ 1, 4 }
{ 0, 5 }
{ 1, 5 }
========= ERROR SUMMARY: 0 errors
$

我最终得到了类似的解决方案。 Whosebug answer 描述了如何对一个 class 成员执行包含性前缀和。

因为我不能对一个成员执行扫描并将其“输出”到另一个成员,而不是将 prefixSum 初始化为零,我使用与 count 相同的值(我这样做这在以前的内核中,没有使用推力)。 因此我的新输入变成:

{ (0, 0), (0, 0), (1, 1), (2, 2), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1) }

之后我用thrust只对prefixSum做一个包含前缀和,得到:

{ (0, 0), (0, 0), (1, 1), (2, 3), (0, 3), (1, 4), (0, 4), (1, 5), (0, 5), (1, 6) }

我其实需要一个独占前缀和,但是因为我有原始的count,所以我想要的值很简单 prefixSum - count.