Thrust:对结构体的单个成员执行就地 exclusive_scan
Thrust: inplace exclusive_scan of one member
我有一个元素类型为 MyElement 的向量,MyElement 定义如下:
// Element type of the vector discussed in the question.
struct MyElement {
    int count;      // value to be scanned; must be left unchanged
    int prefixSum;  // receives the exclusive prefix sum of count
};
我想对 count 执行 exclusive_scan,并把结果就地写入同一元素的 prefixSum,同时不更改 count。用 Thrust 可以做到吗?
例如,对于以下输入(prefixSum
初始化为零):
{ (0, 0), (0, 0), (1, 0), (2, 0), (0, 0), (1, 0), (0, 0), (1, 0), (0, 0), (1, 0) }
正确的输出是:
{ (0, 0), (0, 0), (1, 0), (2, 1), (0, 3), (1, 3), (0, 4), (1, 4), (0, 5), (1, 5) }
即 count 保持不变,prefixSum 包含 count 的独占(exclusive)前缀和。
这是我目前用 Thrust 所做的尝试:
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
// Element type used in the attempt to scan whole MyElement values with
// thrust::exclusive_scan. The scan combines elements through operator+.
struct MyElement {
    int count;
    int prefixSum;

    // Needed by thrust: default construction yields a zeroed element.
    __host__ __device__ MyElement() : count(0), prefixSum(0) {}

    // Needed by thrust: construction from an int. The argument is ignored
    // and both fields are set to zero.
    __host__ __device__ MyElement(int /*unused*/) : count(0), prefixSum(0) {}

    // Used for initialization. Marked __host__ __device__ (not __host__ only)
    // because operator+ below is __host__ __device__ and calls it.
    __host__ __device__ MyElement(int count, int prefixSum)
        : count(count), prefixSum(prefixSum) {}

    // Combines two elements by adding all four fields into prefixSum.
    // The result's count is always 0, so the original count values do not
    // survive an in-place scan that uses this operator.
    __host__ __device__ friend MyElement operator +(const MyElement& a, const MyElement& b) {
        return MyElement(0, a.count + b.count + a.prefixSum + b.prefixSum);
    }

    // Assignment-operator variant that was tried and left disabled:
    //__host__ __device__ MyElement& operator=(const MyElement& other) {
    //    // check for self-assignment
    //    if (&other == this) {
    //        return *this;
    //    }
    //    count = other.count;
    //    prefixSum = other.prefixSum;
    //    return *this;
    //}
};
// Fills a device vector with the ten sample elements, runs an in-place
// exclusive_scan over the MyElement values, and prints every element.
int main(int argc, char* argv[]) {
    // Sample counts in input order; each prefixSum starts at zero.
    const int sample_counts[] = {0, 0, 1, 2, 0, 1, 0, 1, 0, 1};

    thrust::device_vector<MyElement> d_vector;
    for (int c : sample_counts) {
        d_vector.push_back(MyElement(c, 0));
    }

    // Scan the whole vector in place; elements are combined with
    // MyElement's operator+.
    thrust::exclusive_scan(d_vector.begin(), d_vector.end(), d_vector.begin());

    // Bring the result back to the host for printing.
    thrust::host_vector<MyElement> h_vector = d_vector;

    for (auto it = h_vector.begin(); it != h_vector.end(); ++it) {
        printf("{ %d, %d }\n", it->count, it->prefixSum);
    }
    return 0;
}
使用上面的代码可以计算出正确的 prefixSum,但 count 会丢失(被置为零)。我尝试了加法运算符和赋值运算符的多种写法,但都没有找到正确的解决方案。
下面是一种可行的方法。其中把结构体数组重新解释(reinterpret)为整数数组的做法可能有些难看。
基本思路是:把要做前缀和的数据(count)复制到目标位置(prefixSum),并向后错开一个元素,然后在目标位置就地执行包含(inclusive)扫描。
$ cat t13.cu
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
using namespace thrust::placeholders;
// Two-int element: count is the scan input, prefixSum holds the scan result.
struct MyElement {
    int count;
    int prefixSum;

    // Used for initialization; callable from both host and device code.
    __host__ __device__ MyElement(int count, int prefixSum)
        : count(count), prefixSum(prefixSum) {}
};
// Computes the exclusive prefix sum of MyElement::count into
// MyElement::prefixSum without modifying count, then prints every element.
//
// Approach: view the struct array as a flat int array, copy count[i] into
// prefixSum[i + 1], and run an in-place inclusive scan over prefixSum[1..n).
// prefixSum[0] is never written, so it keeps its initial value (zero here).
int main(int argc, char* argv[]) {
    // The flat-int view below indexes count at 2*i and prefixSum at 2*i + 1,
    // which is only valid if MyElement is exactly two ints with no padding.
    static_assert(sizeof(MyElement) == 2 * sizeof(int),
                  "MyElement must consist of exactly two packed ints");

    thrust::device_vector<MyElement> d_vector;
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(2, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));

    // Reinterpret the device storage as ints.
    thrust::device_ptr<int> dvec(reinterpret_cast<int *>(thrust::raw_pointer_cast(d_vector.data())));

    // Strided views: pi_count[i] is element i's count (index 2*i),
    // pi_prefix[i] is element i's prefixSum (index 2*i + 1).
    auto pi_count = thrust::make_permutation_iterator(dvec, thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1*2));
    auto pi_prefix = thrust::make_permutation_iterator(dvec, thrust::make_transform_iterator(thrust::counting_iterator<int>(0), (_1*2)+1));

    const auto n = d_vector.size();

    // Shift by one: prefixSum[i + 1] = count[i] for i in [0, n - 1).
    thrust::copy_n(pi_count, n - 1, pi_prefix + 1);

    // An inclusive scan of the shifted counts is the exclusive scan of count.
    thrust::inclusive_scan(pi_prefix + 1, pi_prefix + n, pi_prefix + 1);

    // Copy vector from device to host
    thrust::host_vector<MyElement> h_vector = d_vector;

    // Print
    for (const MyElement& element : h_vector) {
        printf("{ %d, %d }\n", element.count, element.prefixSum);
    }
    return 0;
}
$ nvcc -o t13 t13.cu -std=c++11
$ cuda-memcheck ./t13
========= CUDA-MEMCHECK
{ 0, 0 }
{ 0, 0 }
{ 1, 0 }
{ 2, 1 }
{ 0, 3 }
{ 1, 3 }
{ 0, 4 }
{ 1, 4 }
{ 0, 5 }
{ 1, 5 }
========= ERROR SUMMARY: 0 errors
$
我最终采用了类似的解决方案。有一个 StackOverflow 回答描述了如何对类的单个成员执行包含(inclusive)前缀和。
由于无法对一个成员执行扫描并把结果“输出”到另一个成员,我没有把 prefixSum 初始化为零,而是把它初始化为与 count 相同的值(这一步是在之前的内核中完成的,没有用到 Thrust)。
因此我的新输入变成:
{ (0, 0), (0, 0), (1, 1), (2, 2), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1) }
之后我用thrust只对prefixSum
做一个包含前缀和,得到:
{ (0, 0), (0, 0), (1, 1), (2, 3), (0, 3), (1, 4), (0, 4), (1, 5), (0, 5), (1, 6) }
我实际需要的是独占(exclusive)前缀和,但由于原始的 count 仍然保留着,所需的值就是 prefixSum - count。
我有一个元素类型为 MyElement 的向量,MyElement 定义如下:
// Element type of the vector discussed in the question.
struct MyElement {
    int count;      // value to be scanned; must be left unchanged
    int prefixSum;  // receives the exclusive prefix sum of count
};
我想对 count 执行 exclusive_scan,并把结果就地写入同一元素的 prefixSum,同时不更改 count。用 Thrust 可以做到吗?
例如,对于以下输入(prefixSum
初始化为零):
{ (0, 0), (0, 0), (1, 0), (2, 0), (0, 0), (1, 0), (0, 0), (1, 0), (0, 0), (1, 0) }
正确的输出是:
{ (0, 0), (0, 0), (1, 0), (2, 1), (0, 3), (1, 3), (0, 4), (1, 4), (0, 5), (1, 5) }
即 count 保持不变,prefixSum 包含 count 的独占(exclusive)前缀和。
这是我目前用 Thrust 所做的尝试:
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
// Element type used in the attempt to scan whole MyElement values with
// thrust::exclusive_scan. The scan combines elements through operator+.
struct MyElement {
    int count;
    int prefixSum;

    // Needed by thrust: default construction yields a zeroed element.
    __host__ __device__ MyElement() : count(0), prefixSum(0) {}

    // Needed by thrust: construction from an int. The argument is ignored
    // and both fields are set to zero.
    __host__ __device__ MyElement(int /*unused*/) : count(0), prefixSum(0) {}

    // Used for initialization. Marked __host__ __device__ (not __host__ only)
    // because operator+ below is __host__ __device__ and calls it.
    __host__ __device__ MyElement(int count, int prefixSum)
        : count(count), prefixSum(prefixSum) {}

    // Combines two elements by adding all four fields into prefixSum.
    // The result's count is always 0, so the original count values do not
    // survive an in-place scan that uses this operator.
    __host__ __device__ friend MyElement operator +(const MyElement& a, const MyElement& b) {
        return MyElement(0, a.count + b.count + a.prefixSum + b.prefixSum);
    }

    // Assignment-operator variant that was tried and left disabled:
    //__host__ __device__ MyElement& operator=(const MyElement& other) {
    //    // check for self-assignment
    //    if (&other == this) {
    //        return *this;
    //    }
    //    count = other.count;
    //    prefixSum = other.prefixSum;
    //    return *this;
    //}
};
// Fills a device vector with the ten sample elements, runs an in-place
// exclusive_scan over the MyElement values, and prints every element.
int main(int argc, char* argv[]) {
    // Sample counts in input order; each prefixSum starts at zero.
    const int sample_counts[] = {0, 0, 1, 2, 0, 1, 0, 1, 0, 1};

    thrust::device_vector<MyElement> d_vector;
    for (int c : sample_counts) {
        d_vector.push_back(MyElement(c, 0));
    }

    // Scan the whole vector in place; elements are combined with
    // MyElement's operator+.
    thrust::exclusive_scan(d_vector.begin(), d_vector.end(), d_vector.begin());

    // Bring the result back to the host for printing.
    thrust::host_vector<MyElement> h_vector = d_vector;

    for (auto it = h_vector.begin(); it != h_vector.end(); ++it) {
        printf("{ %d, %d }\n", it->count, it->prefixSum);
    }
    return 0;
}
使用上面的代码可以计算出正确的 prefixSum,但 count 会丢失(被置为零)。我尝试了加法运算符和赋值运算符的多种写法,但都没有找到正确的解决方案。
下面是一种可行的方法。其中把结构体数组重新解释(reinterpret)为整数数组的做法可能有些难看。
基本思路是:把要做前缀和的数据(count)复制到目标位置(prefixSum),并向后错开一个元素,然后在目标位置就地执行包含(inclusive)扫描。
$ cat t13.cu
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
using namespace thrust::placeholders;
// Two-int element: count is the scan input, prefixSum holds the scan result.
struct MyElement {
    int count;
    int prefixSum;

    // Used for initialization; callable from both host and device code.
    __host__ __device__ MyElement(int count, int prefixSum)
        : count(count), prefixSum(prefixSum) {}
};
// Computes the exclusive prefix sum of MyElement::count into
// MyElement::prefixSum without modifying count, then prints every element.
//
// Approach: view the struct array as a flat int array, copy count[i] into
// prefixSum[i + 1], and run an in-place inclusive scan over prefixSum[1..n).
// prefixSum[0] is never written, so it keeps its initial value (zero here).
int main(int argc, char* argv[]) {
    // The flat-int view below indexes count at 2*i and prefixSum at 2*i + 1,
    // which is only valid if MyElement is exactly two ints with no padding.
    static_assert(sizeof(MyElement) == 2 * sizeof(int),
                  "MyElement must consist of exactly two packed ints");

    thrust::device_vector<MyElement> d_vector;
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(2, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));
    d_vector.push_back(MyElement(0, 0));
    d_vector.push_back(MyElement(1, 0));

    // Reinterpret the device storage as ints.
    thrust::device_ptr<int> dvec(reinterpret_cast<int *>(thrust::raw_pointer_cast(d_vector.data())));

    // Strided views: pi_count[i] is element i's count (index 2*i),
    // pi_prefix[i] is element i's prefixSum (index 2*i + 1).
    auto pi_count = thrust::make_permutation_iterator(dvec, thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1*2));
    auto pi_prefix = thrust::make_permutation_iterator(dvec, thrust::make_transform_iterator(thrust::counting_iterator<int>(0), (_1*2)+1));

    const auto n = d_vector.size();

    // Shift by one: prefixSum[i + 1] = count[i] for i in [0, n - 1).
    thrust::copy_n(pi_count, n - 1, pi_prefix + 1);

    // An inclusive scan of the shifted counts is the exclusive scan of count.
    thrust::inclusive_scan(pi_prefix + 1, pi_prefix + n, pi_prefix + 1);

    // Copy vector from device to host
    thrust::host_vector<MyElement> h_vector = d_vector;

    // Print
    for (const MyElement& element : h_vector) {
        printf("{ %d, %d }\n", element.count, element.prefixSum);
    }
    return 0;
}
$ nvcc -o t13 t13.cu -std=c++11
$ cuda-memcheck ./t13
========= CUDA-MEMCHECK
{ 0, 0 }
{ 0, 0 }
{ 1, 0 }
{ 2, 1 }
{ 0, 3 }
{ 1, 3 }
{ 0, 4 }
{ 1, 4 }
{ 0, 5 }
{ 1, 5 }
========= ERROR SUMMARY: 0 errors
$
我最终采用了类似的解决方案。有一个 StackOverflow 回答描述了如何对类的单个成员执行包含(inclusive)前缀和。
由于无法对一个成员执行扫描并把结果“输出”到另一个成员,我没有把 prefixSum 初始化为零,而是把它初始化为与 count 相同的值(这一步是在之前的内核中完成的,没有用到 Thrust)。
因此我的新输入变成:
{ (0, 0), (0, 0), (1, 1), (2, 2), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1) }
之后我用thrust只对prefixSum
做一个包含前缀和,得到:
{ (0, 0), (0, 0), (1, 1), (2, 3), (0, 3), (1, 4), (0, 4), (1, 5), (0, 5), (1, 6) }
我实际需要的是独占(exclusive)前缀和,但由于原始的 count 仍然保留着,所需的值就是 prefixSum - count。