CUDA 2nd order recursion with thrust inclusive_scan
I am trying to understand how to parallelize a recursive computation. Sequentially, the computation takes the following form:
for (int i = 2; i < size; i++)
{
    result[i] = oldArray[i] + k * result[i-2];
}
For the i-1 index there is a solution to a previous question of mine. I would like to modify it to use i-2, but I do not understand how to apply the same procedure to a second-order computation. It should be possible with the thrust::inclusive_scan function, but I cannot figure out how. Does anyone know a solution?
Here is some CPU code that shows a possible implementation of the formula derived from https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf to express higher-order recurrences as a scan operation.
The key idea is that each element of the scan result is not a scalar, but a vector containing the n previous scalar results. That way, all required previous results are available to the scan operator to compute the next result.
#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>
#include <vector>

// serial reference implementation
void calculate1(std::vector<int> vec, int k){
    std::vector<int> result(vec.size(), 0);
    for(int i = 2; i < vec.size(); i++){
        result[i] = vec[i] + k * result[i-2];
    }
    std::cerr << "calculate1 result: ";
    for(auto x : result){
        std::cerr << x << ", ";
    }
    std::cerr << "\n";
}

struct S{
    //data[0] stores result of last iteration
    //data[1] stores result of second last iteration
    std::array<int, 2> data;
};

std::ostream& operator<<(std::ostream& os, S s){
    os << "(" << s.data[0] << "," << s.data[1] << ")";
    return os;
}

// scan-based implementation: each element carries the two previous results
void calculate2(std::vector<int> vec, int k){
    S initvalue{{0,0}};
    std::vector<S> result(vec.size(), initvalue);
    std::exclusive_scan(
        vec.begin() + 2,
        vec.end(),
        result.begin(),
        initvalue,
        [k](S left, int right){
            S result;
            /*A = (
                0 1
                k 0
            )
            Compute result = left * A + (right 0)
            */
            result.data[0] = right + k * left.data[1];
            result.data[1] = left.data[0];
            return result;
        }
    );
    std::cerr << "calculate2 result: ";
    for(auto x : result){
        std::cerr << x << ", ";
    }
    std::cerr << "\n";
}

int main(){
    const int k = 5;
    const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};
    calculate1(vec1, k);
    calculate2(vec1, k);
}
https://godbolt.org/z/cszzn8Ec8
Output:
calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909,
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),
There is still an off-by-one error in there somewhere, but the idea behind the approach should be clear.

I previously said that this approach could be used for a parallel scan in CUDA. That was not correct. For a parallel scan, the scan operator needs one additional property, associativity, i.e. (a OP b) OP c == a OP (b OP c). That is not the case for this approach.
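To make that concrete, here is a minimal check (my own illustration, not part of the original code) that lifts the calculate2 operator to act on (newest, previous) pairs, treating an input value b as the pair (b, 0), and compares (a OP b) OP c with a OP (b OP c):

#include <array>
#include <iostream>

// Hypothetical helper for illustration only: the calculate2 update rule,
// with the scalar input embedded as the pair (b, 0).
std::array<int,2> op(std::array<int,2> left, std::array<int,2> right, int k){
    return { right[0] + k * left[1], left[0] };
}

int main(){
    const int k = 5;
    std::array<int,2> a{1,0}, b{3,0}, c{5,0};
    auto lhs = op(op(a, b, k), c, k); // (a OP b) OP c
    auto rhs = op(a, op(b, c, k), k); // a OP (b OP c)
    // prints (10,3) vs (5,1): the two groupings disagree, so OP is not associative
    std::cout << "(" << lhs[0] << "," << lhs[1] << ") vs ("
              << rhs[0] << "," << rhs[1] << ")\n";
}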
Robert Crovella's answer shows how to derive an associative scan operator that can be used for a parallel scan.
Picking up where the previous question/answer left off, we shift our attention to equation 1.11 in the referenced Blelloch paper. We observe that your problem formulation:
for (int i = 2; i < size; i++)
{
    result[i] = oldArray[i] + k * result[i-2];
}
seems to match equation 1.11 if we set m = 2, in which case we can also observe that for your formulation all a_{i,1} are zero (and, as before, all a_{i,2} are equal to k).
Per equation 1.12 in that paper, our state variable s_i now becomes a two-tuple:
s_i = | x_i  x_{i-1} |
Noting these things, we can observe the "correctness" of equation 1.13:
s_i = | x_{i-1}  x_{i-2} | . | 0 1, k 0 |  +  | b_i  0 |
Rewriting:
s_{i,1} = x_i = k*x_{i-2} + b_i
s_{i,2} = x_{i-1} = x_{i-1}
(In my view, this is where the other answer leaves off. That realization, i.e. that result.data[0] = right + k * left.data[1]; is sufficient for a serial scan but not for a parallel scan. It is also evident that that functor/scan op is not associative.)
We now need to come up with a binary operator bop that is an extension of the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend it based on the treatment of 1.13 as follows:
C_i = | A_i , B_i |
where:
A_i = | 0 1, k 0 |
and:
B_i = | b_i  0 |
Then we have:
C_i bop C_j = | A_i . A_j , B_i . A_j + B_j |
This then becomes the recipe for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.
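As a sketch of how this maps onto the 6-tuple used in the code below (the slot labeling t = (B_0, B_1, A_00, A_01, A_10, A_11) is my own choice, made to match the order of the zip iterators; it is not taken from the paper), the componentwise expansion of C_i bop C_j is:

% componentwise expansion, tuple slots 0..5 = (B_0, B_1, A_{00}, A_{01}, A_{10}, A_{11})
\begin{aligned}
B'_0    &= B_{i,0} A_{j,00} + B_{i,1} A_{j,10} + B_{j,0} \\
B'_1    &= B_{i,0} A_{j,01} + B_{i,1} A_{j,11} + B_{j,1} \\
A'_{00} &= A_{i,00} A_{j,00} + A_{i,01} A_{j,10} \\
A'_{01} &= A_{i,00} A_{j,01} + A_{i,01} A_{j,11} \\
A'_{10} &= A_{i,10} A_{j,00} + A_{i,11} A_{j,10} \\
A'_{11} &= A_{i,10} A_{j,01} + A_{i,11} A_{j,11}
\end{aligned}

These six lines correspond, in order, to the six thrust::get<> assignments in scan_op below.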
What follows is an implementation of the above:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>

template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
  }
};
typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
  mt *b1 = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // cpu result
  for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  thrust::device_vector<mt> db(b1, b1+ds);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_vector<mt> dx1(ds);
  thrust::device_vector<mt> dx0(ds);
  thrust::device_vector<mt> dy0(ds);
  thrust::device_vector<mt> dy1(ds);
  thrust::device_vector<mt> dy2(ds);
  thrust::device_vector<mt> dy3(ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  thrust::host_vector<mt> hx1 = dx1;
  thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
  thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$
Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution, considering the very different order of operations between the serial and parallel methods. If you change the typedef to double, the results appear to match exactly.

Since you've asked about it, here is an equivalent implementation that demonstrates the use of device data previously allocated with cudaMalloc:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>

template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
  }
};
typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
  mt *b1 = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // cpu result
  for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  mt *db;
  cudaMalloc(&db, ds*sizeof(db[0]));
  cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
  thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_vector<mt> dx1(ds);
  thrust::device_vector<mt> dx0(ds);
  thrust::device_vector<mt> dy0(ds);
  thrust::device_vector<mt> dy1(ds);
  thrust::device_vector<mt> dy2(ds);
  thrust::device_vector<mt> dy3(ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors
There should be no significant performance difference between these two approaches. (However, I happened to switch the typedef to double for this example, so that does make a difference.) Using cudaMalloc as an alternative to device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly quicker, because device_vector first does a cudaMalloc-style allocation and then launches a kernel to zero out the allocation. This zeroing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.

Here is a version that completely eliminates the use of thrust::device_vector and thrust::host_vector:
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>

template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
  }
};
typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){
  mt *b1 = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // result
  for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  mt *db, *dstate;
  cudaMalloc(&db, ds*sizeof(db[0]));
  cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
  cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
  thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
  thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
  thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
  thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
  thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
  thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
}
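No run transcript was captured for this last version, but it should build and run the same way as the previous listings, e.g.:

$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930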