Thrust transform_reduce 可以使用 2 个数组吗？

Question

我发现Thrust能提供的东西非常有限，如下代码所示：我最终有 9*9*2（1 个倍数 + 1 个减少）Thrust 调用，即 162 次内核启动。而如果我编写自己的内核，则只需要启动 1 个内核。

for(i=1;i<=9;i++) 
{
    for(j=i;j<=9;j++)
    {
        ATA[i][j]=0;
        for(m=1;m<=50000;m++)
            ATA[i][j]=ATA[i][j]+X[idx0[i]][m]*X[idx0[j]][m];
    }
}

然后我得到以下 Thrust 实现：

for(i=1;i<=dim0;i++)
{
    for(j=i;j<=dim0;j++)
    {
        thrust::transform(t_d_X+(idx0[i]-1)*(1+iNumPaths)+1, t_d_X+(idx0[i]-1)*(1+iNumPaths)+iNumPaths+1, t_d_X+(idx0[j]-1)*(1+iNumPaths)+1,t_d_cdataMulti, thrust::multiplies<double>());
        ATA[i][j] = thrust::reduce(t_d_cdataMulti, t_d_cdataMulti+iNumPaths, (double) 0, thrust::plus<double>());
    }
}

一些分析：

transform_reduce：不会有帮助，因为有一个指针重定向 idx0[i]，基本上涉及 2 个数组。第一个是 X[idx0[i]]，第二个是 X[idx0[j]]
reduce_by_key：会有帮助。但是我需要将所有中间结果存储到一个大数组中，并准备一个相同大小的巨大映射键 table。会去试试的
transform_iterator：无济于事，原因与 1.

认为我无法避免编写自己的内核？

Answer 1

我赌@m.s。可以提供更有效的方法。但这是一种可能的方法。为了通过 thrust 将整个计算减少到单个内核调用，有必要通过单个 thrust 算法调用来处理所有事情。在操作的核心，我们将许多计算加在一起，以填充一个矩阵。所以我相信thrust::reduce_by_key is an appropriate thrust algorithm to use. This means we must realize all other transformations using various thrust "fancy iterators", which are mostly covered in the thrust getting started guide.

尝试这样做（通过一次内核调用处理所有事情）会使代码非常密集且难以阅读。我通常不喜欢以这种方式展示推力，但由于这是您问题的症结所在，因此无法避免。因此，让我们大致从内向外解压 reduce_by_key 调用中包含的操作序列。该算法的一般基础是将"flatten"所有数据放入一个单一的长逻辑向量中。让我们假设我们的方阵维度仅为 2x2，而我们的 m 向量的长度为 3。您可以这样想 "flattening" 或线性索引转换：

linear index: 0 1 2 3 4 5 6 7 8 9 10 11
     i index: 0 0 0 0 0 0 1 1 1 1 1  1
     j index: 0 0 0 1 1 1 0 0 0 1 1  1
     m index: 0 1 2 0 1 2 0 1 2 0 1  2
     k index: 0 0 0 1 1 1 2 2 2 3 3  3

上面的 "k index" 是我们的键，reduce_by_key 最终将使用这些键为矩阵的每个元素收集乘积项。请注意，该代码具有 EXT_I、EXT_J、EXT_M 和 EXT_K 辅助宏，它们将使用 thrust placeholders 定义要对线性执行的操作索引（使用 counting_iterator 创建）生成其他各种 "indices"。

我们需要做的第一件事是构建一个合适的推力操作，将线性索引转换为 idx0[i] 的转换值（同样，从 "inward to outward" 开始） .我们可以使用 idx0 向量上的置换迭代器来完成此操作，其中 transform_iterator 为置换迭代器提供 "map" - 此变换迭代器仅转换线性索引 (mb)到 "i" 索引：
```
thrust::make_permutation_iterator(d_idx0.begin(), thrust::make_transform_iterator(mb, EXT_I))
```
现在我们需要将步骤 1 的结果与另一个索引 - m 在这种情况下，生成二维索引的线性化版本到 X（ d_X 是 X 的矢量线性化版本）。为此，我们将 zip_iterator 中第一步的结果与创建 m 索引的另一个转换迭代器结合起来。这个 zip_iterator 将被传递给一个 transform_iterator ，它采用两个索引并将其转换为线性化索引到 "look into" d_X 向量：
```
thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_idx0.begin(), thrust::make_transform_iterator(mb, EXT_I)), thrust::make_transform_iterator(mb, EXT_M))), create_Xidx()))
```
create_Xidx 是一个函子，它将两个计算出的索引转换成线性索引 d_X
根据第 2 步的结果，我们可以使用置换迭代器从 d_X 中获取乘法第一项的适当值：
```
thrust::make_permutation_iterator(d_X.begin(), {code from step 2})
```
重复步骤 1、2、3，使用 EXT_J 而不是 EXT_I，以创建乘法中的第二项：
```
X[idx0[i]][m]*X[idx0[j]][m]
```
将在步骤 3 和 4 中创建的项放入 zip_iterator，供 transform_iterator 使用，将两者相乘（使用 my_mult 函子) 创建实际产品：
```
thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple({result from step 3}, {result from step 4}, my_mult())
```
reduce_by_key 的其余部分相当简单。我们如前所述创建键索引，然后使用它对方矩阵的每个元素的各种乘积求和。

这是一个完整的示例：

$ cat t875.cu
#include <iostream>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>

// rows
#define D1 9
// cols
#define D2 9
// size of m
#define D3 50
// helpers to convert linear indices to i,j,m or "key" indices
#define EXT_I (_1/(D2*D3))
#define EXT_J ((_1/(D3))%D2)
#define EXT_M (_1%D3)
#define EXT_K (_1/D3)


void test_cpu(float ATA[][D2], float X[][D3], int idx0[]){
  for(int i=0;i<D1;i++)
  {
    for(int j=0;j<D2;j++)
    {
        ATA[i][j]=0;
        for(int m=0;m<D3;m++)
            ATA[i][j]=ATA[i][j]+X[idx0[i]][m]*X[idx0[j]][m];
    }
  }
}

using namespace thrust::placeholders;

struct create_Xidx : public thrust::unary_function<thrust::tuple<int, int>, int>{
__host__ __device__
  int operator()(thrust::tuple<int, int> &my_tuple){
    return (thrust::get<0>(my_tuple) * D3) + thrust::get<1>(my_tuple);
    }
};

struct my_mult : public thrust::unary_function<thrust::tuple<float, float>, float>{
__host__ __device__
  float operator()(thrust::tuple<float, float> &my_tuple){
    return thrust::get<0>(my_tuple) * thrust::get<1>(my_tuple);
    }
};

int main(){

  //synthesize data

  float ATA[D1][D2];
  float X[D1][D3];
  int idx0[D1];
  thrust::host_vector<float>  h_X(D1*D3);
  thrust::host_vector<int>    h_idx0(D1);
  for (int i = 0; i < D1; i++){
    idx0[i] = (i + 2)%D1; h_idx0[i] = idx0[i];
    for (int j = 0; j < D2; j++) {ATA[i][j] = 0;}
    for (int j = 0; j < D3; j++) {X[i][j] = j%(i+1); h_X[i*D3+j] = X[i][j];}}

  thrust::device_vector<float>  d_ATA(D1*D2);
  thrust::device_vector<float>  d_X = h_X;
  thrust::device_vector<int>    d_idx0 = h_idx0;

  // helpers
  thrust::counting_iterator<int> mb = thrust::make_counting_iterator(0);
  thrust::counting_iterator<int> me = thrust::make_counting_iterator(D1*D2*D3);

  // perform computation

  thrust::reduce_by_key(thrust::make_transform_iterator(mb, EXT_K), thrust::make_transform_iterator(me, EXT_K), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_X.begin(), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_idx0.begin(), thrust::make_transform_iterator(mb, EXT_I)), thrust::make_transform_iterator(mb, EXT_M))), create_Xidx())), thrust::make_permutation_iterator(d_X.begin(), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_idx0.begin(), thrust::make_transform_iterator(mb, EXT_J)), thrust::make_transform_iterator(mb, EXT_M))), create_Xidx())))), my_mult()), thrust::make_discard_iterator(), d_ATA.begin());


  thrust::host_vector<float>  h_ATA = d_ATA;
  test_cpu(ATA, X, idx0);
  std::cout << "GPU:        CPU: " << std::endl;
  for (int i = 0; i < D1*D2; i++)
    std::cout << i/D1 << "," << i%D2 << ":" << h_ATA[i] << "      " << ATA[i/D1][i%D2] << std::endl;
}
$ nvcc -o t875 t875.cu
$ ./t875
GPU:        CPU:
0,0:81      81
0,1:73      73
0,2:99      99
0,3:153      153
0,4:145      145
0,5:169      169
0,6:219      219
0,7:0      0
0,8:25      25
1,0:73      73
1,1:169      169
1,2:146      146
1,3:193      193
1,4:212      212
1,5:313      313
1,6:280      280
1,7:0      0
1,8:49      49
2,0:99      99
2,1:146      146
2,2:300      300
2,3:234      234
2,4:289      289
2,5:334      334
2,6:390      390
2,7:0      0
2,8:50      50
3,0:153      153
3,1:193      193
3,2:234      234
3,3:441      441
3,4:370      370
3,5:433      433
3,6:480      480
3,7:0      0
3,8:73      73
4,0:145      145
4,1:212      212
4,2:289      289
4,3:370      370
4,4:637      637
4,5:476      476
4,6:547      547
4,7:0      0
4,8:72      72
5,0:169      169
5,1:313      313
5,2:334      334
5,3:433      433
5,4:476      476
5,5:841      841
5,6:604      604
5,7:0      0
5,8:97      97
6,0:219      219
6,1:280      280
6,2:390      390
6,3:480      480
6,4:547      547
6,5:604      604
6,6:1050      1050
6,7:0      0
6,8:94      94
7,0:0      0
7,1:0      0
7,2:0      0
7,3:0      0
7,4:0      0
7,5:0      0
7,6:0      0
7,7:0      0
7,8:0      0
8,0:25      25
8,1:49      49
8,2:50      50
8,3:73      73
8,4:72      72
8,5:97      97
8,6:94      94
8,7:0      0
8,8:25      25
$

备注：

如果你分析上面的代码，例如nvprof --print-gpu-trace ./t875，您将见证两次内核调用。第一个与 device_vector 创作有关。第二个内核调用处理整个 reduce_by_key 操作。
我不知道这一切是否比您的 CUDA 内核慢或快，因为您没有提供它。有时，专业编写的 CUDA 内核可能比执行相同操作的推力算法更快。
很可能我这里的算法并不完全您想到的算法。例如，您的代码建议您只填写 ATA 的三角形部分。但是您的描述 (9*9*2) 表明您想要填充 ATA 中的每个位置。尽管如此，我的目的不是给你一个黑盒子，而是展示你如何使用各种推力方法在单个内核调用中实现你想要的任何东西。

Thrust transform_reduce 可以使用 2 个数组吗？

Can Thrust transform_reduce work with 2 arrays?

cuda

thrust