使用 zip_iterator 时 Thrust reduce_by_key 出错

Thrust reduce_by_key error with zip_iterator

我有一个展平的 double 数组,其中依次存放各个点的 x、y、z 坐标。我把这个数组拆分到 X、Y、Z 三个数组中,并通过 zip 迭代器使用这些数组:

      // [first, last) is one zipped range over X, Y, Z and K: dereferencing yields an (x, y, z, k) tuple.
      tpl4zip first = thrust::make_zip_iterator(thrust::make_tuple(X.begin(), Y.begin(), Z.begin(), K.begin()));
      tpl4zip last = thrust::make_zip_iterator(thrust::make_tuple(X.end(), Y.end(), Z.end(), K.end()));

我可以毫无问题地对这个元组序列进行排序。接下来我需要做归约(reduce)来计数,所以想使用 reduce_by_key。但是我所有让 reduce_by_key 正常工作的尝试都报错了。有人可以告诉我我做错了什么吗?提前致谢。

这是我的代码:



      // One int per point (N/KEYLEN entries), every entry set to 1.
      thrust::device_vector<int> counter(N/KEYLEN);         // one counter per point
      thrust::fill(counter.begin(), counter.end(), 1);      // each counter starts at 1
      // Zips every (x, y, z, k) element of `first` with its counter.
      auto my_z = thrust::make_zip_iterator(thrust::make_tuple(first,counter.begin()));

      // Destination vectors, each N/KEYLEN long, zipped together as my_zr below.
      thrust::device_vector<double> X_r(N/KEYLEN);          // destination for x
      thrust::device_vector<double> Y_r(N/KEYLEN);          // destination for y
      thrust::device_vector<double> Z_r(N/KEYLEN);          // destination for z
      thrust::device_vector<int> K_r(N/KEYLEN);             // destination for the index k
      
      tpl4zip my_zr = thrust::make_zip_iterator(thrust::make_tuple(X_r.begin(), Y_r.begin(), Z_r.begin(), K_r.begin()));

      // Second int vector, also filled with 1, zipped with my_zr to form `pack`.
      thrust::device_vector<int> counter_2(N/KEYLEN);          // one int per point
      thrust::fill(counter_2.begin(), counter_2.end(), 1);  // every entry set to 1
      auto pack = thrust::make_zip_iterator(thrust::make_tuple(my_zr,counter_2.begin()));

      // NOTE(review): as written this call has three problems:
      //  * `first` is passed as both the key input and the key output, but the
      //    reduce_by_key documentation requires that the input ranges not
      //    overlap either output range;
      //  * binary_pred is equal_to<int>, while the keys in [first, last) are
      //    (x, y, z, k) tuples rather than ints;
      //  * the values argument my_z zips the keys together with counter; it
      //    looks like counter alone was meant to be the values range.
      thrust::equal_to<int> binary_pred;
      thrust::reduce_by_key(
                              first,          // keys begin
                              last,           // keys end
                              my_z,           // values begin
                              first,          // keys output
                              pack,           // values output
                              binary_pred,    // key equality test
                              TuplePlus()     // reduction operator
                              );
  

我的 TuplePlus 是这样定义的:

/// Binary reduction functor over tpl5int values.
///
/// The result keeps element 0 of the left operand unchanged and stores the sum
/// of the two int elements (element 1 of each operand).
struct TuplePlus
{
    /// @param lhs left operand; its element 0 is carried into the result
    /// @param rhs right operand; only its int element is read
    /// @return tuple of lhs's element 0 and the summed int
    ///
    /// const-qualified: the functor holds no state, so it can be invoked
    /// through const objects and copies as well.
    __host__ __device__
    tpl5int operator ()(const tpl5int& lhs, const tpl5int& rhs) const
    {
        const int count = thrust::get<1>(lhs) + thrust::get<1>(rhs);
        return thrust::make_tuple(thrust::get<0>(lhs), count);
    }
};

我的元组和 zip 迭代器是这样的:

#define N 30 // make this evenly divisible by 3 for this example

// One point as a value: (x, y, z, index).
using tpl4int = thrust::tuple<double, double, double, int>;
using doubleiter = thrust::device_vector<double>::iterator;
using intiter = thrust::device_vector<int>::iterator;

// Zip iterator over four device vectors; dereferences to a tpl4int-like tuple.
using tpl4doubleiter = thrust::tuple<doubleiter, doubleiter, doubleiter, intiter>;
using tpl4zip = thrust::zip_iterator<tpl4doubleiter>;

// A point zipped with an int counter.
// tpl5int is the value obtained by dereferencing tpl5zip, so it nests the point
// value tuple (tpl4int), not the iterator type tpl4zip.
// tpl5doubleiter nests the iterator tpl4zip itself, because zip_iterator needs
// a tuple whose elements are all iterators (a bare tuple of iterators is not one).
using tpl5int = thrust::tuple<tpl4int, int>;
using tpl5doubleiter = thrust::tuple<tpl4zip, intiter>;
using tpl5zip = thrust::zip_iterator<tpl5doubleiter>;

这里是完整代码

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/tuple.h>

#define N 30 // make this evenly divisible by 3 for this example

// One point as a value: (x, y, z, index).
using tpl4int = thrust::tuple<double, double, double, int>;
using doubleiter = thrust::device_vector<double>::iterator;
using intiter = thrust::device_vector<int>::iterator;

// Zip iterator over four device vectors; dereferences to a tpl4int-like tuple.
using tpl4doubleiter = thrust::tuple<doubleiter, doubleiter, doubleiter, intiter>;
using tpl4zip = thrust::zip_iterator<tpl4doubleiter>;

// A point zipped with an int counter.
// tpl5int is the value obtained by dereferencing tpl5zip, so it nests the point
// value tuple (tpl4int), not the iterator type tpl4zip.
// tpl5doubleiter nests the iterator tpl4zip itself, because zip_iterator needs
// a tuple whose elements are all iterators (a bare tuple of iterators is not one).
using tpl5int = thrust::tuple<tpl4int, int>;
using tpl5doubleiter = thrust::tuple<tpl4zip, intiter>;
using tpl5zip = thrust::zip_iterator<tpl5doubleiter>;


/// Ordering functor for (x, y, z, index) tuples.
///
/// Every coordinate is divided by decimPrecision and rounded before it is
/// compared, and the tuple with the larger rounded value is ordered first.
/// x decides first, then y, then z; the index (element 3) is never inspected.
struct sort_
{
    double decimPrecision;  // divisor applied to a coordinate before rounding

    sort_(double _decimPrecision) : decimPrecision(_decimPrecision) {}

    /// @return true when `a` must be placed before `b`
    __host__ __device__
    bool operator()(const tpl4int &a,const tpl4int &b) const
    {
        // x: decides unless both round to the same value
        const double ax = round(thrust::get<0>(a) / decimPrecision);
        const double bx = round(thrust::get<0>(b) / decimPrecision);
        if (ax != bx)
        {
            return ax > bx;
        }

        // y: only reached when the rounded x values tie
        const double ay = round(thrust::get<1>(a) / decimPrecision);
        const double by = round(thrust::get<1>(b) / decimPrecision);
        if (ay != by)
        {
            return ay > by;
        }

        // z: final tie-breaker
        const double az = round(thrust::get<2>(a) / decimPrecision);
        const double bz = round(thrust::get<2>(b) / decimPrecision);
        return az > bz;
    }
};

/// Binary reduction functor over tpl5int values.
///
/// The result keeps element 0 of the left operand unchanged and stores the sum
/// of the two int elements (element 1 of each operand).
struct TuplePlus
{
    /// @param lhs left operand; its element 0 is carried into the result
    /// @param rhs right operand; only its int element is read
    /// @return tuple of lhs's element 0 and the summed int
    ///
    /// const-qualified: the functor holds no state, so it can be invoked
    /// through const objects and copies as well.
    __host__ __device__
    tpl5int operator ()(const tpl5int& lhs, const tpl5int& rhs) const
    {
        const int count = thrust::get<1>(lhs) + thrust::get<1>(rhs);
        return thrust::make_tuple(thrust::get<0>(lhs), count);
    }
};

// Demo driver. Stores 10 (x, y, z) points in a flat device array, splits them
// into X/Y/Z vectors, sorts the zipped (X, Y, Z, K) range with sort_(0.01),
// calls reduce_by_key on it, then prints the contents of
// [first, first + N/KEYLEN).
//
// NOTE(review): thrust::sort, thrust::reduce_by_key, thrust::copy,
// thrust::equal_to, std::vector and std::cout are used here, but the includes
// listed above this function do not name <thrust/sort.h>, <thrust/reduce.h>,
// <thrust/copy.h>, <thrust/functional.h>, <vector> or <iostream> — confirm
// they arrive transitively or add them.
int main() 
{
#define KEYLEN 3
   // Flat input: KEYLEN consecutive doubles (x, y, z) per point, 10 points.
   thrust::device_vector<double> input(10*KEYLEN);
      int i=0;

// point 0
      input[i++] = 1.0245;
      input[i++] = 2.54;
      input[i++] = 3.001;

// point 1
      input[i++] = 2.0;
      input[i++] = 1.0;
      input[i++] = 5.01125;

// point 2
      input[i++] = 6.0;
      input[i++] = 1.0;
      input[i++] = 5.0145;

    
// point 3
      input[i++] = 6.0;
      input[i++] = 1.0215;
      input[i++] = 6.001;

// point 4
      input[i++] = 6.0;
      input[i++] = 1.0845;
      input[i++] = 5.00125;

// point 5
      input[i++] = 5.0;
      input[i++] = 0.0;
      input[i++] = 5.001;
    
// point 6 (same coordinates as point 5)
      input[i++] = 5.0;
      input[i++] = 0.0;
      input[i++] = 5.001;

// point 7
      input[i++] = 6.0;
      input[i++] = 0.0;
      input[i++] = 10.501;

// point 8
      input[i++] = 1.0;
      input[i++] = 0.0;
      input[i++] = 5.0015478;

// point 9
      input[i++] = 6.0;
      input[i++] = 1.005;
      input[i++] = 5.001;

      // Per-coordinate device vectors, one entry per point.
      thrust::device_vector<double> X(N/KEYLEN);          // x of each point
      thrust::device_vector<double> Y(N/KEYLEN);          // y of each point
      thrust::device_vector<double> Z(N/KEYLEN);          // z of each point

      // NOTE(review): sizeof(input) is the byte size of the device_vector object
      // itself, not of its elements, so dsize is not the element count
      // (input.size() would be). Neither dsize nor numkeys is read afterwards.
      size_t dsize = sizeof(input)/sizeof(double);
      size_t numkeys = dsize/KEYLEN;

      // De-interleave: input[3*i] -> X[i], input[3*i+1] -> Y[i], input[3*i+2] -> Z[i].
      // The loop's `i` shadows the fill index declared at the top of main.
      // NOTE(review): every assignment reads and writes a single device_vector
      // element from host code — presumably one small transfer each way per
      // element; confirm with a profiler before using this for large inputs.
      int index=0;  
      for( int i = 0; i<N/KEYLEN;i++)
      {
            X[i]=input[index++];
            Y[i]=input[index++];
            Z[i]=input[index++];
      }

      thrust::device_vector<int> K(N/KEYLEN);          // original position of each point
      thrust::sequence(K.begin(), K.end(), 0);         // K[i] = i

      // [first, last) is one zipped range over X, Y, Z and K.
      tpl4zip first = thrust::make_zip_iterator(thrust::make_tuple(X.begin(), Y.begin(), Z.begin(), K.begin()));
      tpl4zip last = thrust::make_zip_iterator(thrust::make_tuple(X.end(), Y.end(), Z.end(), K.end()));
   
      // Sorts the four vectors together, ordered by sort_ with a divisor of 0.01.
      thrust::sort(first,last,sort_(0.01));

      thrust::device_vector<int> counter(N/KEYLEN);         // one counter per point
      thrust::fill(counter.begin(), counter.end(), 1);      // each counter starts at 1
      // Zips every (x, y, z, k) element of `first` with its counter.
      auto my_z = thrust::make_zip_iterator(thrust::make_tuple(first,counter.begin()));

      // Destination vectors, each N/KEYLEN long, zipped together as my_zr below.
      thrust::device_vector<double> X_r(N/KEYLEN);          // destination for x
      thrust::device_vector<double> Y_r(N/KEYLEN);          // destination for y
      thrust::device_vector<double> Z_r(N/KEYLEN);          // destination for z
      thrust::device_vector<int> K_r(N/KEYLEN);             // destination for the index k
      
      tpl4zip my_zr = thrust::make_zip_iterator(thrust::make_tuple(X_r.begin(), Y_r.begin(), Z_r.begin(), K_r.begin()));

      // Second int vector, also filled with 1, zipped with my_zr to form `pack`.
      thrust::device_vector<int> counter_2(N/KEYLEN);          // one int per point
      thrust::fill(counter_2.begin(), counter_2.end(), 1);     // every entry set to 1
      auto pack = thrust::make_zip_iterator(thrust::make_tuple(my_zr,counter_2.begin()));

      // NOTE(review): as written this call has three problems:
      //  * `first` is passed as both the key input and the key output, but the
      //    reduce_by_key documentation requires that the input ranges not
      //    overlap either output range;
      //  * binary_pred is equal_to<int>, while the keys in [first, last) are
      //    (x, y, z, k) tuples rather than ints;
      //  * the values argument my_z zips the keys together with counter; it
      //    looks like counter alone was meant to be the values range.
      thrust::equal_to<int> binary_pred;
      thrust::reduce_by_key(
                              first,          // keys begin
                              last,           // keys end
                              my_z,           // values begin
                              first,          // keys output
                              pack,           // values output
                              binary_pred,    // key equality test
                              TuplePlus()     // reduction operator
                              );

      // Copy the N/KEYLEN elements starting at `first` into a host vector and print them.
      std::vector<tpl4int> result_sorted(N/KEYLEN);
      thrust::copy(first,first+N/KEYLEN,result_sorted.begin());  
      for (int i=0; i<N/KEYLEN; i++)
      {
            std::cout << "{ " << result_sorted[i].get<0>() ;
            std::cout << ", " << result_sorted[i].get<1>() ;
            std::cout << ", " << result_sorted[i].get<2>() ;
            std::cout << "} i= " << result_sorted[i].get<3>()<< std::endl;
      }

  return 0;
}

引用 thrust::reduce_by_key 的文档:

The input ranges shall not overlap either output range.

所以这里不能把 first 作为输出迭代器传入。此操作无法原地(in-place)完成。

您还使用了 thrust::equal_to<int>,但您的键不是 int 类型,而是 tpl4int 类型。

您在这里使用 zip 迭代器,似乎把事情弄得比实际需要的复杂得多:您把键和值 zip 在了一起,而不是只传入值。要得到预期的结果,您可能只需要:

// only to see how it would look if needed:
auto custom_binary_pred = [](const tpl4int &lhs, const tpl4int &rhs) {
    return thrust::get<0>(lhs) == thrust::get<0>(rhs) &&
           thrust::get<1>(lhs) == thrust::get<1>(rhs) &&
           thrust::get<2>(lhs) == thrust::get<2>(rhs) &&
           thrust::get<3>(lhs) == thrust::get<3>(rhs);

};
thrust::reduce_by_key(first,
                      last,
                      counter.begin(), 
                      my_zr, 
                      counter_2.begin(), 
                      custom_binary_pred,
                      thrust::plus<int>()); // doesn't need to be specified due to overload of reduce_by_key

我相当确定您甚至不需要指定 custom_binary_pred,因为 Thrust 很可能已经为其元组实现了 operator==(不过我还没有找到相关文档)。注意:默认的相等比较会比较元组的全部字段,包括索引 K。

您不需要用 thrust::fill 初始化 counter_2,因为无论哪种方式,值都会被覆盖。

在我的后续代码中,我发现把来自 Python 的展平数组拆分传输的效率非常低:包含 30000000 个元素的输入数组加载到 X、Y、Z 数组中需要 3210 毫秒。我不知道该如何改进。能否用 Thrust 来完成这一步,或者有没有更快的做法?

      // Per-coordinate device vectors, one entry per point (N/KEYLEN points).
      thrust::device_vector<double> X(N/KEYLEN);          // x of each point
      thrust::device_vector<double> Y(N/KEYLEN);          // y of each point
      thrust::device_vector<double> Z(N/KEYLEN);          // z of each point

      // NOTE(review): sizeof(input) is the byte size of the container object,
      // not of its elements, so dsize is not the element count (input.size()
      // would be). Neither dsize nor numkeys is read in this snippet.
      size_t dsize = sizeof(input)/sizeof(double);
      size_t numkeys = dsize/KEYLEN;

      // De-interleave: input[3*i] -> X[i], input[3*i+1] -> Y[i], input[3*i+2] -> Z[i].
      // NOTE(review): every assignment writes a single device_vector element
      // from host code — presumably one small transfer per element, which
      // would explain the reported slowness; confirm with a profiler.
      int index=0;  
      for( int i = 0; i<N/KEYLEN;i++)
      {
            X[i]=input[index++];
            Y[i]=input[index++];
            Z[i]=input[index++];
      }