CUDA Thrust:仅当转换后的结果满足谓词时才复制该结果

CUDA Thrust copy transformed result only if it satisfies a predicate

我想对输入的 thrust::device_vector 执行转换,并且仅当结果满足谓词时才将其复制到输出向量。因此结果的数量可能小于输入 device_vector 的大小(类似于 thrust::copy_if 的输出向量)。我还没有找到使用 thrust::transform_if 实现这一点的方法。目前我可以使用 thrust::transform 和 thrust::remove_if 来做到这一点,如下例所示:

#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>

// Scrambles a 32-bit integer through six add/xor-shift mixing rounds.
// add_random feeds the element index through this function to obtain the
// seed of its random engine.
// NOTE(review): the constants look like the widely circulated 32-bit integer
// mix attributed to Bob Jenkins -- confirm before citing it as such.
__host__ __device__ unsigned int hash(unsigned int a) {
  // All arithmetic is unsigned, so overflow wraps modulo 2^32 by definition.
  a += 0x7ed55d16u + (a << 12);
  a ^= 0xc761c23cu ^ (a >> 19);
  a += 0x165667b1u + (a << 5);
  a = (a + 0xd3a2646cu) ^ (a << 9);
  a += 0xfd7046c5u + (a << 3);
  a ^= 0xb55a4f09u ^ (a >> 16);
  return a;
}

// Binary functor for thrust::transform: adds a pseudo-random offset to x.
// The offset is drawn from a uniform_int_distribution constructed with the
// bounds (0, 11), using an engine seeded with hash(n), so the same n always
// produces the same offset.
struct add_random {
  __host__ __device__ add_random() {}

  // n: value used to seed the engine (main passes the element index).
  // x: value the random offset is added to.
  __device__ int operator()(const int n, const int x) const {
    thrust::default_random_engine engine(hash(n));
    thrust::uniform_int_distribution<int> offset_dist(0, 11);
    const int offset = offset_dist(engine);
    return offset + x;
  }
};

// Unary predicate: true when x is strictly greater than 6.
// main passes it to thrust::remove_if, so elements for which it returns true
// are the ones removed.
struct is_greater {
  // const-qualified: the predicate is stateless, so it must also be callable
  // on a const object or through a const reference.
  __host__ __device__ bool operator()(const int x) const {
    return x > 6;
  }
};

int main(void) {
  int x[5] = {10, 2, 5, 3, 0};
  thrust::device_vector<int> d_x(x, x+5);

  thrust::transform(
      thrust::counting_iterator<int>(0),
      thrust::counting_iterator<int>(5),
      d_x.begin(),
      d_x.begin(),
      add_random());

  std::cout << "after adding random number:" << std::endl;
  std::ostream_iterator<int> o(std::cout, " ");
  thrust::copy(d_x.begin(), d_x.end(), o);
  std::cout << std::endl;

  thrust::device_vector<int>::iterator new_end(thrust::remove_if(d_x.begin(), d_x.end(), is_greater()));

  std::cout << "after removing values greater than 6:" << std::endl;
  thrust::copy(d_x.begin(), new_end, o);
  std::cout << std::endl;

  return 0;
}

给出输出:

after adding random number:
18 4 8 7 11 
after removing values greater than 6:
4 

我想避免将结果复制到内存中两次,在上面的示例中,首先是 thrust::transform,然后是 thrust::remove_if。是否可以通过单个转换函数获得上述输出?我怎样才能做到这一点?我最担心的是计算成本,因此任何优化的解决方案,即使它不使用 Thrust 库也会很棒。

欢迎来到 Thrust 花式迭代器(fancy iterator)的世界。通过查看 Thrust 快速入门指南(quick start guide),您可以快速了解一些花式迭代器类型。特别是,Thrust 的变换迭代器(transform iterator)经常可以用来替代作用于另一个 Thrust 算法输入的 thrust::transform 操作,从而将两个算法"融合"(fusing)为一个操作。

这是一个适用于您的案例的示例:

$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>

// Scrambles a 32-bit integer through six add/xor-shift mixing rounds.
// add_random feeds the element index through this function to obtain the
// seed of its random engine.
// NOTE(review): the constants look like the widely circulated 32-bit integer
// mix attributed to Bob Jenkins -- confirm before citing it as such.
__host__ __device__ unsigned int hash(unsigned int a) {
  // All arithmetic is unsigned, so overflow wraps modulo 2^32 by definition.
  a += 0x7ed55d16u + (a << 12);
  a ^= 0xc761c23cu ^ (a >> 19);
  a += 0x165667b1u + (a << 5);
  a = (a + 0xd3a2646cu) ^ (a << 9);
  a += 0xfd7046c5u + (a << 3);
  a ^= 0xb55a4f09u ^ (a >> 16);
  return a;
}

// Unary functor for a transform_iterator over a zip of (index, value) pairs:
// returns value plus a pseudo-random offset. The offset is drawn from a
// uniform_int_distribution constructed with the bounds (0, 11), using an
// engine seeded with hash(index), so the same index always produces the same
// offset.
// NOTE(review): the thrust::unary_function base presumably supplies the
// argument/result typedefs that transform_iterator relies on -- confirm
// against the Thrust version in use.
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
  // t: tuple whose element 0 seeds the engine and whose element 1 is the
  //    value the random offset is added to.
  __host__ __device__ int operator()(thrust::tuple<int, int> t) const {
    thrust::default_random_engine engine(hash(thrust::get<0>(t)));
    thrust::uniform_int_distribution<int> offset_dist(0, 11);
    const int offset = offset_dist(engine);
    return offset + thrust::get<1>(t);
  }
};

// Keep-predicate for thrust::copy_if: true for the values that must be copied
// to the output, i.e. those NOT greater than 6.
//
// The struct name is kept because main refers to it, but note that the logic
// is the complement of "is greater". The two-pass version removed elements
// with x > 6, so the values to keep are x <= 6; the previous `x < 6` wrongly
// discarded the boundary value 6.
struct is_greater {
  // const-qualified: the predicate is stateless, so it must also be callable
  // on a const object or through a const reference.
  __host__ __device__ bool operator()(const int x) const {
    return x <= 6;
  }
};

int main(void) {
  int x[5] = {10, 2, 5, 3, 0};
  thrust::device_vector<int> d_x(x, x+5);
  thrust::device_vector<int> d_r(5);
  int rsize = thrust::copy_if(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin())), add_random()), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end())), add_random()), d_r.begin(), is_greater())- d_r.begin();
  std::cout << "after removing values greater than 6:" << std::endl;
  thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
  1. 我们已将您的变换操作替换为应用于相同两个输入的变换迭代器。由于您的转换操作有两个输入,我们使用一个 zip 迭代器来组合这些,并且转换仿函数也被稍微修改以接受该元组作为其输入。

  2. 已将您的 remove_if 转换为 copy_if,以使用转换迭代器作为输入。这需要对复制谓词的逻辑稍作改动。