key为字符串或char数组时,如何使用Thrust实现reduce by key

How to use Thrust to implement reduce by key when keys are strings or char array

输入:

BC
BD
BC
BC
BD
CD

输出:

BC 3
BD 2
CD 1

如果我使用 char 类型作为键,它是 available.But 似乎 Thrust 不支持字符串作为键。

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <string>

int main(void)
{
  std::string data = "aaabbbbbcddeeeeeeeeeff";

  size_t N = data.size();

  thrust::device_vector<char> input(data.begin(), data.end());

  thrust::device_vector<char> output(N);
  thrust::device_vector<int>  lengths(N);

  size_t num_runs =
    thrust::reduce_by_key(input.begin(), input.end(),        
                      thrust::constant_iterator<int>(1), 
                      output.begin(),                    
                      lengths.begin()                    
                      ).first - output.begin();
   return 0;
}

如何使用Thrust实现?

向@AngryLettuce 致歉,这里有两种可能的方法:

方法一:

  1. 创建一个结构来保存您的密钥。该结构将为您的键中的每个字符包含一个 char 项。
  2. sort 将相似的键放在一起。看来您想要的实际上只是每种键类型的计数,而不管它出现在序列中的什么位置。为了使用 reduce_by_key 来促进这一点,有必要首先将相似的键组合在一起。否则,reduce_by_key 会将由不同中间键分隔的类似键视为不同的键序列。从您想要的输入和输出可以明显看出这不是您想要的。
  3. 现在对已排序的键使用 reduce_by_key,以像键一样计数。

第 2 步需要(对于此方法)一个仿函数来对键进行排序,第 3 步需要一个仿函数来识别 "equal" 个键的含义,reduce_by_key 需要。

方法二:

  1. 创建两个单独的chardevice_vector(s),一个保存每个键的第一个字母,另一个保存每个键的第二个字母。然后,我们将在整个代码的其余部分使用 zip_iterator 将这两个向量视为统一的 "key" 向量。

  2. sort 压缩的密钥向量。在这种情况下,thrust 知道如何对基本类型的压缩向量进行排序,并且不需要单独的排序函子

  3. 对压缩(和排序)的键向量执行reduce_by_key。这再次不需要单独的相等函子。 Thrust 知道如何确定基本类型的压缩向量的相等性。

第二种方法除了不需要任何函子定义外,可能还会更快,因为与第一种方法中存在的 AoS(结构数组)相比,zip_iterator 倾向于改进数据访问.

这是一个演示这两种方法的示例:

$ cat t1004.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>

#include <thrust/iterator/zip_iterator.h>

struct key {
  char k1;
  char k2;
};

struct sort_functor{
  __host__ __device__ bool operator()(key &k1, key &k2){
    if (k1.k1 < k2.k1) return true;
    if (k1.k1 > k2.k1) return false;
    if (k1.k2 < k2.k2) return true;
    return false;}
};

struct equal_key{
  __host__ __device__ bool operator()(key k1, key k2){
    if ((k1.k1 == k2.k1)&&(k1.k2 == k2.k2)) return true;
    return false;}
};

int main(){

  key data[] = {{'B','C'},{'B','D'},{'B','C'},{'B','C'},{'B','D'},{'C','D'}};;
  size_t dsize = sizeof(data)/sizeof(key);


//method 1
  thrust::device_vector<key> keys(data, data+dsize);
  thrust::device_vector<key> keys_out(dsize);
  thrust::device_vector<int> lengths(dsize);
  thrust::sort(keys.begin(), keys.end(), sort_functor());
  int rsize = thrust::reduce_by_key(keys.begin(), keys.end(), thrust::constant_iterator<int>(1), keys_out.begin(), lengths.begin(),equal_key()).first - keys_out.begin();
  std::cout << "Method1:" << std::endl;
  for (int i = 0; i < rsize; i++){
    key temp = keys_out[i];
    int len = lengths[i];
    std::cout << " " << temp.k1 << temp.k2 << " " <<  len << std::endl;}

//method 2

  //get the key data into 2 separate vectors.
  //there are more efficient ways to do this
  //but this is not the crux of your question

  thrust::device_vector<char> k1;
  thrust::device_vector<char> k2;
  for (int i = 0; i < dsize; i++){
    k1.push_back(data[i].k1);
    k2.push_back(data[i].k2);}

  thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(k1.begin(), k2.begin())), thrust::make_zip_iterator(thrust::make_tuple(k1.end(), k2.end())));

  thrust::device_vector<char> k1r(dsize);
  thrust::device_vector<char> k2r(dsize);
  rsize = thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(k1.begin(), k2.begin())), thrust::make_zip_iterator(thrust::make_tuple(k1.end(), k2.end())), thrust::constant_iterator<int>(1), thrust::make_zip_iterator(thrust::make_tuple(k1r.begin(), k2r.begin())), lengths.begin()).first - thrust::make_zip_iterator(thrust::make_tuple(k1r.begin(),k2r.begin()));
  std::cout << "Method2:" << std::endl;
  for (int i = 0; i < rsize; i++){
    char c1 = k1r[i];
    char c2 = k2r[i];
    int len = lengths[i];
    std::cout << " " << c1 << c2 << " " <<  len << std::endl;}

  return 0;
}
$ nvcc -o t1004 t1004.cu
$ ./t1004
Method1:
 BC 3
 BD 2
 CD 1
Method2:
 BC 3
 BD 2
 CD 1
$

这是方法 2 的改进版本。您应该可以直接使用 string/char 数组,并且还可以相当容易地修改此版本以适应 2 到 10 个字符的密钥长度。此方法使用 strided range iterator 直接从数据数组中提取各个关键字符:

$ cat t1004.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>

#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>

template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

typedef thrust::device_vector<char>::iterator cIterator;

int main(){

//method 2

  //get the key data into separate vectors, one per character in key.
#define KEYLEN 2
  const char data[] = "BCBDBCBCBDCD";
  size_t dsize = sizeof(data)/sizeof(char);
  size_t numkeys = dsize/KEYLEN;
  thrust::device_vector<char> keys(data, data+dsize);
  strided_range<cIterator>  *str_k[KEYLEN];
  for (int i = 0; i < KEYLEN; i++)
    str_k[i] = new strided_range<cIterator>(keys.begin()+i, keys.end(), KEYLEN);

//modify this line also if KEYLEN changes (max 10)
  auto my_z = thrust::make_zip_iterator(thrust::make_tuple((*str_k[0]).begin(), (*str_k[1]).begin()));

  thrust::sort(my_z, my_z+numkeys);

  thrust::device_vector<char> kr[KEYLEN];
  for (int i = 0; i < KEYLEN; i++)
    kr[i].resize(numkeys);

//modify this line also if KEYLEN changes (max 10)
  auto my_zr = thrust::make_zip_iterator(thrust::make_tuple(kr[0].begin(), kr[1].begin()));

  thrust::device_vector<int> lengths(numkeys);

  size_t rsize = thrust::reduce_by_key(my_z, my_z + numkeys, thrust::constant_iterator<int>(1), my_zr, lengths.begin()).first - my_zr;
  std::cout << "Method2:" << std::endl;

  for (int i = 0; i < rsize; i++){
    std::cout << " ";
    for (int j = 0; j < KEYLEN; j++){
      char c = kr[j][i];
      std::cout << c; }
    int len = lengths[i];
    std::cout <<" " <<  len << std::endl;}

  return 0;
}
$ nvcc -std=c++11 t1004.cu -o t1004
$ ./t1004
Method2:
 BC 3
 BD 2
 CD 1
$