如何使步幅块迭代器推力 cuda

Question

我需要一个 class 这样的迭代器

https://github.com/thrust/thrust/blob/master/examples/strided_range.cu

但是这个新迭代器执行下一个序列

[k * size_stride, k * size_stride+1, ...,k * size_stride+size_chunk-1,...]

和

k = 0,1,...,N

示例：

size_stride = 8
size_chunk = 3
N = 3

那么顺序就是

[0,1,2,8,9,10,16,17,18,24,25,26]

我不知道如何有效地做到这一点...

Answer 1

strided range interator 基本上是一个精心设计的置换迭代器，带有一个函子，可以为置换提供适当的索引。

这里是对跨步范围迭代器示例的修改。主要变化是：

包括 chunk 大小作为迭代器参数
修改为置换迭代器提供索引的仿函数以输出所需的序列
调整 .end() 迭代器的定义以提供适当长度的序列。

工作示例：

$ cat t1280.cu
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <iostream>
#include <assert.h>

// this example illustrates how to make strided-chunk access to a range of values
// examples:
//   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 1,1) -> [0, 1, 2, 3, 4, 5, 6]
//   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 2,1) -> [0, 2, 4, 6]
//   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 3,2) -> [0 ,1, 3, 4, 6]
//   ...

template <typename Iterator>
class strided_chunk_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;
        int chunk;
        stride_functor(difference_type stride, int chunk)
            : stride(stride), chunk(chunk) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            int pos = i/chunk;
            return ((pos * stride) + (i-(pos*chunk)));
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_chunk_range(Iterator first, Iterator last, difference_type stride, int chunk)
        : first(first), last(last), stride(stride), chunk(chunk) {assert(chunk<=stride);}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride, chunk)));
    }

    iterator end(void) const
    {
        int lmf = last-first;
        int nfs = lmf/stride;
        int rem = lmf-(nfs*stride);
        return begin() + (nfs*chunk) + ((rem<chunk)?rem:chunk);
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
    int chunk;
};

int main(void)
{
    thrust::device_vector<int> data(50);
    thrust::sequence(data.begin(), data.end());

    typedef thrust::device_vector<int>::iterator Iterator;

    // create strided_chunk_range
    std::cout << "stride 3, chunk 2, length 7" << std::endl;
    strided_chunk_range<Iterator> scr1(data.begin(), data.begin()+7, 3, 2);
    thrust::copy(scr1.begin(), scr1.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;
    std::cout << "stride 8, chunk 3, length 50" << std::endl;
    strided_chunk_range<Iterator> scr(data.begin(), data.end(), 8, 3);
    thrust::copy(scr.begin(), scr.end(), std::ostream_iterator<int>(std::cout, " "));  std::cout << std::endl;

    return 0;
}
$ nvcc -arch=sm_35 -o t1280 t1280.cu
$ ./t1280
stride 3, chunk 2, length 7
0 1 3 4 6
stride 8, chunk 3, length 50
0 1 2 8 9 10 16 17 18 24 25 26 32 33 34 40 41 42 48 49
$

这可能不是最佳实现，特别是因为我们在置换函子中进行除法，但它应该可以帮助您入门。
我假设（并测试）chunk<=stride，因为这对我来说似乎很合理，并且简化了我的思维过程。对于 chunk>stride.

如何使步幅块迭代器推力 cuda

How make a stride chunk iterator thrust cuda

parallel-processing

cuda

gpgpu

thrust