将 CUDA 设备交错数组转换为元组以进行向量运算
convert CUDA device interleaved array to tuple for vector operations
如何将包含交错存储浮点数的设备数组转换为 CUDA Thrust 元组(thrust::tuple),以便使用 Thrust 进行向量运算?
目的:我在 CUDA 上使用 Marching Cubes 生成一个粗略的顶点列表。输出是一个顶点列表,有冗余但没有连通性。我希望获得唯一顶点的列表,然后是这些唯一顶点的索引缓冲区,这样我就可以执行一些操作,例如网格简化等...
float *devPtr; // device pointer to an interleaved array of floats
// Each vertex occupies 6 consecutive floats; the array is vertsCount*6*sizeof(float) bytes.
// Layout: [v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, ...]
typedef thrust::tuple<float, float, float, float, float, float> MCVertex;
thrust::device_vector<MCVertex> inputVertices(vertsCount);
// TODO: copy the data from *devPtr into inputVertices.
// Then use something like unique to drop the redundant vertices.
thrust::unique(inputVertices.begin(), inputVertices.end());
我如何实现复制,或者有其他更好的方法吗?
不需要复制,可以组合使用 thrust::zip_iterator 和 strided_range 迭代器。
以下示例适用于每 3 个连续值构成一组的浮点数列表。它当然可以扩展到支持更多分量,只是需要多敲一些代码而已。
第一步是将一些演示数据加载到 GPU 上;这里使用了 thrust::device_vector,但最终得到的是一个 float* 指针,与您现有的情况相同。
借助 strided_range 迭代器和 thrust::zip_iterator,数据先被排序,然后去重压缩。
此代码使用 C++11 功能,因此使用以下代码编译它:
nvcc -std=c++11 unique.cu -o unique
运行./unique
时的输出为:
1 2 3 4 5 6
unique.cu
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
// Bundles an arbitrary number of iterators into one thrust::zip_iterator,
// so that dereferencing it yields a tuple with one element per iterator.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    // pack the iterators first, then wrap the pack in a zip_iterator
    auto bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}
template <typename Iterator>
struct strided_range
{
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main()
{
const int stride = 3;
const int num = 3;
const int size = stride * num;
float values[size] = {1,2,3,
4,5,6,
1,2,3};
// in this example I use thrust vectors to simplify copying from host to device
thrust::host_vector<float> h_vec (values, values+size);
thrust::device_vector<float> d_vec = h_vec;
// in your case, dev_ptr is your input pointer
float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());
auto first = strided_range<float*>(dev_ptr, dev_ptr + size+1-stride, stride);
auto second = strided_range<float*>(dev_ptr+1, dev_ptr + size+1-stride+1, stride);
auto third = strided_range<float*>(dev_ptr+2, dev_ptr + size+1-stride+2, stride);
auto zip_begin = zip(first.begin(),second.begin(), third.begin());
auto zip_end = zip(first.end(), second.end(), third.end());
thrust::sort(thrust::device, zip_begin, zip_end);
auto new_end = thrust::unique(thrust::device, zip_begin,zip_end);
std::size_t new_size = stride * (new_end - zip_begin);
// use the underlying thrust::device_vector again to simplify printing
thrust::copy(d_vec.begin(), d_vec.begin()+new_size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
return 0;
}
顺便说一下:在尝试获取唯一值时要注意浮点数的不准确性。
我还创建了上面示例的通用版本,它自动构建 zip_iterator,并适用于任意数量的连续元素。由于官方的 Thrust 版本还不支持可变参数元组(variadic tuple),我们先用 std::tuple 构建所需的元组类型,然后将其转换为 thrust::tuple。如果 Andrew Corrigan 的 Thrust 分支(它增加了对可变参数元组的支持)被合并到 Thrust 主干中,就可以完全避免使用 std::tuple。
使用以下方法编译此示例:
nvcc generic_unique.cu -std=c++11 -o generic_unique
运行./generic_unique
时的输出为:
input data: 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0
after sort: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 4 5 6
after unique: 0 0 0 0 0 0 1 2 3 4 5 6
generic_unique.cu
#include <tuple>
#include <thrust/tuple.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
// adapted from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
//template <difference_type stride>
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last)
: first(first), last(last) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
};
// copied from
// Appends a type to the end of a variadic type sequence:
// append_to_type_seq<T, TT<Ts...>>::type is TT<Ts..., T>.
// The primary template is empty, so `type` only exists when the second
// argument is an instantiation of a variadic class template.
template<typename, typename>
struct append_to_type_seq { };

template<typename Appended, typename... Existing, template<typename...> class Seq>
struct append_to_type_seq<Appended, Seq<Existing...>>
{
    typedef Seq<Existing..., Appended> type;
};
template<typename T, unsigned int N, template<typename...> class TT>
struct repeat
{
using type = typename
append_to_type_seq<
T,
typename repeat<T, N-1, TT>::type
>::type;
};
template<typename T, template<typename...> class TT>
struct repeat<T, 0, TT>
{
using type = TT<>;
};
// Type-level conversion from std::tuple<T...> to thrust::tuple<T...>.
// Only the std::tuple specialization is defined; for any other argument the
// template stays an incomplete type.
template<typename Tuple>
struct std_to_thrust_tuple;

template<typename... Elements>
struct std_to_thrust_tuple<std::tuple<Elements...>>
{
    typedef thrust::tuple<Elements...> type;
};
template<typename IteratorType, std::size_t stride>
class zipped_strided_range
{
public:
typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;
zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
{
assign<0>();
}
ZipIterator begin() const
{
return thrust::make_zip_iterator(begin_tuple);
}
ZipIterator end() const
{
return thrust::make_zip_iterator(end_tuple);
}
protected:
template <std::size_t index>
void assign(typename std::enable_if< (index < stride) >::type* = 0)
{
strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);
thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
thrust::get<index>(end_tuple) = strided_range_iterator.end();
assign<index+1>();
}
template <std::size_t index>
void assign(typename std::enable_if< (index == stride) >::type* = 0)
{
// end recursion
}
IteratorType first;
IteratorType last;
IteratorTuple begin_tuple;
IteratorTuple end_tuple;
};
// Demo: sorts and de-duplicates 6-float tuples stored interleaved in device
// memory, printing the buffer before sorting, after sorting and after unique.
int main()
{
    const int stride = 6;   // floats per tuple
    const int num = 6;      // number of tuples
    const int size = stride * num;
    float values[size] = {1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0
                         };

    // thrust vectors simplify both the host -> device copy and the printing
    thrust::host_vector<float> host_values(values, values + size);
    thrust::device_vector<float> d_vec(host_values);
    std::ostream_iterator<float> out(std::cout, " ");

    std::cout << "input data: ";
    thrust::copy(d_vec.begin(), d_vec.end(), out);
    std::cout << std::endl;

    // in your case, dev_ptr is your input pointer
    float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());
    zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr + size);
    auto tuples_begin = zipped.begin();
    auto tuples_end   = zipped.end();

    thrust::sort(thrust::device, tuples_begin, tuples_end);

    std::cout << "after sort: ";
    thrust::copy(d_vec.begin(), d_vec.end(), out);
    std::cout << std::endl;

    auto new_end = thrust::unique(thrust::device, tuples_begin, tuples_end);
    // floats that remain: tuples kept times floats per tuple
    std::size_t new_size = stride * (new_end - tuples_begin);

    std::cout << "after unique: ";
    d_vec.resize(new_size);   // drop everything behind the unique tuples
    thrust::copy(d_vec.begin(), d_vec.end(), out);
    std::cout << std::endl;

    return 0;
}
首先,感谢 m.s. 的回答,他的回答为我指明了正确的方向。
请注意,如果您使用的是 Microsoft Visual Studio,则只有 VS2013 支持可变元组。
对于主机编译器的 c++11 功能支持列表(cl.exe,如 VS2013),使用下面的 link。
https://msdn.microsoft.com/en-us/library/hh567368.aspx
PS:确保您正在为 v120 平台工具集构建以利用可变参数模板功能。
感谢@Robert Crovella,VS2013 默认设置了 [-std=c++11],因此不需要设置标志。
回到手头的问题,下面是我如何使用 m.s. 中的代码解决它,但使用 thrust::device_ptr 而不是原始指针。
#include <iostream>
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\sort.h"
#include "thrust\unique.h"
#include "thrust\binary_search.h"
#include "thrust\iterator\zip_iterator.h"
#include "thrust\execution_policy.h"
template <typename Iterator>
struct strided_range
{
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function < difference_type, difference_type >
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
//forcing it to be a 3-tuple one instead of using variadic templates
// Zips exactly three iterators of the same type into one thrust::zip_iterator;
// dereferencing the result yields a 3-tuple of the pointed-to elements.
template<typename Iterator>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterator, Iterator, Iterator>> zip(const Iterator& sr1, const Iterator& sr2, const Iterator& sr3)
{
    typedef thrust::tuple<Iterator, Iterator, Iterator> IteratorTriple;
    return thrust::zip_iterator<IteratorTriple>(thrust::make_tuple(sr1, sr2, sr3));
}
int main()
{
const int stride = 3;
const int num = 6;
const int size = stride * num;
//values on host
float values[size] = { 1, 2, 3,
4, 5, 6,
1, 2, 3,
4, 5, 6,
1, 2, 3,
7, 8, 9 };
//ptr for device
float *d_data;
//allocate memory on the device
cudaMalloc((void**)&d_data, size*sizeof(float));
//copy from host to device
cudaMemcpy(d_data, values, size*sizeof(float), cudaMemcpyHostToDevice);
//a typedef for device_ptr<float>
typedef thrust::device_ptr<float> floatdevptr;
//cast our raw pointer to device pointer
floatdevptr dev_dataptr = thrust::device_pointer_cast(d_data);
//create a device_vector from the dev_dataptr
thrust::device_vector<float> d_vec(dev_dataptr, dev_dataptr + size);
//make a copy
thrust::device_vector<float> d_veccopy = d_vec;
//create a device_vector to hold indices (6 indices for 6 vertices)
thrust::device_vector<unsigned int> indices( num );
//print input values
std::cout << "Input Values : ";
thrust::copy(d_vec.begin(), d_vec.begin() + size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
//a typedef for our strided_range<device_ptr<float>>
typedef strided_range<floatdevptr>::iterator floatdevptr_stridedrangeiterator;
//create the strided_range for x, y and z;
strided_range<floatdevptr> dvx = strided_range<floatdevptr>(dev_dataptr + 0, dev_dataptr + size - stride + 1, stride);
strided_range<floatdevptr> dvy = strided_range<floatdevptr>(dev_dataptr + 1, dev_dataptr + size - stride + 2, stride);
strided_range<floatdevptr> dvz = strided_range<floatdevptr>(dev_dataptr + 2, dev_dataptr + size - stride + 3, stride);
//create zip_iterator for the vertex
auto zip_dv_first = zip<floatdevptr_stridedrangeiterator>(dvx.begin(), dvy.begin(), dvz.begin());
auto zip_dv_last = zip<floatdevptr_stridedrangeiterator>(dvx.end(), dvy.end(), dvz.end());
//sort
thrust::sort(zip_dv_first, zip_dv_last);
//remove duplicates
auto new_dv_last = thrust::unique(zip_dv_first, zip_dv_last);
//compute new size
std::size_t new_dv_size = stride * (new_dv_last - zip_dv_first);
//create the same for the copy.
strided_range<floatdevptr> dvcpyx = strided_range<floatdevptr>(d_veccopy.data() + 0, d_veccopy.data() + size - stride + 1, stride);
strided_range<floatdevptr> dvcpyy = strided_range<floatdevptr>(d_veccopy.data() + 1, d_veccopy.data() + size - stride + 2, stride);
strided_range<floatdevptr> dvcpyz = strided_range<floatdevptr>(d_veccopy.data() + 2, d_veccopy.data() + size - stride + 3, stride);
auto zip_dvcpy_first = zip<floatdevptr_stridedrangeiterator>(dvcpyx.begin(), dvcpyy.begin(), dvcpyz.begin());
auto zip_dvcpy_last = zip<floatdevptr_stridedrangeiterator>(dvcpyx.end(), dvcpyy.end(), dvcpyz.end());
//find index of each input vertex in the list of unique vertices
thrust::lower_bound(zip_dv_first, new_dv_last,
zip_dvcpy_first, zip_dvcpy_last,
indices.begin());
// print unique vertex data
std::cout << "Output Values : ";
thrust::copy(d_vec.begin(), d_vec.begin() + new_dv_size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
// print the indices
std::cout << "Index Values : ";
thrust::copy(indices.begin(), indices.end(), std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
}
输出为:
Input Values : 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 7 8 9
Output Values : 1 2 3 4 5 6 7 8 9
Index Values : 0 1 0 1 0 2
如何将包含交错存储浮点数的设备数组转换为 CUDA Thrust 元组(thrust::tuple),以便使用 Thrust 进行向量运算?
目的:我在 CUDA 上使用 Marching Cubes 生成一个粗略的顶点列表。输出是一个顶点列表,有冗余但没有连通性。我希望获得唯一顶点的列表,然后是这些唯一顶点的索引缓冲区,这样我就可以执行一些操作,例如网格简化等...
float *devPtr; // device pointer to an interleaved array of floats
// Each vertex occupies 6 consecutive floats; the array is vertsCount*6*sizeof(float) bytes.
// Layout: [v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, ...]
typedef thrust::tuple<float, float, float, float, float, float> MCVertex;
thrust::device_vector<MCVertex> inputVertices(vertsCount);
// TODO: copy the data from *devPtr into inputVertices.
// Then use something like unique to drop the redundant vertices.
thrust::unique(inputVertices.begin(), inputVertices.end());
我如何实现复制,或者有其他更好的方法吗?
不需要复制,可以组合使用 thrust::zip_iterator 和 strided_range 迭代器。
以下示例适用于每 3 个连续值构成一组的浮点数列表。它当然可以扩展到支持更多分量,只是需要多敲一些代码而已。
第一步是将一些演示数据加载到 GPU 上;这里使用了 thrust::device_vector,但最终得到的是一个 float* 指针,与您现有的情况相同。
借助 strided_range 迭代器和 thrust::zip_iterator,数据先被排序,然后去重压缩。
此代码使用 C++11 功能,因此使用以下代码编译它:
nvcc -std=c++11 unique.cu -o unique
运行./unique
时的输出为:
1 2 3 4 5 6
unique.cu
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
// Bundles an arbitrary number of iterators into one thrust::zip_iterator,
// so that dereferencing it yields a tuple with one element per iterator.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    // pack the iterators first, then wrap the pack in a zip_iterator
    auto bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}
template <typename Iterator>
struct strided_range
{
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main()
{
const int stride = 3;
const int num = 3;
const int size = stride * num;
float values[size] = {1,2,3,
4,5,6,
1,2,3};
// in this example I use thrust vectors to simplify copying from host to device
thrust::host_vector<float> h_vec (values, values+size);
thrust::device_vector<float> d_vec = h_vec;
// in your case, dev_ptr is your input pointer
float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());
auto first = strided_range<float*>(dev_ptr, dev_ptr + size+1-stride, stride);
auto second = strided_range<float*>(dev_ptr+1, dev_ptr + size+1-stride+1, stride);
auto third = strided_range<float*>(dev_ptr+2, dev_ptr + size+1-stride+2, stride);
auto zip_begin = zip(first.begin(),second.begin(), third.begin());
auto zip_end = zip(first.end(), second.end(), third.end());
thrust::sort(thrust::device, zip_begin, zip_end);
auto new_end = thrust::unique(thrust::device, zip_begin,zip_end);
std::size_t new_size = stride * (new_end - zip_begin);
// use the underlying thrust::device_vector again to simplify printing
thrust::copy(d_vec.begin(), d_vec.begin()+new_size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
return 0;
}
顺便说一下:在尝试获取唯一值时要注意浮点数的不准确性。
我还创建了上面示例的通用版本,它自动构建 zip_iterator,并适用于任意数量的连续元素。由于官方的 Thrust 版本还不支持可变参数元组(variadic tuple),我们先用 std::tuple 构建所需的元组类型,然后将其转换为 thrust::tuple。如果 Andrew Corrigan 的 Thrust 分支(它增加了对可变参数元组的支持)被合并到 Thrust 主干中,就可以完全避免使用 std::tuple。
使用以下方法编译此示例:
nvcc generic_unique.cu -std=c++11 -o generic_unique
运行./generic_unique
时的输出为:
input data: 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0
after sort: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 4 5 6
after unique: 0 0 0 0 0 0 1 2 3 4 5 6
generic_unique.cu
#include <tuple>
#include <thrust/tuple.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
// adapted from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
//template <difference_type stride>
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last)
: first(first), last(last) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
};
// copied from
// Appends a type to the end of a variadic type sequence:
// append_to_type_seq<T, TT<Ts...>>::type is TT<Ts..., T>.
// The primary template is empty, so `type` only exists when the second
// argument is an instantiation of a variadic class template.
template<typename, typename>
struct append_to_type_seq { };

template<typename Appended, typename... Existing, template<typename...> class Seq>
struct append_to_type_seq<Appended, Seq<Existing...>>
{
    typedef Seq<Existing..., Appended> type;
};
template<typename T, unsigned int N, template<typename...> class TT>
struct repeat
{
using type = typename
append_to_type_seq<
T,
typename repeat<T, N-1, TT>::type
>::type;
};
template<typename T, template<typename...> class TT>
struct repeat<T, 0, TT>
{
using type = TT<>;
};
// Type-level conversion from std::tuple<T...> to thrust::tuple<T...>.
// Only the std::tuple specialization is defined; for any other argument the
// template stays an incomplete type.
template<typename Tuple>
struct std_to_thrust_tuple;

template<typename... Elements>
struct std_to_thrust_tuple<std::tuple<Elements...>>
{
    typedef thrust::tuple<Elements...> type;
};
template<typename IteratorType, std::size_t stride>
class zipped_strided_range
{
public:
typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;
zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
{
assign<0>();
}
ZipIterator begin() const
{
return thrust::make_zip_iterator(begin_tuple);
}
ZipIterator end() const
{
return thrust::make_zip_iterator(end_tuple);
}
protected:
template <std::size_t index>
void assign(typename std::enable_if< (index < stride) >::type* = 0)
{
strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);
thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
thrust::get<index>(end_tuple) = strided_range_iterator.end();
assign<index+1>();
}
template <std::size_t index>
void assign(typename std::enable_if< (index == stride) >::type* = 0)
{
// end recursion
}
IteratorType first;
IteratorType last;
IteratorTuple begin_tuple;
IteratorTuple end_tuple;
};
// Demo: sorts and de-duplicates 6-float tuples stored interleaved in device
// memory, printing the buffer before sorting, after sorting and after unique.
int main()
{
    const int stride = 6;   // floats per tuple
    const int num = 6;      // number of tuples
    const int size = stride * num;
    float values[size] = {1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0
                         };

    // thrust vectors simplify both the host -> device copy and the printing
    thrust::host_vector<float> host_values(values, values + size);
    thrust::device_vector<float> d_vec(host_values);
    std::ostream_iterator<float> out(std::cout, " ");

    std::cout << "input data: ";
    thrust::copy(d_vec.begin(), d_vec.end(), out);
    std::cout << std::endl;

    // in your case, dev_ptr is your input pointer
    float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());
    zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr + size);
    auto tuples_begin = zipped.begin();
    auto tuples_end   = zipped.end();

    thrust::sort(thrust::device, tuples_begin, tuples_end);

    std::cout << "after sort: ";
    thrust::copy(d_vec.begin(), d_vec.end(), out);
    std::cout << std::endl;

    auto new_end = thrust::unique(thrust::device, tuples_begin, tuples_end);
    // floats that remain: tuples kept times floats per tuple
    std::size_t new_size = stride * (new_end - tuples_begin);

    std::cout << "after unique: ";
    d_vec.resize(new_size);   // drop everything behind the unique tuples
    thrust::copy(d_vec.begin(), d_vec.end(), out);
    std::cout << std::endl;

    return 0;
}
首先,感谢 m.s. 的回答,他的回答为我指明了正确的方向。
请注意,如果您使用的是 Microsoft Visual Studio,则只有 VS2013 支持可变元组。
对于主机编译器的 c++11 功能支持列表(cl.exe,如 VS2013),使用下面的 link。 https://msdn.microsoft.com/en-us/library/hh567368.aspx
PS:确保您正在为 v120 平台工具集构建以利用可变参数模板功能。
感谢@Robert Crovella,VS2013 默认设置了 [-std=c++11],因此不需要设置标志。
回到手头的问题,下面是我如何使用 m.s. 中的代码解决它,但使用 thrust::device_ptr 而不是原始指针。
#include <iostream>
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\sort.h"
#include "thrust\unique.h"
#include "thrust\binary_search.h"
#include "thrust\iterator\zip_iterator.h"
#include "thrust\execution_policy.h"
template <typename Iterator>
struct strided_range
{
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function < difference_type, difference_type >
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
//forcing it to be a 3-tuple one instead of using variadic templates
// Zips exactly three iterators of the same type into one thrust::zip_iterator;
// dereferencing the result yields a 3-tuple of the pointed-to elements.
template<typename Iterator>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterator, Iterator, Iterator>> zip(const Iterator& sr1, const Iterator& sr2, const Iterator& sr3)
{
    typedef thrust::tuple<Iterator, Iterator, Iterator> IteratorTriple;
    return thrust::zip_iterator<IteratorTriple>(thrust::make_tuple(sr1, sr2, sr3));
}
int main()
{
const int stride = 3;
const int num = 6;
const int size = stride * num;
//values on host
float values[size] = { 1, 2, 3,
4, 5, 6,
1, 2, 3,
4, 5, 6,
1, 2, 3,
7, 8, 9 };
//ptr for device
float *d_data;
//allocate memory on the device
cudaMalloc((void**)&d_data, size*sizeof(float));
//copy from host to device
cudaMemcpy(d_data, values, size*sizeof(float), cudaMemcpyHostToDevice);
//a typedef for device_ptr<float>
typedef thrust::device_ptr<float> floatdevptr;
//cast our raw pointer to device pointer
floatdevptr dev_dataptr = thrust::device_pointer_cast(d_data);
//create a device_vector from the dev_dataptr
thrust::device_vector<float> d_vec(dev_dataptr, dev_dataptr + size);
//make a copy
thrust::device_vector<float> d_veccopy = d_vec;
//create a device_vector to hold indices (6 indices for 6 vertices)
thrust::device_vector<unsigned int> indices( num );
//print input values
std::cout << "Input Values : ";
thrust::copy(d_vec.begin(), d_vec.begin() + size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
//a typedef for our strided_range<device_ptr<float>>
typedef strided_range<floatdevptr>::iterator floatdevptr_stridedrangeiterator;
//create the strided_range for x, y and z;
strided_range<floatdevptr> dvx = strided_range<floatdevptr>(dev_dataptr + 0, dev_dataptr + size - stride + 1, stride);
strided_range<floatdevptr> dvy = strided_range<floatdevptr>(dev_dataptr + 1, dev_dataptr + size - stride + 2, stride);
strided_range<floatdevptr> dvz = strided_range<floatdevptr>(dev_dataptr + 2, dev_dataptr + size - stride + 3, stride);
//create zip_iterator for the vertex
auto zip_dv_first = zip<floatdevptr_stridedrangeiterator>(dvx.begin(), dvy.begin(), dvz.begin());
auto zip_dv_last = zip<floatdevptr_stridedrangeiterator>(dvx.end(), dvy.end(), dvz.end());
//sort
thrust::sort(zip_dv_first, zip_dv_last);
//remove duplicates
auto new_dv_last = thrust::unique(zip_dv_first, zip_dv_last);
//compute new size
std::size_t new_dv_size = stride * (new_dv_last - zip_dv_first);
//create the same for the copy.
strided_range<floatdevptr> dvcpyx = strided_range<floatdevptr>(d_veccopy.data() + 0, d_veccopy.data() + size - stride + 1, stride);
strided_range<floatdevptr> dvcpyy = strided_range<floatdevptr>(d_veccopy.data() + 1, d_veccopy.data() + size - stride + 2, stride);
strided_range<floatdevptr> dvcpyz = strided_range<floatdevptr>(d_veccopy.data() + 2, d_veccopy.data() + size - stride + 3, stride);
auto zip_dvcpy_first = zip<floatdevptr_stridedrangeiterator>(dvcpyx.begin(), dvcpyy.begin(), dvcpyz.begin());
auto zip_dvcpy_last = zip<floatdevptr_stridedrangeiterator>(dvcpyx.end(), dvcpyy.end(), dvcpyz.end());
//find index of each input vertex in the list of unique vertices
thrust::lower_bound(zip_dv_first, new_dv_last,
zip_dvcpy_first, zip_dvcpy_last,
indices.begin());
// print unique vertex data
std::cout << "Output Values : ";
thrust::copy(d_vec.begin(), d_vec.begin() + new_dv_size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
// print the indices
std::cout << "Index Values : ";
thrust::copy(indices.begin(), indices.end(), std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
}
输出为:
Input Values : 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 7 8 9
Output Values : 1 2 3 4 5 6 7 8 9
Index Values : 0 1 0 1 0 2