使用 zip_iterator 时 Thrust reduce_by_key 出错
Thrust reduce_by_key error with zip_iterator
我有一个用 double 值填充的展平数组,其中依次存放各个点的 x、y、z 坐标。
我把这个数组拆分到 X、Y、Z 三个数组中,并通过 zip 迭代器使用这些数组:
tpl4zip first = thrust::make_zip_iterator(thrust::make_tuple(X.begin(), Y.begin(), Z.begin(), K.begin()));
tpl4zip last = thrust::make_zip_iterator(thrust::make_tuple(X.end(), Y.end(), Z.end(), K.end()));
我可以毫无问题地对该元组进行排序。
然后我需要做归约(reduce)来计数,所以想使用 reduce_by_key。
但是我为了写出可用的 reduce_by_key 所做的所有尝试都出错了。
有人可以告诉我我做错了什么吗?
提前致谢。
这是我的代码:
// Values to reduce: every input point contributes a count of 1.
thrust::device_vector<int> counter(N/KEYLEN);
thrust::fill(counter.begin(), counter.end(), 1);
// Separate storage for the reduced keys. reduce_by_key requires that the
// input ranges do not overlap either output range, so the reduced keys
// cannot be written back through `first`.
thrust::device_vector<double> X_r(N/KEYLEN);
thrust::device_vector<double> Y_r(N/KEYLEN);
thrust::device_vector<double> Z_r(N/KEYLEN);
thrust::device_vector<int> K_r(N/KEYLEN);
tpl4zip my_zr = thrust::make_zip_iterator(thrust::make_tuple(X_r.begin(), Y_r.begin(), Z_r.begin(), K_r.begin()));
// Reduced counts; every used slot is overwritten, so no initial fill is needed.
thrust::device_vector<int> counter_2(N/KEYLEN);
// The keys are (x, y, z, k) tuples, so the predicate must compare tuples,
// not ints.
// NOTE(review): this compares all four components exactly. If K holds a
// unique index per point, no two keys are ever equal and every count stays 1;
// use a predicate that ignores K to actually merge points.
thrust::equal_to<tpl4int> binary_pred;
thrust::reduce_by_key(
first,
last,
counter.begin(),
my_zr,
counter_2.begin(),
binary_pred,
thrust::plus<int>()
);
我的 TuplePlus 是这样定义的:
struct TuplePlus
{
__host__ __device__
tpl5int operator ()(const tpl5int& lhs, const tpl5int& rhs)
{
int count = thrust::get<1>(lhs)+thrust::get<1>(rhs);
return thrust::make_tuple(thrust::get<0>(lhs),count);
}
};
我的元组和 zip 迭代器是这样的:
#define N 30 // make this evenly divisible by 3 for this example
// Value of one zipped element: (x, y, z, index).
typedef thrust::tuple<double, double, double, int> tpl4int;
typedef thrust::device_vector<double>::iterator doubleiter;
typedef thrust::device_vector<int>::iterator intiter;
// Iterator tuple and zip iterator over the three coordinate columns plus the index column.
typedef thrust::tuple<doubleiter, doubleiter, doubleiter, intiter> tpl4doubleiter;
typedef thrust::zip_iterator<tpl4doubleiter> tpl4zip;
// Value of a (point, count) pair: the point is a value tuple, not an iterator.
typedef thrust::tuple<tpl4int, int> tpl5int;
// Iterator tuple and zip iterator pairing a point zip iterator with a count iterator.
typedef thrust::tuple<tpl4zip, intiter> tpl5doubleiter;
typedef thrust::zip_iterator<tpl5doubleiter> tpl5zip;
这里是完整代码
#include <cmath>
#include <iostream>
#include <vector>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#define N 30 // make this evenly divisible by 3 for this example
// Value of one zipped element: (x, y, z, index).
typedef thrust::tuple<double, double, double, int> tpl4int;
typedef thrust::device_vector<double>::iterator doubleiter;
typedef thrust::device_vector<int>::iterator intiter;
// Iterator tuple and zip iterator over the three coordinate columns plus the index column.
typedef thrust::tuple<doubleiter, doubleiter, doubleiter, intiter> tpl4doubleiter;
typedef thrust::zip_iterator<tpl4doubleiter> tpl4zip;
// Value of a (point, count) pair: the point is a value tuple, not an iterator.
typedef thrust::tuple<tpl4int, int> tpl5int;
// Iterator tuple and zip iterator pairing a point zip iterator with a count iterator.
typedef thrust::tuple<tpl4zip, intiter> tpl5doubleiter;
typedef thrust::zip_iterator<tpl5doubleiter> tpl5zip;
// Ordering functor for (x, y, z, index) tuples.
// Each coordinate is divided by decimPrecision and rounded; tuples are ordered
// by descending rounded x, then descending rounded y, then descending rounded
// z. The index component is not consulted.
struct sort_
{
    double decimPrecision;

    sort_(double _decimPrecision)
        : decimPrecision(_decimPrecision)
    {
    }

    __host__ __device__
    bool operator()(const tpl4int &a,const tpl4int &b) const
    {
        const double xa = round(thrust::get<0>(a)/decimPrecision);
        const double xb = round(thrust::get<0>(b)/decimPrecision);
        if (xa != xb)
        {
            return xa > xb;
        }

        const double ya = round(thrust::get<1>(a)/decimPrecision);
        const double yb = round(thrust::get<1>(b)/decimPrecision);
        if (ya != yb)
        {
            return ya > yb;
        }

        const double za = round(thrust::get<2>(a)/decimPrecision);
        const double zb = round(thrust::get<2>(b)/decimPrecision);
        return za > zb;
    }
};
struct TuplePlus
{
__host__ __device__
tpl5int operator ()(const tpl5int& lhs, const tpl5int& rhs)
{
int count = thrust::get<1>(lhs)+thrust::get<1>(rhs);
return thrust::make_tuple(thrust::get<0>(lhs),count);
}
};
int main()
{
#define KEYLEN 3
thrust::device_vector<double> input(10*KEYLEN);
int i=0;
// input[0] = vec3(0,0,5.005);
input[i++] = 1.0245;
input[i++] = 2.54;
input[i++] = 3.001;
// input[1] = vec3(0,0,5.005);
input[i++] = 2.0;
input[i++] = 1.0;
input[i++] = 5.01125;
// input[2] = vec3(0,0,5.005);
input[i++] = 6.0;
input[i++] = 1.0;
input[i++] = 5.0145;
// input[3] = vec3(2,1,5.001);
input[i++] = 6.0;
input[i++] = 1.0215;
input[i++] = 6.001;
// input[4] = vec3(3,0,5.001);
input[i++] = 6.0;
input[i++] = 1.0845;
input[i++] = 5.00125;
// input[5] = vec3(4,0,5.001);
input[i++] = 5.0;
input[i++] = 0.0;
input[i++] = 5.001;
// input[6] = vec3(5,0,5.001);
input[i++] = 5.0;
input[i++] = 0.0;
input[i++] = 5.001;
// input[7] = vec3(6,0,10.501);
input[i++] = 6.0;
input[i++] = 0.0;
input[i++] = 10.501;
// input[8] = vec3(0,0,5.001);
input[i++] = 1.0;
input[i++] = 0.0;
input[i++] = 5.0015478;
// input[9] = vec3(0,0,5.001);
input[i++] = 6.0;
input[i++] = 1.005;
input[i++] = 5.001;
thrust::device_vector<double> X(N/KEYLEN); // keys in one row
thrust::device_vector<double> Y(N/KEYLEN); // keys in one row
thrust::device_vector<double> Z(N/KEYLEN); // keys in one row
size_t dsize = sizeof(input)/sizeof(double);
size_t numkeys = dsize/KEYLEN;
int index=0;
for( int i = 0; i<N/KEYLEN;i++)
{
X[i]=input[index++];
Y[i]=input[index++];
Z[i]=input[index++];
}
thrust::device_vector<int> K(N/KEYLEN); // keys in one row
thrust::sequence(K.begin(), K.end(), 0); // set index for key
tpl4zip first = thrust::make_zip_iterator(thrust::make_tuple(X.begin(), Y.begin(), Z.begin(), K.begin()));
tpl4zip last = thrust::make_zip_iterator(thrust::make_tuple(X.end(), Y.end(), Z.end(), K.end()));
thrust::sort(first,last,sort_(0.01));
thrust::device_vector<int> counter(N/KEYLEN); // keys in one row
thrust::fill(counter.begin(), counter.end(), 1); // set counter for key
auto my_z = thrust::make_zip_iterator(thrust::make_tuple(first,counter.begin()));
thrust::device_vector<double> X_r(N/KEYLEN); // keys in one row
thrust::device_vector<double> Y_r(N/KEYLEN); // keys in one row
thrust::device_vector<double> Z_r(N/KEYLEN); // keys in one row
thrust::device_vector<int> K_r(N/KEYLEN); // keys in one row
tpl4zip my_zr = thrust::make_zip_iterator(thrust::make_tuple(X_r.begin(), Y_r.begin(), Z_r.begin(), K_r.begin()));
thrust::device_vector<int> counter_2(N/KEYLEN); // keys in one row
thrust::fill(counter_2.begin(), counter_2.end(), 1); // set counter for key
auto pack = thrust::make_zip_iterator(thrust::make_tuple(my_zr,counter_2.begin()));
thrust::equal_to<int> binary_pred;
thrust::reduce_by_key(
first,
last,
my_z,
first,
pack,
binary_pred,
TuplePlus()
);
std::vector<tpl4int> result_sorted(N/KEYLEN);
thrust::copy(first,first+N/KEYLEN,result_sorted.begin());
for (int i=0; i<N/KEYLEN; i++)
{
std::cout << "{ " << result_sorted[i].get<0>() ;
std::cout << ", " << result_sorted[i].get<1>() ;
std::cout << ", " << result_sorted[i].get<2>() ;
std::cout << "} i= " << result_sorted[i].get<3>()<< std::endl;
}
return 0;
}
引用 thrust::reduce_by_key
的 documentation:
The input ranges shall not overlap either output range.
所以你不能在这里给first
作为输出迭代器。此操作无法原地完成。
您还使用了 thrust::equal_to<int>
,而您的键不是 int
类型,而是 tpl4int
类型。
你似乎在这里使用 zip 迭代器使事情变得比它们需要的复杂得多:你用你的值压缩你的键而不是仅仅使用值。要获得预期的结果,您可能只需要
// only to see how it would look if needed:
auto custom_binary_pred = [](const tpl4int &lhs, const tpl4int &rhs) {
return thrust::get<0>(lhs) == thrust::get<0>(rhs) &&
thrust::get<1>(lhs) == thrust::get<1>(rhs) &&
thrust::get<2>(lhs) == thrust::get<2>(rhs) &&
thrust::get<3>(lhs) == thrust::get<3>(rhs);
};
thrust::reduce_by_key(first,
last,
counter.begin(),
my_zr,
counter_2.begin(),
custom_binary_pred,
thrust::plus<int>()); // doesn't need to be specified due to overload of reduce_by_key
我相当确定您甚至不需要指定 custom_binary_pred
,因为 Thrust 很可能已经为其元组实现了 operator==
(不过我尚未在文档中找到相关说明)。
您不需要用 thrust::fill
初始化 counter_2
,因为无论哪种方式,值都会被覆盖。
在后续的代码中,我发现把来自 Python 的展平数组拆分开的效率非常低:把包含 30000000 个元素的输入数组加载到 X、Y、Z 数组中需要 3210 毫秒。
我不知道该如何改进。能否用 Thrust 来完成这一步,或者有没有更快的办法?
thrust::device_vector<double> X(N/KEYLEN); // keys in one row
thrust::device_vector<double> Y(N/KEYLEN); // keys in one row
thrust::device_vector<double> Z(N/KEYLEN); // keys in one row
size_t dsize = sizeof(input)/sizeof(double);
size_t numkeys = dsize/KEYLEN;
int index=0;
for( int i = 0; i<N/KEYLEN;i++)
{
X[i]=input[index++];
Y[i]=input[index++];
Z[i]=input[index++];
}
我有一个用 double 值填充的展平数组,其中依次存放各个点的 x、y、z 坐标。我把这个数组拆分到 X、Y、Z 三个数组中,并通过 zip 迭代器使用这些数组:
tpl4zip first = thrust::make_zip_iterator(thrust::make_tuple(X.begin(), Y.begin(), Z.begin(), K.begin()));
tpl4zip last = thrust::make_zip_iterator(thrust::make_tuple(X.end(), Y.end(), Z.end(), K.end()));
我可以毫无问题地对这些元组进行排序。然后我需要做归约(reduce)来计数,所以想使用 reduce_by_key。但是我为了写出可用的 reduce_by_key 所做的所有尝试都出错了。有人可以告诉我我做错了什么吗?提前致谢。
这是我的代码:
// Values to reduce: every input point contributes a count of 1.
thrust::device_vector<int> counter(N/KEYLEN);
thrust::fill(counter.begin(), counter.end(), 1);
// Separate storage for the reduced keys. reduce_by_key requires that the
// input ranges do not overlap either output range, so the reduced keys
// cannot be written back through `first`.
thrust::device_vector<double> X_r(N/KEYLEN);
thrust::device_vector<double> Y_r(N/KEYLEN);
thrust::device_vector<double> Z_r(N/KEYLEN);
thrust::device_vector<int> K_r(N/KEYLEN);
tpl4zip my_zr = thrust::make_zip_iterator(thrust::make_tuple(X_r.begin(), Y_r.begin(), Z_r.begin(), K_r.begin()));
// Reduced counts; every used slot is overwritten, so no initial fill is needed.
thrust::device_vector<int> counter_2(N/KEYLEN);
// The keys are (x, y, z, k) tuples, so the predicate must compare tuples,
// not ints.
// NOTE(review): this compares all four components exactly. If K holds a
// unique index per point, no two keys are ever equal and every count stays 1;
// use a predicate that ignores K to actually merge points.
thrust::equal_to<tpl4int> binary_pred;
thrust::reduce_by_key(
first,
last,
counter.begin(),
my_zr,
counter_2.begin(),
binary_pred,
thrust::plus<int>()
);
我的 TuplePlus 是这样定义的:
struct TuplePlus
{
__host__ __device__
tpl5int operator ()(const tpl5int& lhs, const tpl5int& rhs)
{
int count = thrust::get<1>(lhs)+thrust::get<1>(rhs);
return thrust::make_tuple(thrust::get<0>(lhs),count);
}
};
我的元组和 zip 迭代器是这样的:
#define N 30 // make this evenly divisible by 3 for this example
// Value of one zipped element: (x, y, z, index).
typedef thrust::tuple<double, double, double, int> tpl4int;
typedef thrust::device_vector<double>::iterator doubleiter;
typedef thrust::device_vector<int>::iterator intiter;
// Iterator tuple and zip iterator over the three coordinate columns plus the index column.
typedef thrust::tuple<doubleiter, doubleiter, doubleiter, intiter> tpl4doubleiter;
typedef thrust::zip_iterator<tpl4doubleiter> tpl4zip;
// Value of a (point, count) pair: the point is a value tuple, not an iterator.
typedef thrust::tuple<tpl4int, int> tpl5int;
// Iterator tuple and zip iterator pairing a point zip iterator with a count iterator.
typedef thrust::tuple<tpl4zip, intiter> tpl5doubleiter;
typedef thrust::zip_iterator<tpl5doubleiter> tpl5zip;
这里是完整代码
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/tuple.h>
#define N 30 // make this evenly divisible by 3 for this example
// Value of one zipped element: (x, y, z, index).
typedef thrust::tuple<double, double, double, int> tpl4int;
typedef thrust::device_vector<double>::iterator doubleiter;
typedef thrust::device_vector<int>::iterator intiter;
// Iterator tuple and zip iterator over the three coordinate columns plus the index column.
typedef thrust::tuple<doubleiter, doubleiter, doubleiter, intiter> tpl4doubleiter;
typedef thrust::zip_iterator<tpl4doubleiter> tpl4zip;
// Value of a (point, count) pair: the point is a value tuple, not an iterator.
typedef thrust::tuple<tpl4int, int> tpl5int;
// Iterator tuple and zip iterator pairing a point zip iterator with a count iterator.
typedef thrust::tuple<tpl4zip, intiter> tpl5doubleiter;
typedef thrust::zip_iterator<tpl5doubleiter> tpl5zip;
// Ordering functor for (x, y, z, index) tuples.
// Each coordinate is divided by decimPrecision and rounded; tuples are ordered
// by descending rounded x, then descending rounded y, then descending rounded
// z. The index component is not consulted.
struct sort_
{
    double decimPrecision;

    sort_(double _decimPrecision)
        : decimPrecision(_decimPrecision)
    {
    }

    __host__ __device__
    bool operator()(const tpl4int &a,const tpl4int &b) const
    {
        const double xa = round(thrust::get<0>(a)/decimPrecision);
        const double xb = round(thrust::get<0>(b)/decimPrecision);
        if (xa != xb)
        {
            return xa > xb;
        }

        const double ya = round(thrust::get<1>(a)/decimPrecision);
        const double yb = round(thrust::get<1>(b)/decimPrecision);
        if (ya != yb)
        {
            return ya > yb;
        }

        const double za = round(thrust::get<2>(a)/decimPrecision);
        const double zb = round(thrust::get<2>(b)/decimPrecision);
        return za > zb;
    }
};
struct TuplePlus
{
__host__ __device__
tpl5int operator ()(const tpl5int& lhs, const tpl5int& rhs)
{
int count = thrust::get<1>(lhs)+thrust::get<1>(rhs);
return thrust::make_tuple(thrust::get<0>(lhs),count);
}
};
int main()
{
#define KEYLEN 3
thrust::device_vector<double> input(10*KEYLEN);
int i=0;
// input[0] = vec3(0,0,5.005);
input[i++] = 1.0245;
input[i++] = 2.54;
input[i++] = 3.001;
// input[1] = vec3(0,0,5.005);
input[i++] = 2.0;
input[i++] = 1.0;
input[i++] = 5.01125;
// input[2] = vec3(0,0,5.005);
input[i++] = 6.0;
input[i++] = 1.0;
input[i++] = 5.0145;
// input[3] = vec3(2,1,5.001);
input[i++] = 6.0;
input[i++] = 1.0215;
input[i++] = 6.001;
// input[4] = vec3(3,0,5.001);
input[i++] = 6.0;
input[i++] = 1.0845;
input[i++] = 5.00125;
// input[5] = vec3(4,0,5.001);
input[i++] = 5.0;
input[i++] = 0.0;
input[i++] = 5.001;
// input[6] = vec3(5,0,5.001);
input[i++] = 5.0;
input[i++] = 0.0;
input[i++] = 5.001;
// input[7] = vec3(6,0,10.501);
input[i++] = 6.0;
input[i++] = 0.0;
input[i++] = 10.501;
// input[8] = vec3(0,0,5.001);
input[i++] = 1.0;
input[i++] = 0.0;
input[i++] = 5.0015478;
// input[9] = vec3(0,0,5.001);
input[i++] = 6.0;
input[i++] = 1.005;
input[i++] = 5.001;
thrust::device_vector<double> X(N/KEYLEN); // keys in one row
thrust::device_vector<double> Y(N/KEYLEN); // keys in one row
thrust::device_vector<double> Z(N/KEYLEN); // keys in one row
size_t dsize = sizeof(input)/sizeof(double);
size_t numkeys = dsize/KEYLEN;
int index=0;
for( int i = 0; i<N/KEYLEN;i++)
{
X[i]=input[index++];
Y[i]=input[index++];
Z[i]=input[index++];
}
thrust::device_vector<int> K(N/KEYLEN); // keys in one row
thrust::sequence(K.begin(), K.end(), 0); // set index for key
tpl4zip first = thrust::make_zip_iterator(thrust::make_tuple(X.begin(), Y.begin(), Z.begin(), K.begin()));
tpl4zip last = thrust::make_zip_iterator(thrust::make_tuple(X.end(), Y.end(), Z.end(), K.end()));
thrust::sort(first,last,sort_(0.01));
thrust::device_vector<int> counter(N/KEYLEN); // keys in one row
thrust::fill(counter.begin(), counter.end(), 1); // set counter for key
auto my_z = thrust::make_zip_iterator(thrust::make_tuple(first,counter.begin()));
thrust::device_vector<double> X_r(N/KEYLEN); // keys in one row
thrust::device_vector<double> Y_r(N/KEYLEN); // keys in one row
thrust::device_vector<double> Z_r(N/KEYLEN); // keys in one row
thrust::device_vector<int> K_r(N/KEYLEN); // keys in one row
tpl4zip my_zr = thrust::make_zip_iterator(thrust::make_tuple(X_r.begin(), Y_r.begin(), Z_r.begin(), K_r.begin()));
thrust::device_vector<int> counter_2(N/KEYLEN); // keys in one row
thrust::fill(counter_2.begin(), counter_2.end(), 1); // set counter for key
auto pack = thrust::make_zip_iterator(thrust::make_tuple(my_zr,counter_2.begin()));
thrust::equal_to<int> binary_pred;
thrust::reduce_by_key(
first,
last,
my_z,
first,
pack,
binary_pred,
TuplePlus()
);
std::vector<tpl4int> result_sorted(N/KEYLEN);
thrust::copy(first,first+N/KEYLEN,result_sorted.begin());
for (int i=0; i<N/KEYLEN; i++)
{
std::cout << "{ " << result_sorted[i].get<0>() ;
std::cout << ", " << result_sorted[i].get<1>() ;
std::cout << ", " << result_sorted[i].get<2>() ;
std::cout << "} i= " << result_sorted[i].get<3>()<< std::endl;
}
return 0;
}
引用 thrust::reduce_by_key
的 documentation:
The input ranges shall not overlap either output range.
所以你不能在这里给first
作为输出迭代器。此操作无法原地完成。
您还使用了 thrust::equal_to<int>
,而您的键不是 int
类型,而是 tpl4int
类型。
你似乎在这里使用 zip 迭代器使事情变得比它们需要的复杂得多:你用你的值压缩你的键而不是仅仅使用值。要获得预期的结果,您可能只需要
// only to see how it would look if needed:
auto custom_binary_pred = [](const tpl4int &lhs, const tpl4int &rhs) {
return thrust::get<0>(lhs) == thrust::get<0>(rhs) &&
thrust::get<1>(lhs) == thrust::get<1>(rhs) &&
thrust::get<2>(lhs) == thrust::get<2>(rhs) &&
thrust::get<3>(lhs) == thrust::get<3>(rhs);
};
thrust::reduce_by_key(first,
last,
counter.begin(),
my_zr,
counter_2.begin(),
custom_binary_pred,
thrust::plus<int>()); // doesn't need to be specified due to overload of reduce_by_key
我相当确定您甚至不需要指定 custom_binary_pred
,因为 Thrust 很可能已经为其元组实现了 operator==
(不过我尚未在文档中找到相关说明)。
您不需要用 thrust::fill
初始化 counter_2
,因为无论哪种方式,值都会被覆盖。
在后续的代码中,我发现把来自 Python 的展平数组拆分开的效率非常低:把包含 30000000 个元素的输入数组加载到 X、Y、Z 数组中需要 3210 毫秒。我不知道该如何改进。能否用 Thrust 来完成这一步,或者有没有更快的办法?
thrust::device_vector<double> X(N/KEYLEN); // keys in one row
thrust::device_vector<double> Y(N/KEYLEN); // keys in one row
thrust::device_vector<double> Z(N/KEYLEN); // keys in one row
size_t dsize = sizeof(input)/sizeof(double);
size_t numkeys = dsize/KEYLEN;
int index=0;
for( int i = 0; i<N/KEYLEN;i++)
{
X[i]=input[index++];
Y[i]=input[index++];
Z[i]=input[index++];
}