CUDA/thrust 中分段数据的成对运算
Pairwise operation on segmented data in CUDA/thrust
假设我有
- 一个数据数组,
- 一个数组,其中包含引用数据数组中条目的键和
- 第三个数组,其中包含每个键数组条目的 id
例如
DataType dataArray[5];
int keyArray[10] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int ids[10] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
如何在忽略大小写 key1 == key2
的情况下使用 thrust 对每个 ID 段成对执行自定义运算符 ResultDataType fun(int key1, int key2, int id)
?
在此示例中,我想执行并存储以下结果:
fun(1,2,0)
fun(1,3,0)
fun(2,3,0)
fun(2,1,2)
事实证明,这比我最初想象的要复杂得多。我的解释是,您基本上想要计算 N 个事物的所有组合,一次取两个,不重复,每个片段 (每个片段的 N 不同)。我将提供一个我认为涵盖了您可能想要考虑的大部分概念的答案。当然,这只是一种可能的解决方法。
这可能无法准确地得出您想要的解决方案(尽管我认为它非常接近)。我的目的不是给你一个完成的代码,而是演示一种可能的算法方法。
算法演练如下:
按段对键进行排序。如果您可以保证相似的键(在段内)被组合在一起,则此步骤不是必需的。 (您提供的数据不需要此步骤。)我们可以使用背靠背 stable_sort_by_key
来完成此操作。输出:
d_keys:
1,2,3,1,1,2,2,1,1,1,
d_segs:
0,0,0,1,2,2,2,3,3,3,
将数据集减少到每个段的唯一键(删除重复的键)。我们可以在键和段上与 thrust::unique_copy
以及适当的函子一起执行此操作仅在段内定义键的相等性。输出:
d_ukeys:
1,2,3,1,1,2,1,
d_usegs:
0,0,0,1,2,2,3,
删除重复键后计算每个段的长度。我们可以用 thrust::reduce_by_key
和 constant_iterator
来做到这一点。输出:
d_seg_lens:
3,1,2,1,
现在我们知道了每个段的长度,我们想要删除段长度为 1 的任何段(加上它的关联键)。这些段没有任何可能的键对。为了方便这一点,我们还需要计算每个段的起始索引并创建一个模板来标识长度为 1 的段,然后我们可以使用 remove_if
和 scatter_if
删除关联的段和键数据。输出:
d_seg_idxs:
0,3,4,6,
d_stencil:
0,0,0,1,0,0,1,
和减少的数据:
d_ukeys:
1,2,3,1,2,
d_usegs:
0,0,0,2,2,
d_seg_lens:
3,2,
此时我们的目标是创建一个适当长度的向量,以便它可以为一次取两个 N 事物的每个组合保存一个条目,其中 N 是每个的长度分割。所以对于上面的第一个部分,它有 3 个项目,所以一次取 2 个 3 个东西的组合需要总共存储 3 个组合。同样,上面长度为 2 的第二段在 2 个键之间只有一个唯一组合,因此该段只需要存储一个组合。我们可以使用标准组合公式计算此存储长度,该公式简化为一次取 2 个 N 事物的组合:
C = (N)*(N-1)/2
我们将使用传递给 thrust::transform_reduce
的这个公式以及我们的段长度来计算所需的总存储长度。 (对于此示例,总共有 4 个组合)。
一旦我们确定了总长度,并分配了该长度的必要向量,我们就需要生成属于每个片段的实际组合。这又是一个多步骤序列,首先生成标志以标记(描绘)这些向量中的每个 "segment"。使用标志数组,我们可以使用 exclusive_scan_by_key
生成一个序号序列,标识属于每个位置、每个段的组合。输出:
d_flags:
1,0,0,1,
example ordinal sequence:
0,1,2,0
有了每个段的序数序列,我们现在需要为每个段生成实际的唯一组合。这一步需要一些考虑,以尝试提出一种算法来在固定时间内完成此操作(即,无需迭代)。我想出的算法是将每个序数序列映射到一个矩阵,因此对于具有 4 个组合的段(这将有 4 个东西一次取 2 个,所以总共有 6 个组合,比这个例子中的任何组合都大):
1 2 3
0 C1 C2 C3
1 C4 C5 C6
2
然后我们 "re-map" 在主对角线 下方 的任何组合(Cx
以上)到矩阵中的交替位置,如下所示:
1 2 3
0 C1 C2 C3
1 C5 C6
2 C4
然后我们可以读出独特的组合对作为上述特制矩阵的行和列索引。 (实际上是线性到三角形的映射)所以C1对应逻辑键0和逻辑键1的组合。C6对应逻辑键1和逻辑键3的组合。这个矩阵组装和映射到行和列"indices" 生成唯一组合由传递给 thrust::for_each_n
的 comb_n
函子处理,它也接收段长度和输入段序号序列,并生成 "logical" 键 1 和2 作为输出:
d_lkey1:
1,2,2,1,
d_lkey2:
0,0,1,0,
(补充说明: 我相信我在这里描述的方法与我最近偶然发现的 this question/answer 中讨论的方法相当,尽管它们看起来乍一看不一样。)
最后一个转换步骤是将我们的逻辑键和段转换为"actual"键和段。输出:
d_key1:
2,3,3,2,
d_key2:
1,1,2,1,
d_aseg:
0,0,0,2,
这是实现上述内容的完整代码。它可能不完全符合您的要求,但我认为它与您描述的非常接近,如果您想用推力做到这一点,希望对您的想法有用:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/for_each.h>
#include <thrust/remove.h>
#include <iostream>
#define MY_DISPLAY
typedef float DataType;
// user supplied functor
struct fun
{
DataType *dptr;
fun(DataType *_dptr) : dptr(_dptr) {};
template <typename T>
__host__ __device__
void operator()(const T d1)
{
int key1 = thrust::get<0>(d1);
int key2 = thrust::get<1>(d1);
int id = thrust::get<2>(d1);
DataType my_val = dptr[key1]*100+dptr[key2]*10+id;
printf("result: %f\n", my_val);
}
};
// for determining "equality" of keys
struct my_unique_eq
{
template <typename T>
__host__ __device__
bool operator()(const T d1, const T d2) const
{
return ((thrust::get<0>(d1) == thrust::get<0>(d2))&&(thrust::get<1>(d1) == thrust::get<1>(d2)));
}
};
// BinaryPredicate for the head flag segment representation
// equivalent to thrust::not2(thrust::project2nd<int,int>()));
// from: https://github.com/thrust/thrust/blob/master/examples/scan_by_key.cu
template <typename HeadFlagType>
struct head_flag_predicate
: public thrust::binary_function<HeadFlagType,HeadFlagType,bool>
{
__host__ __device__
bool operator()(HeadFlagType left, HeadFlagType right) const
{
return !right;
}
};
// find nth combination of c things taken 2 at a time
struct comb_n
{
template <typename T>
__host__ __device__
void operator()(const T &d1)
{ // item c from n choose 2
int c = thrust::get<0>(d1);
int n = thrust::get<1>(d1);
int row = c/(n-1);
int col = c%(n-1);
if (col < row) {
col = (n-2)-col;
row = (n-1)-row;
}
thrust::get<2>(d1) = col+1; // lkey1
thrust::get<3>(d1) = row; // lkey2
}
};
using namespace thrust::placeholders;
typedef thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator > mip;
typedef thrust::zip_iterator< thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> > mzi;
int main(){
// set up sample data
DataType dataArray[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int ids[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
int sz0 = sizeof(dataArray)/sizeof(dataArray[0]);
int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);
thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
thrust::host_vector<int> h_segs(ids, ids+sz1);
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_segs = h_segs;
thrust::host_vector<DataType> h_data(dataArray, dataArray+sz0);
thrust::device_vector<DataType> d_data = h_data;
//sort each segment to group like keys together
thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());
#ifdef MY_DISPLAY
std::cout << "d_keys:" << std::endl;
thrust::copy(d_keys.begin(), d_keys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_segs:" << std::endl;
thrust::copy(d_segs.begin(), d_segs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
//now reduce the data set to just unique keys
thrust::device_vector<int> d_ukeys(sz1);
thrust::device_vector<int> d_usegs(sz1);
mzi ip1 = thrust::unique_copy(thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_segs.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_keys.end(), d_segs.end())), thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin())), my_unique_eq());
int sz2 = ip1 - thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin()));
thrust::device_vector<int> d_seg_lens(sz2);
d_ukeys.resize(sz2);
d_usegs.resize(sz2);
#ifdef MY_DISPLAY
std::cout << "d_ukeys:" << std::endl;
thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_usegs:" << std::endl;
thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::device_vector<int> d_seg_nums(sz2);
// compute the length of each segment
mip ip2 = thrust::reduce_by_key(d_usegs.begin(), d_usegs.end(), thrust::make_constant_iterator(1), d_seg_nums.begin(), d_seg_lens.begin());
int sz3 = thrust::get<1>(ip2) - d_seg_lens.begin();
d_seg_nums.resize(sz3);
d_seg_lens.resize(sz3);
thrust::device_vector<int> d_seg_idxs(sz3);
// remove segments that have only one unique key
// compute start index of each segment
thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_seg_idxs.begin());
#ifdef MY_DISPLAY
std::cout << "d_seg_lens:" << std::endl;
thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout << "d_seg_idxs:" << std::endl;
thrust::copy(d_seg_idxs.begin(), d_seg_idxs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// remove if the length of segment is 1
thrust::device_vector<int> d_stencil(d_usegs.size());
thrust::scatter_if(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+d_seg_lens.size(), d_seg_idxs.begin(), d_seg_lens.begin(), d_stencil.begin(), (_1 == 1));
#ifdef MY_DISPLAY
std::cout << "d_stencil:" << std::endl;
thrust::copy(d_stencil.begin(), d_stencil.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::remove_if(d_usegs.begin(), d_usegs.end(), d_stencil.begin(), (_1 == 1));
thrust::remove_if(d_ukeys.begin(), d_ukeys.end(), d_stencil.begin(), (_1 == 1));
int removals = thrust::remove_if(d_seg_lens.begin(), d_seg_lens.end(),(_1 == 1)) - d_seg_lens.begin();
d_usegs.resize(d_usegs.size()-removals);
d_ukeys.resize(d_ukeys.size()-removals);
d_seg_lens.resize(d_seg_lens.size()-removals);
// compute storage length needed (sum of combinations in each segment)
int sz4 = thrust::transform_reduce(d_seg_lens.begin(), d_seg_lens.end(), ((_1)*(_1 - 1))/2, 0, thrust::plus<int>());
#ifdef MY_DISPLAY
std::cout << "d_ukeys:" << std::endl;
thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_usegs:" << std::endl;
thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout << "d_seg_lens:" << std::endl;
thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// now create intra-segment index sequence
// first create flag array to mark segments
thrust::device_vector<int> d_flags(sz4);
// compute flag indexes
thrust::device_vector<int> d_flag_idxs(sz3);
thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_flag_idxs.begin());
// scatter flags
thrust::scatter(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+sz3, d_flag_idxs.begin(), d_flags.begin());
#ifdef MY_DISPLAY
std::cout << "d_flags:" << std::endl;
thrust::copy(d_flags.begin(), d_flags.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// use flag array to create intra-segment index sequence
thrust::device_vector<int> d_seg_i_idxs(d_flags.size());
thrust::exclusive_scan_by_key(d_flags.begin(), d_flags.end(), thrust::make_constant_iterator(1), d_seg_i_idxs.begin(), 0, head_flag_predicate<int>());
// convert flags to segment references
thrust::inclusive_scan(d_flags.begin(), d_flags.end(), d_flags.begin());
thrust::transform(d_flags.begin(), d_flags.end(), d_flags.begin(), (_1 - 1));
// for_each - run functor that uses intra-segment index sequence plus segment
// index to create logical combinations
// input required:
// -- logical segment number
// -- intra-segment index
// -- segment size
// output:
// -- logical key1
// -- logical key2
thrust::device_vector<int> d_lkey1(sz4);
thrust::device_vector<int> d_lkey2(sz4);
thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_seg_i_idxs.begin(), thrust::make_permutation_iterator(d_seg_lens.begin(), d_flags.begin()), d_lkey1.begin(), d_lkey2.begin())), sz4, comb_n());
#ifdef MY_DISPLAY
std::cout << "d_lkey1:" << std::endl;
thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_lkey2:" << std::endl;
thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// convert logical keys and logical segments to actual keys/segments
thrust::device_vector<int> d_key1(sz4);
thrust::device_vector<int> d_key2(sz4);
thrust::device_vector<int> d_aseg(sz4);
thrust::copy_n(thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), sz4, d_aseg.begin());
thrust::transform(d_lkey1.begin(), d_lkey1.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey1.begin(), thrust::plus<int>());
thrust::transform(d_lkey2.begin(), d_lkey2.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey2.begin(), thrust::plus<int>());
thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey1.begin()), sz4, d_key1.begin());
thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey2.begin()), sz4, d_key2.begin());
#ifdef MY_DISPLAY
std::cout << "d_lkey1:" << std::endl;
thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_lkey2:" << std::endl;
thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
#ifdef MY_DISPLAY
std::cout << "d_key1:" << std::endl;
thrust::copy(d_key1.begin(), d_key1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_key2:" << std::endl;
thrust::copy(d_key2.begin(), d_key2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// d_flags has already been converted to logical segment sequence
// just need to convert these via lookup to the actual segments
thrust::unique(d_usegs.begin(), d_usegs.end());
thrust::copy_n(thrust::make_permutation_iterator(d_usegs.begin(), d_flags.begin()), sz4, d_aseg.begin());
#ifdef MY_DISPLAY
std::cout << "d_aseg:" << std::endl;
thrust::copy(d_aseg.begin(), d_aseg.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// run desired user function
thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_key1.begin(), d_key2.begin(), d_aseg.begin())), sz4, fun(thrust::raw_pointer_cast(d_data.data())));
std::cout << "Finished." << std::endl;
return 0;
}
我找到了您问题的解决方案。我混合使用了 cuda 内核和推力原语。我想您的运算符在键参数上具有可交换的 属性,即
fun(key1, key2, id) == fun(key2, key1, id)
我的解决方案分两步生成三个输出数组(keys1、keys2 和 ids)。在第一步中,它只计算输出数组中元素的数量,在第二步中,它填充数组。
基本上该算法运行两次:第一次 "simulates" 写入,第二次实际写入输出。
这是我在输出大小取决于输入数据时使用的常见模式。
步骤如下:
- 按段对键进行排序。此步骤与@Robert Crovella 的算法相同。
- 计算每个 key.Each 线程生成的对数与一个键相关联。每个线程开始计算从其基本索引到段末尾的所有有效对。这一步的输出在向量 d_cpairs
中
- 计算 key1、key2 和 ids 的大小以及那些 arrays.A 中每对 int 的偏移量 d_cpairs 上的总和缩减计算输出数组的大小。在 d_cpairs 上执行的独占扫描操作会生成输出数组中对的位置。此步骤的输出在向量 d_addrs
中
- 用数据填充 keys1、keys2 和 ids。此步骤与步骤3)完全相同,但它实际上将数据写入keys1、keys2和ids。
这是我的算法的输出:
d_keys: 1 2 3 1 1 2 2 1 1 1
d_segs: 0 0 0 1 2 2 2 3 3 3
d_cpairs: 2 1 0 0 1 0 0 0 0 0
num_pairs: 4
d_addrs: 0 2 3 3 3 4 4 4 4 4
keys1: 1 1 2 1
keys2: 2 3 3 2
ids: 0 0 0 2
请注意,输出中的最后一列与您想要的不完全一样,但如果交换 属性 成立就没问题。
这是我的完整代码。可能这不是最快的解决方案,但我认为这很简单。
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#define SHOW_VECTOR(V,size) std::cout << #V << ": "; for(int i =0 ; i < size; i++ ){ std::cout << V[i] << " "; } std::cout << std::endl;
#define RAW_CAST(V) thrust::raw_pointer_cast(V.data())
__global__
void count_kernel(const int *d_keys,const int *d_segs,int *d_cpairs, int siz){
int tidx = threadIdx.x+ blockIdx.x*blockDim.x;
if(tidx < siz){
int sum = 0;
int i = tidx+1;
while(d_segs[i] == d_segs[tidx]){
if(d_keys[i] != d_keys[tidx] &&
d_keys[i] != d_keys[i-1]){
sum++;
}
i++;
}
d_cpairs[tidx] = sum;
}
}
__global__
void scatter_kernel(const int *d_keys,
const int *d_segs,
const int *d_addrs,
int *d_keys1,
int *d_keys2,
int *d_ids,
int siz){
int tidx = threadIdx.x+ blockIdx.x*blockDim.x;
if(tidx < siz){
int base_address = d_addrs[tidx];
int j =0;
int i = tidx+1;
while(d_segs[i] == d_segs[tidx]){
if(d_keys[i] != d_keys[tidx] &&
d_keys[i] != d_keys[i-1]){
d_keys1[base_address+j] = d_keys[tidx];
d_keys2[base_address+j] = d_keys[i];
d_ids[base_address+j] = d_segs[i];
j++;
}
i++;
}
}
}
int main(){
int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int idsArray[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);
thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
thrust::host_vector<int> h_segs(idsArray, idsArray+sz1);
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_segs = h_segs;
thrust::device_vector<int> d_cpairs(sz1);
thrust::device_vector<int> d_addrs(sz1);
//sort each segment to group like keys together
thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());
SHOW_VECTOR(d_keys,sz1);
SHOW_VECTOR(d_segs,sz1);
//count the number of pairs produced by each key
count_kernel<<<1,sz1>>>(RAW_CAST(d_keys),RAW_CAST(d_segs),RAW_CAST(d_cpairs),sz1);
SHOW_VECTOR(d_cpairs,sz1);
//determine the total number of pairs
int num_pairs = thrust::reduce(d_cpairs.begin(),d_cpairs.end());
std::cout << "num_pairs: " << num_pairs << std::endl;
//compute the addresses
thrust::exclusive_scan(d_cpairs.begin(),d_cpairs.end(),d_addrs.begin());
thrust::device_vector<int> keys1(num_pairs);
thrust::device_vector<int> keys2(num_pairs);
thrust::device_vector<int> ids(num_pairs);
SHOW_VECTOR(d_addrs,sz1);
//fill the vector with the keys and ids
scatter_kernel<<<1,sz1>>>(RAW_CAST(d_keys),
RAW_CAST(d_segs),
RAW_CAST(d_addrs),
RAW_CAST(keys1),
RAW_CAST(keys2),
RAW_CAST(ids),
sz1);
SHOW_VECTOR(keys1,num_pairs);
SHOW_VECTOR(keys2,num_pairs);
SHOW_VECTOR(ids,num_pairs);
return 0;
}
假设我有
- 一个数据数组,
- 一个数组,其中包含引用数据数组中条目的键和
- 第三个数组,其中包含每个键数组条目的 id
例如
DataType dataArray[5];
int keyArray[10] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int ids[10] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
如何在忽略大小写 key1 == key2
的情况下使用 thrust 对每个 ID 段成对执行自定义运算符 ResultDataType fun(int key1, int key2, int id)
?
在此示例中,我想执行并存储以下结果:
fun(1,2,0)
fun(1,3,0)
fun(2,3,0)
fun(2,1,2)
事实证明,这比我最初想象的要复杂得多。我的解释是,您基本上想要计算 N 个事物的所有组合,一次取两个,不重复,每个片段 (每个片段的 N 不同)。我将提供一个我认为涵盖了您可能想要考虑的大部分概念的答案。当然,这只是一种可能的解决方法。
这可能无法准确地得出您想要的解决方案(尽管我认为它非常接近)。我的目的不是给你一个完成的代码,而是演示一种可能的算法方法。
算法演练如下:
按段对键进行排序。如果您可以保证相似的键(在段内)被组合在一起,则此步骤不是必需的。 (您提供的数据不需要此步骤。)我们可以使用背靠背
stable_sort_by_key
来完成此操作。输出:d_keys: 1,2,3,1,1,2,2,1,1,1, d_segs: 0,0,0,1,2,2,2,3,3,3,
将数据集减少到每个段的唯一键(删除重复的键)。我们可以在键和段上与
thrust::unique_copy
以及适当的函子一起执行此操作仅在段内定义键的相等性。输出:d_ukeys: 1,2,3,1,1,2,1, d_usegs: 0,0,0,1,2,2,3,
删除重复键后计算每个段的长度。我们可以用
thrust::reduce_by_key
和constant_iterator
来做到这一点。输出:d_seg_lens: 3,1,2,1,
现在我们知道了每个段的长度,我们想要删除段长度为 1 的任何段(加上它的关联键)。这些段没有任何可能的键对。为了方便这一点,我们还需要计算每个段的起始索引并创建一个模板来标识长度为 1 的段,然后我们可以使用
remove_if
和scatter_if
删除关联的段和键数据。输出:d_seg_idxs: 0,3,4,6, d_stencil: 0,0,0,1,0,0,1,
和减少的数据:
d_ukeys: 1,2,3,1,2, d_usegs: 0,0,0,2,2, d_seg_lens: 3,2,
此时我们的目标是创建一个适当长度的向量,以便它可以为一次取两个 N 事物的每个组合保存一个条目,其中 N 是每个的长度分割。所以对于上面的第一个部分,它有 3 个项目,所以一次取 2 个 3 个东西的组合需要总共存储 3 个组合。同样,上面长度为 2 的第二段在 2 个键之间只有一个唯一组合,因此该段只需要存储一个组合。我们可以使用标准组合公式计算此存储长度,该公式简化为一次取 2 个 N 事物的组合:
C = (N)*(N-1)/2
我们将使用传递给
thrust::transform_reduce
的这个公式以及我们的段长度来计算所需的总存储长度。 (对于此示例,总共有 4 个组合)。一旦我们确定了总长度,并分配了该长度的必要向量,我们就需要生成属于每个片段的实际组合。这又是一个多步骤序列,首先生成标志以标记(描绘)这些向量中的每个 "segment"。使用标志数组,我们可以使用
exclusive_scan_by_key
生成一个序号序列,标识属于每个位置、每个段的组合。输出:d_flags: 1,0,0,1, example ordinal sequence: 0,1,2,0
有了每个段的序数序列,我们现在需要为每个段生成实际的唯一组合。这一步需要一些考虑,以尝试提出一种算法来在固定时间内完成此操作(即,无需迭代)。我想出的算法是将每个序数序列映射到一个矩阵,因此对于具有 4 个组合的段(这将有 4 个东西一次取 2 个,所以总共有 6 个组合,比这个例子中的任何组合都大):
1 2 3 0 C1 C2 C3 1 C4 C5 C6 2
然后我们 "re-map" 在主对角线 下方 的任何组合(
Cx
以上)到矩阵中的交替位置,如下所示:1 2 3 0 C1 C2 C3 1 C5 C6 2 C4
然后我们可以读出独特的组合对作为上述特制矩阵的行和列索引。 (实际上是线性到三角形的映射)所以C1对应逻辑键0和逻辑键1的组合。C6对应逻辑键1和逻辑键3的组合。这个矩阵组装和映射到行和列"indices" 生成唯一组合由传递给
thrust::for_each_n
的comb_n
函子处理,它也接收段长度和输入段序号序列,并生成 "logical" 键 1 和2 作为输出:d_lkey1: 1,2,2,1, d_lkey2: 0,0,1,0,
(补充说明: 我相信我在这里描述的方法与我最近偶然发现的 this question/answer 中讨论的方法相当,尽管它们看起来乍一看不一样。)
最后一个转换步骤是将我们的逻辑键和段转换为"actual"键和段。输出:
d_key1: 2,3,3,2, d_key2: 1,1,2,1, d_aseg: 0,0,0,2,
这是实现上述内容的完整代码。它可能不完全符合您的要求,但我认为它与您描述的非常接近,如果您想用推力做到这一点,希望对您的想法有用:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/for_each.h>
#include <thrust/remove.h>
#include <iostream>
#define MY_DISPLAY
typedef float DataType;
// user supplied functor
struct fun
{
DataType *dptr;
fun(DataType *_dptr) : dptr(_dptr) {};
template <typename T>
__host__ __device__
void operator()(const T d1)
{
int key1 = thrust::get<0>(d1);
int key2 = thrust::get<1>(d1);
int id = thrust::get<2>(d1);
DataType my_val = dptr[key1]*100+dptr[key2]*10+id;
printf("result: %f\n", my_val);
}
};
// for determining "equality" of keys
struct my_unique_eq
{
template <typename T>
__host__ __device__
bool operator()(const T d1, const T d2) const
{
return ((thrust::get<0>(d1) == thrust::get<0>(d2))&&(thrust::get<1>(d1) == thrust::get<1>(d2)));
}
};
// BinaryPredicate for the head flag segment representation
// equivalent to thrust::not2(thrust::project2nd<int,int>()));
// from: https://github.com/thrust/thrust/blob/master/examples/scan_by_key.cu
template <typename HeadFlagType>
struct head_flag_predicate
: public thrust::binary_function<HeadFlagType,HeadFlagType,bool>
{
__host__ __device__
bool operator()(HeadFlagType left, HeadFlagType right) const
{
return !right;
}
};
// find nth combination of c things taken 2 at a time
struct comb_n
{
template <typename T>
__host__ __device__
void operator()(const T &d1)
{ // item c from n choose 2
int c = thrust::get<0>(d1);
int n = thrust::get<1>(d1);
int row = c/(n-1);
int col = c%(n-1);
if (col < row) {
col = (n-2)-col;
row = (n-1)-row;
}
thrust::get<2>(d1) = col+1; // lkey1
thrust::get<3>(d1) = row; // lkey2
}
};
using namespace thrust::placeholders;
typedef thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator > mip;
typedef thrust::zip_iterator< thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> > mzi;
int main(){
// set up sample data
DataType dataArray[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int ids[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
int sz0 = sizeof(dataArray)/sizeof(dataArray[0]);
int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);
thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
thrust::host_vector<int> h_segs(ids, ids+sz1);
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_segs = h_segs;
thrust::host_vector<DataType> h_data(dataArray, dataArray+sz0);
thrust::device_vector<DataType> d_data = h_data;
//sort each segment to group like keys together
thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());
#ifdef MY_DISPLAY
std::cout << "d_keys:" << std::endl;
thrust::copy(d_keys.begin(), d_keys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_segs:" << std::endl;
thrust::copy(d_segs.begin(), d_segs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
//now reduce the data set to just unique keys
thrust::device_vector<int> d_ukeys(sz1);
thrust::device_vector<int> d_usegs(sz1);
mzi ip1 = thrust::unique_copy(thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_segs.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_keys.end(), d_segs.end())), thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin())), my_unique_eq());
int sz2 = ip1 - thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin()));
thrust::device_vector<int> d_seg_lens(sz2);
d_ukeys.resize(sz2);
d_usegs.resize(sz2);
#ifdef MY_DISPLAY
std::cout << "d_ukeys:" << std::endl;
thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_usegs:" << std::endl;
thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::device_vector<int> d_seg_nums(sz2);
// compute the length of each segment
mip ip2 = thrust::reduce_by_key(d_usegs.begin(), d_usegs.end(), thrust::make_constant_iterator(1), d_seg_nums.begin(), d_seg_lens.begin());
int sz3 = thrust::get<1>(ip2) - d_seg_lens.begin();
d_seg_nums.resize(sz3);
d_seg_lens.resize(sz3);
thrust::device_vector<int> d_seg_idxs(sz3);
// remove segments that have only one unique key
// compute start index of each segment
thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_seg_idxs.begin());
#ifdef MY_DISPLAY
std::cout << "d_seg_lens:" << std::endl;
thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout << "d_seg_idxs:" << std::endl;
thrust::copy(d_seg_idxs.begin(), d_seg_idxs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// remove if the length of segment is 1
thrust::device_vector<int> d_stencil(d_usegs.size());
thrust::scatter_if(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+d_seg_lens.size(), d_seg_idxs.begin(), d_seg_lens.begin(), d_stencil.begin(), (_1 == 1));
#ifdef MY_DISPLAY
std::cout << "d_stencil:" << std::endl;
thrust::copy(d_stencil.begin(), d_stencil.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::remove_if(d_usegs.begin(), d_usegs.end(), d_stencil.begin(), (_1 == 1));
thrust::remove_if(d_ukeys.begin(), d_ukeys.end(), d_stencil.begin(), (_1 == 1));
int removals = thrust::remove_if(d_seg_lens.begin(), d_seg_lens.end(),(_1 == 1)) - d_seg_lens.begin();
d_usegs.resize(d_usegs.size()-removals);
d_ukeys.resize(d_ukeys.size()-removals);
d_seg_lens.resize(d_seg_lens.size()-removals);
// compute storage length needed (sum of combinations in each segment)
int sz4 = thrust::transform_reduce(d_seg_lens.begin(), d_seg_lens.end(), ((_1)*(_1 - 1))/2, 0, thrust::plus<int>());
#ifdef MY_DISPLAY
std::cout << "d_ukeys:" << std::endl;
thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_usegs:" << std::endl;
thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout << "d_seg_lens:" << std::endl;
thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// now create intra-segment index sequence
// first create flag array to mark segments
thrust::device_vector<int> d_flags(sz4);
// compute flag indexes
thrust::device_vector<int> d_flag_idxs(sz3);
thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_flag_idxs.begin());
// scatter flags
thrust::scatter(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+sz3, d_flag_idxs.begin(), d_flags.begin());
#ifdef MY_DISPLAY
std::cout << "d_flags:" << std::endl;
thrust::copy(d_flags.begin(), d_flags.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// use flag array to create intra-segment index sequence
thrust::device_vector<int> d_seg_i_idxs(d_flags.size());
thrust::exclusive_scan_by_key(d_flags.begin(), d_flags.end(), thrust::make_constant_iterator(1), d_seg_i_idxs.begin(), 0, head_flag_predicate<int>());
// convert flags to segment references
thrust::inclusive_scan(d_flags.begin(), d_flags.end(), d_flags.begin());
thrust::transform(d_flags.begin(), d_flags.end(), d_flags.begin(), (_1 - 1));
// for_each - run functor that uses intra-segment index sequence plus segment
// index to create logical combinations
// input required:
// -- logical segment number
// -- intra-segment index
// -- segment size
// output:
// -- logical key1
// -- logical key2
thrust::device_vector<int> d_lkey1(sz4);
thrust::device_vector<int> d_lkey2(sz4);
thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_seg_i_idxs.begin(), thrust::make_permutation_iterator(d_seg_lens.begin(), d_flags.begin()), d_lkey1.begin(), d_lkey2.begin())), sz4, comb_n());
#ifdef MY_DISPLAY
std::cout << "d_lkey1:" << std::endl;
thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_lkey2:" << std::endl;
thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// convert logical keys and logical segments to actual keys/segments
thrust::device_vector<int> d_key1(sz4);
thrust::device_vector<int> d_key2(sz4);
thrust::device_vector<int> d_aseg(sz4);
thrust::copy_n(thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), sz4, d_aseg.begin());
thrust::transform(d_lkey1.begin(), d_lkey1.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey1.begin(), thrust::plus<int>());
thrust::transform(d_lkey2.begin(), d_lkey2.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey2.begin(), thrust::plus<int>());
thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey1.begin()), sz4, d_key1.begin());
thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey2.begin()), sz4, d_key2.begin());
#ifdef MY_DISPLAY
std::cout << "d_lkey1:" << std::endl;
thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_lkey2:" << std::endl;
thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
#ifdef MY_DISPLAY
std::cout << "d_key1:" << std::endl;
thrust::copy(d_key1.begin(), d_key1.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "d_key2:" << std::endl;
thrust::copy(d_key2.begin(), d_key2.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// d_flags has already been converted to logical segment sequence
// just need to convert these via lookup to the actual segments
thrust::unique(d_usegs.begin(), d_usegs.end());
thrust::copy_n(thrust::make_permutation_iterator(d_usegs.begin(), d_flags.begin()), sz4, d_aseg.begin());
#ifdef MY_DISPLAY
std::cout << "d_aseg:" << std::endl;
thrust::copy(d_aseg.begin(), d_aseg.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
// run desired user function
thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_key1.begin(), d_key2.begin(), d_aseg.begin())), sz4, fun(thrust::raw_pointer_cast(d_data.data())));
std::cout << "Finished." << std::endl;
return 0;
}
我找到了您问题的解决方案。我混合使用了 cuda 内核和推力原语。我想您的运算符在键参数上具有可交换的 属性,即
fun(key1, key2, id) == fun(key2, key1, id)
我的解决方案分两步生成三个输出数组(keys1、keys2 和 ids)。在第一步中,它只计算输出数组中元素的数量,在第二步中,它填充数组。 基本上该算法运行两次:第一次 "simulates" 写入,第二次实际写入输出。 这是我在输出大小取决于输入数据时使用的常见模式。
步骤如下:
- 按段对键进行排序。此步骤与@Robert Crovella 的算法相同。
- 计算每个 key.Each 线程生成的对数与一个键相关联。每个线程开始计算从其基本索引到段末尾的所有有效对。这一步的输出在向量 d_cpairs 中
- 计算 key1、key2 和 ids 的大小以及那些 arrays.A 中每对 int 的偏移量 d_cpairs 上的总和缩减计算输出数组的大小。在 d_cpairs 上执行的独占扫描操作会生成输出数组中对的位置。此步骤的输出在向量 d_addrs 中
- 用数据填充 keys1、keys2 和 ids。此步骤与步骤3)完全相同,但它实际上将数据写入keys1、keys2和ids。
这是我的算法的输出:
d_keys: 1 2 3 1 1 2 2 1 1 1
d_segs: 0 0 0 1 2 2 2 3 3 3
d_cpairs: 2 1 0 0 1 0 0 0 0 0
num_pairs: 4
d_addrs: 0 2 3 3 3 4 4 4 4 4
keys1: 1 1 2 1
keys2: 2 3 3 2
ids: 0 0 0 2
请注意,输出中的最后一列与您想要的不完全一样,但如果交换 属性 成立就没问题。
这是我的完整代码。可能这不是最快的解决方案,但我认为这很简单。
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#define SHOW_VECTOR(V,size) std::cout << #V << ": "; for(int i =0 ; i < size; i++ ){ std::cout << V[i] << " "; } std::cout << std::endl;
#define RAW_CAST(V) thrust::raw_pointer_cast(V.data())
__global__
void count_kernel(const int *d_keys,const int *d_segs,int *d_cpairs, int siz){
int tidx = threadIdx.x+ blockIdx.x*blockDim.x;
if(tidx < siz){
int sum = 0;
int i = tidx+1;
while(d_segs[i] == d_segs[tidx]){
if(d_keys[i] != d_keys[tidx] &&
d_keys[i] != d_keys[i-1]){
sum++;
}
i++;
}
d_cpairs[tidx] = sum;
}
}
__global__
void scatter_kernel(const int *d_keys,
const int *d_segs,
const int *d_addrs,
int *d_keys1,
int *d_keys2,
int *d_ids,
int siz){
int tidx = threadIdx.x+ blockIdx.x*blockDim.x;
if(tidx < siz){
int base_address = d_addrs[tidx];
int j =0;
int i = tidx+1;
while(d_segs[i] == d_segs[tidx]){
if(d_keys[i] != d_keys[tidx] &&
d_keys[i] != d_keys[i-1]){
d_keys1[base_address+j] = d_keys[tidx];
d_keys2[base_address+j] = d_keys[i];
d_ids[base_address+j] = d_segs[i];
j++;
}
i++;
}
}
}
int main(){
int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int idsArray[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};
int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);
thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
thrust::host_vector<int> h_segs(idsArray, idsArray+sz1);
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_segs = h_segs;
thrust::device_vector<int> d_cpairs(sz1);
thrust::device_vector<int> d_addrs(sz1);
//sort each segment to group like keys together
thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());
SHOW_VECTOR(d_keys,sz1);
SHOW_VECTOR(d_segs,sz1);
//count the number of pairs produced by each key
count_kernel<<<1,sz1>>>(RAW_CAST(d_keys),RAW_CAST(d_segs),RAW_CAST(d_cpairs),sz1);
SHOW_VECTOR(d_cpairs,sz1);
//determine the total number of pairs
int num_pairs = thrust::reduce(d_cpairs.begin(),d_cpairs.end());
std::cout << "num_pairs: " << num_pairs << std::endl;
//compute the addresses
thrust::exclusive_scan(d_cpairs.begin(),d_cpairs.end(),d_addrs.begin());
thrust::device_vector<int> keys1(num_pairs);
thrust::device_vector<int> keys2(num_pairs);
thrust::device_vector<int> ids(num_pairs);
SHOW_VECTOR(d_addrs,sz1);
//fill the vector with the keys and ids
scatter_kernel<<<1,sz1>>>(RAW_CAST(d_keys),
RAW_CAST(d_segs),
RAW_CAST(d_addrs),
RAW_CAST(keys1),
RAW_CAST(keys2),
RAW_CAST(ids),
sz1);
SHOW_VECTOR(keys1,num_pairs);
SHOW_VECTOR(keys2,num_pairs);
SHOW_VECTOR(ids,num_pairs);
return 0;
}