CUDA or thrust: getting n maximum numbers in a grouped array
I have an array partitioned into n groups of equal size, and I want to get the n maximum numbers, one from each group.
For example:
I compute a groupID for each thread and use atomicMax on a global array, so the performance is poor when the number of groups is small. It looks like thrust::reduce_by_key could do this, but I haven't figured it out yet.
Any better ideas?
One possible approach would be to use thrust::reduce_by_key. At least within thrust, I don't think there is a better method.
Here is a worked example:
$ cat t663.cu
#include <iostream>
#include <iterator>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/fill.h>
#include <stdlib.h>
#define N_GROUPS 4
#define GRP_SIZE 4
#define nTPB 256
#define DSIZE (N_GROUPS * GRP_SIZE)
#define SCALE_MOD 1024
#include <time.h>
#include <sys/time.h>
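// host-side timing helper: returns microseconds elapsed since 'prev'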
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
timeval tv1;
gettimeofday(&tv1,0);
return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}
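// reference kernel: one thread per element; each thread atomically folds its element into its group's result slot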
template <typename T>
__global__ void max_kernel(const T *data, T *result, const int num_groups, const int group_size){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < (num_groups*group_size))
atomicMax(result + (idx/group_size), data[idx]);
}
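// index-to-key functor: maps a flat element index to its group number (index / group size)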
struct idx_fctr : public thrust::unary_function<int, int>
{
int mod;
idx_fctr(const int _mod) : mod(_mod) {}
__host__ __device__ int operator()(const int &val) const {
return val/mod;
}
};
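// binary max functor used as the reduction operator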
template <typename T>
struct max_fctr : public thrust::binary_function<T, T, T>
{
__host__ __device__ T operator()(const T &val1, const T &val2) const {
return (val1>val2)?val1:val2;
}
};
int main(int argc, char *argv[]){
int grp_size = GRP_SIZE;
int n_groups = N_GROUPS;
if (argc > 2){
grp_size = atoi(argv[1]);
n_groups = atoi(argv[2]);}
int *data;
int dsize = grp_size*n_groups;
data = new int[dsize];
for (int i = 0; i < dsize; i++) data[i] = rand()%SCALE_MOD;
thrust::device_vector<int> d_data(data, data+dsize);
thrust::device_vector<int> d_keys_out(n_groups);
thrust::device_vector<int> d_vals_out(n_groups);
unsigned long long dtime = dtime_usec(0);
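// segmented max via reduce_by_key; the group keys are generated on the fly by a transform_iterator over a counting_iterator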
thrust::reduce_by_key(thrust::make_transform_iterator(thrust::make_counting_iterator(0),     idx_fctr(grp_size)),
                      thrust::make_transform_iterator(thrust::make_counting_iterator(dsize), idx_fctr(grp_size)),
                      d_data.begin(), d_keys_out.begin(), d_vals_out.begin(),
                      thrust::equal_to<int>(), max_fctr<int>());
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std::cout << "thrust time: " << dtime/(float)USECPSEC << std::endl;
#ifdef DEBUG_PRINT
std::cout << "data: " << std::endl;
thrust::copy(d_data.begin(), d_data.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "keys: " << std::endl;
thrust::copy(d_keys_out.begin(), d_keys_out.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "maxima: " << std::endl;
thrust::copy(d_vals_out.begin(), d_vals_out.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
thrust::fill(d_vals_out.begin(), d_vals_out.end(), 0);
dtime = dtime_usec(0);
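// benchmark the plain atomicMax kernel on the same data (d_vals_out was zeroed above)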
max_kernel<<<(dsize+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(d_data.data()), thrust::raw_pointer_cast(d_vals_out.data()), n_groups, grp_size);
cudaDeviceSynchronize();
dtime = dtime_usec(dtime);
std::cout << "kernel time: " << dtime/(float)USECPSEC << std::endl;
#ifdef DEBUG_PRINT
std::cout << "data: " << std::endl;
thrust::copy(d_data.begin(), d_data.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "keys: " << std::endl;
thrust::copy(d_keys_out.begin(), d_keys_out.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "maxima: " << std::endl;
thrust::copy(d_vals_out.begin(), d_vals_out.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
#endif
return 0;
}
$ nvcc -arch=sm_20 -o t663 t663.cu
$ ./t663
data:
359,966,105,115,81,255,74,236,809,205,186,939,498,763,483,326,
keys:
0,1,2,3,
maxima:
966,255,939,763,
$
The method I've shown above generates the group keys automatically, on the fly.
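To illustrate what that key sequence looks like, here is a small, self-contained host-side sketch (not part of the code above; div_by is a hypothetical functor mirroring idx_fctr, and a group size of 4 is assumed):

#include <iostream>
#include <iterator>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
// maps a flat element index to its group index, just like idx_fctr above
struct div_by {
int d;
div_by(int _d) : d(_d) {}
__host__ __device__ int operator()(int i) const { return i/d; }
};
int main(){
// with a group size of 4, the key iterator yields 0,0,0,0,1,1,1,1,2,2,2,2
thrust::copy(thrust::make_transform_iterator(thrust::make_counting_iterator(0),  div_by(4)),
             thrust::make_transform_iterator(thrust::make_counting_iterator(12), div_by(4)),
             std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}

No key array is ever stored in memory; reduce_by_key just consumes this iterator range.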
EDIT: I've modified the code so that it is now a benchmark comparison between the original atomicMax kernel and the thrust::reduce_by_key approach. In general, the kernel approach seems to do better for small datasets and small group sizes, while the thrust approach seems to do better for large datasets and larger group sizes. Performance varies with the sizes involved and with the GPU type. To run with different dataset sizes, pass the group size as the first command-line parameter and the number of groups as the second.
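For example, the two cases described next would presumably be invoked as follows (timing output omitted, since it varies by GPU):

$ ./t663 1024 100000
$ ./t663 4 100000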
When I run the code with a group size of 1024 and 100000 groups, the thrust code runs in about 0.03 seconds on a K40c, while the kernel approach takes roughly 4 times as long. If I reduce the group size to 4, the kernel approach becomes about 30 times faster than the thrust approach. The ratios and break-even points will certainly vary with GPU type.