类似于 Thrust 的 CUB 模板
CUB template similar to thrust
以下是 Thrust 代码:
h_in_value[7] = thrust::reduce(thrust::device, d_in1 + a - b, d_ori_rho_L1 + a);
此处,thrust::reduce
接受起始和结束两个输入迭代器,并将归约结果返回给 CPU(复制到 h_in_value)
可以使用 CUB 获得此功能吗?
- 第一个和最后一个迭代器作为输入
- 返回结果给主机
Can this functionality be obtained using CUB?
是的,使用 CUB 可以实现类似的功能。您需要的大部分内容都可以在 CUB 文档中 cub::DeviceReduce::Sum(求和归约)的示例代码片段里找到。此外,CUB 不会自动将结果复制回主机端,因此需要我们自己处理。下面是一种可能的实现方式:
$ cat t125.cu
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <cub/cub.cuh>
#include <iostream>
typedef int mytype;
const int dsize = 10;
const int val = 1;
// Sum-reduces the device range [begin, end) with cub::DeviceReduce::Sum and
// copies the single result back to the host, so the call looks like
// thrust::reduce(first, last).
//
// Parameters:
//   begin, end - pointers delimiting the input; they are handed to CUB as the
//                device input, so they must be dereferenceable on the device.
// Returns:
//   The sum of the elements. An empty or inverted range yields T(). If any
//   CUDA/CUB call fails, a diagnostic is written to std::cerr and T() is
//   returned instead of an indeterminate value.
// Note:
//   Scratch buffers are allocated and freed on every call; for hot loops the
//   allocations should be hoisted out by the caller.
template <typename T>
T my_cub_reduce(T *begin, T *end){
T res = T();
// Guard against end < begin, which would wrap to a huge unsigned item count.
if (end <= begin) return res;
size_t num_items = end-begin;
T *d_out = NULL;
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// Each step only runs if every previous step succeeded.
cudaError_t err = cudaMalloc(&d_out, sizeof(T));
// First call with a NULL workspace only reports the required scratch size.
if (err == cudaSuccess) err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, begin, d_out, num_items);
// Allocate temporary storage
if (err == cudaSuccess) err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Run sum-reduction
if (err == cudaSuccess) err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, begin, d_out, num_items);
// Blocking copy: brings the result to the host after the reduction finished.
if (err == cudaSuccess) err = cudaMemcpy(&res, d_out, sizeof(T), cudaMemcpyDeviceToHost);
// cudaFree(NULL) is a no-op, so cleanup is safe on every path.
cudaFree(d_temp_storage);
cudaFree(d_out);
if (err != cudaSuccess){
std::cerr << "my_cub_reduce: CUDA error: " << cudaGetErrorString(err) << std::endl;
return T();
}
return res;
}
// Overload for Thrust-style iterators and device pointers: converts both ends
// of the range to raw pointers and forwards to the pointer overload.
// The return type is the iterator's value_type.
// NOTE(review): this presumes the iterator refers to contiguous storage, since
// only the two end addresses are forwarded - confirm for new iterator types.
template <typename T>
typename thrust::iterator_traits<T>::value_type
my_cub_reduce(T begin, T end){
// Address-of the dereferenced iterator, unwrapped by raw_pointer_cast.
return my_cub_reduce(
thrust::raw_pointer_cast(&*begin),
thrust::raw_pointer_cast(&*end));
}
// Demo driver: fills a device array with dsize copies of val, prints the sum
// computed by thrust::reduce and by my_cub_reduce, then exercises the iterator
// overload with a thrust::device_vector and with device pointers.
// Returns 0 on success, 1 if the host or device setup fails.
int main(){
mytype *d_data = NULL;
mytype *h_data = (mytype *)malloc(dsize*sizeof(mytype));
if (h_data == NULL){
std::cerr << "host allocation failed" << std::endl;
return 1;
}
for (int i = 0; i < dsize; i++) h_data[i] = val;
cudaError_t err = cudaMalloc(&d_data, dsize*sizeof(mytype));
if (err == cudaSuccess) err = cudaMemcpy(d_data, h_data, dsize*sizeof(mytype), cudaMemcpyHostToDevice);
// The host staging buffer is not needed once the upload was attempted.
free(h_data);
if (err != cudaSuccess){
std::cerr << "device setup failed: " << cudaGetErrorString(err) << std::endl;
cudaFree(d_data);
return 1;
}
std::cout << "thrust reduce: " << thrust::reduce(thrust::device, d_data, d_data+dsize) << std::endl;
std::cout << "cub reduce: " << my_cub_reduce(d_data, d_data+dsize) << std::endl;
// Release the raw device buffer; the original never freed it.
cudaFree(d_data);
thrust::device_vector<int> d(5,1);
// using thrust style container iterators and pointers
std::cout << my_cub_reduce(d.begin(), d.end()) << std::endl;
std::cout << my_cub_reduce(thrust::device_pointer_cast(d.data()), thrust::device_pointer_cast(d.data()+d.size())) << std::endl;
return 0;
}
$ nvcc -arch=sm_61 -o t125 t125.cu
$ ./t125
thrust reduce: 10
cub reduce: 10
5
5
$
编辑:只需增加几行代码,就可以支持 Thrust 风格的设备容器迭代器和设备指针。我也更新了上面的代码来演示这一点。
以下是 Thrust 代码:
h_in_value[7] = thrust::reduce(thrust::device, d_in1 + a - b, d_ori_rho_L1 + a);
此处,thrust::reduce
接受起始和结束两个输入迭代器,并将归约结果返回给 CPU(复制到 h_in_value)
可以使用 CUB 获得此功能吗?
- 第一个和最后一个迭代器作为输入
- 返回结果给主机
Can this functionality be obtained using CUB?
是的,使用 CUB 可以实现类似的功能。您需要的大部分内容都可以在 CUB 文档中 cub::DeviceReduce::Sum(求和归约)的示例代码片段里找到。此外,CUB 不会自动将结果复制回主机端,因此需要我们自己处理。下面是一种可能的实现方式:
$ cat t125.cu
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <cub/cub.cuh>
#include <iostream>
typedef int mytype;
const int dsize = 10;
const int val = 1;
// Sum-reduces the device range [begin, end) with cub::DeviceReduce::Sum and
// copies the single result back to the host, so the call looks like
// thrust::reduce(first, last).
//
// Parameters:
//   begin, end - pointers delimiting the input; they are handed to CUB as the
//                device input, so they must be dereferenceable on the device.
// Returns:
//   The sum of the elements. An empty or inverted range yields T(). If any
//   CUDA/CUB call fails, a diagnostic is written to std::cerr and T() is
//   returned instead of an indeterminate value.
// Note:
//   Scratch buffers are allocated and freed on every call; for hot loops the
//   allocations should be hoisted out by the caller.
template <typename T>
T my_cub_reduce(T *begin, T *end){
T res = T();
// Guard against end < begin, which would wrap to a huge unsigned item count.
if (end <= begin) return res;
size_t num_items = end-begin;
T *d_out = NULL;
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// Each step only runs if every previous step succeeded.
cudaError_t err = cudaMalloc(&d_out, sizeof(T));
// First call with a NULL workspace only reports the required scratch size.
if (err == cudaSuccess) err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, begin, d_out, num_items);
// Allocate temporary storage
if (err == cudaSuccess) err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Run sum-reduction
if (err == cudaSuccess) err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, begin, d_out, num_items);
// Blocking copy: brings the result to the host after the reduction finished.
if (err == cudaSuccess) err = cudaMemcpy(&res, d_out, sizeof(T), cudaMemcpyDeviceToHost);
// cudaFree(NULL) is a no-op, so cleanup is safe on every path.
cudaFree(d_temp_storage);
cudaFree(d_out);
if (err != cudaSuccess){
std::cerr << "my_cub_reduce: CUDA error: " << cudaGetErrorString(err) << std::endl;
return T();
}
return res;
}
// Overload for Thrust-style iterators and device pointers: converts both ends
// of the range to raw pointers and forwards to the pointer overload.
// The return type is the iterator's value_type.
// NOTE(review): this presumes the iterator refers to contiguous storage, since
// only the two end addresses are forwarded - confirm for new iterator types.
template <typename T>
typename thrust::iterator_traits<T>::value_type
my_cub_reduce(T begin, T end){
// Address-of the dereferenced iterator, unwrapped by raw_pointer_cast.
return my_cub_reduce(
thrust::raw_pointer_cast(&*begin),
thrust::raw_pointer_cast(&*end));
}
// Demo driver: fills a device array with dsize copies of val, prints the sum
// computed by thrust::reduce and by my_cub_reduce, then exercises the iterator
// overload with a thrust::device_vector and with device pointers.
// Returns 0 on success, 1 if the host or device setup fails.
int main(){
mytype *d_data = NULL;
mytype *h_data = (mytype *)malloc(dsize*sizeof(mytype));
if (h_data == NULL){
std::cerr << "host allocation failed" << std::endl;
return 1;
}
for (int i = 0; i < dsize; i++) h_data[i] = val;
cudaError_t err = cudaMalloc(&d_data, dsize*sizeof(mytype));
if (err == cudaSuccess) err = cudaMemcpy(d_data, h_data, dsize*sizeof(mytype), cudaMemcpyHostToDevice);
// The host staging buffer is not needed once the upload was attempted.
free(h_data);
if (err != cudaSuccess){
std::cerr << "device setup failed: " << cudaGetErrorString(err) << std::endl;
cudaFree(d_data);
return 1;
}
std::cout << "thrust reduce: " << thrust::reduce(thrust::device, d_data, d_data+dsize) << std::endl;
std::cout << "cub reduce: " << my_cub_reduce(d_data, d_data+dsize) << std::endl;
// Release the raw device buffer; the original never freed it.
cudaFree(d_data);
thrust::device_vector<int> d(5,1);
// using thrust style container iterators and pointers
std::cout << my_cub_reduce(d.begin(), d.end()) << std::endl;
std::cout << my_cub_reduce(thrust::device_pointer_cast(d.data()), thrust::device_pointer_cast(d.data()+d.size())) << std::endl;
return 0;
}
$ nvcc -arch=sm_61 -o t125 t125.cu
$ ./t125
thrust reduce: 10
cub reduce: 10
5
5
$
编辑:只需增加几行代码,就可以支持 Thrust 风格的设备容器迭代器和设备指针。我也更新了上面的代码来演示这一点。