如果我使用 31 个块,为什么这个 CUDA 缩减会失败?
Why does this CUDA reduction fail if I use 31 blocks?
以下 CUDA 代码采用标签列表(0、1、2、3、...)并计算这些标签的权重之和。
为了加速计算,我使用了共享内存,以便每个线程维护自己的 运行 总和。在计算结束时,我执行 CUB 块范围的缩减,然后对全局内存进行原子添加。
如果我使用少于 30 个块,CPU 和 GPU 同意结果,但如果我使用超过这个,则不同意。为什么会这样,我该如何解决?
检查代码中的错误代码不会产生任何结果,cuda-gdb 和 cuda-memcheck 不会显示任何未捕获的错误或内存问题。
我在 Nvidia Quadro P2000 上使用 NVCC v10.1.243 和 运行。
MWE
//Compile with, e.g., nvcc -I /z/downloads/cub-1.8.0/ cuda_reduction.cu -arch=sm_61
#include <algorithm>
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <random>
__global__ void group_summer(
const int32_t *const labels,
const float *const weights,
const int num_elements,
const int num_classes,
double *const sums,
uint32_t *const counts
){
constexpr int num_threads = 128;
assert(num_threads==blockDim.x);
//Get shared memory
extern __shared__ int s[];
double *const sums_shmem = (double*)s;
uint32_t *const counts_shmem = (uint32_t*)&sums_shmem[num_threads*num_classes];
double *const my_sums = &sums_shmem [num_classes*threadIdx.x];
uint32_t *const my_counts = &counts_shmem[num_classes*threadIdx.x];
for(int i=0;i<num_threads*num_classes;i+=num_threads){
sums_shmem[i] = 0;
counts_shmem[i] = 0;
}
__syncthreads();
for(int i=blockIdx.x * blockDim.x + threadIdx.x;i<num_elements;i+=gridDim.x*blockDim.x){
// printf("Thread %d at %d looking at %d with %f at %ld and %ld\n", threadIdx.x, i, labels[i], weights[i], (long int)&my_counts[i], (long int)&my_sums[i]);
const auto l = labels[i];
// printf("Before thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
my_sums[l] += weights[i];
my_counts[l]++;
// printf("After thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
}
__syncthreads();
__shared__ cub::BlockReduce<double, num_threads>::TempStorage double_temp_storage;
__shared__ cub::BlockReduce<uint32_t, num_threads>::TempStorage uint32_t_temp_storage;
for(int l=0;l<num_classes;l++){
// printf("Thread %d has %d counts with total weight %f for label %d\n", threadIdx.x, my_counts[l], my_sums[l], l);
const auto sums_total = cub::BlockReduce<double,num_threads>(double_temp_storage).Reduce(my_sums[l], cub::Sum());
const auto counts_total = cub::BlockReduce<uint32_t,num_threads>(uint32_t_temp_storage).Reduce(my_counts[l], cub::Sum());
if(threadIdx.x==0){
atomicAdd(&sums[l], sums_total);
atomicAdd(&counts[l], counts_total);
}
}
}
void group_summer_cpu(
const std::vector<int32_t> &labels,
const std::vector<float> &weights,
std::vector<double> &sums,
std::vector<uint32_t> &counts
){
for(int i=0;i<labels.size();i++){
const auto l = labels[i];
sums[l] += weights[i];
counts[l]++;
}
}
template<class T>
bool vec_nearly_equal(const std::vector<T> &a, const std::vector<T> &b){
if(a.size()!=b.size())
return false;
for(size_t i=0;i<a.size();i++){
if(std::abs(a[i]-b[i])>1e-4)
return false;
}
return true;
}
void TestGroupSummer(std::mt19937 &gen, const int N, const int label_max, const int num_blocks){
std::vector<int32_t> labels(N);
std::vector<float> weights(N);
std::uniform_int_distribution<int> label_dist(0, label_max);
std::uniform_real_distribution<float> weight_dist(0, 5000);
for(int i=0;i<N;i++){
labels[i] = label_dist(gen);
weights[i] = weight_dist(gen);
}
// for(const auto &x: labels) std::cout<<x<<" "; std::cout<<std::endl;
// for(const auto &x: weights) std::cout<<x<<" "; std::cout<<std::endl;
const int num_classes = 1 + *std::max_element(labels.begin(), labels.end());
thrust::device_vector<int32_t> d_labels(labels.size());
thrust::device_vector<float> d_weights(labels.size());
thrust::device_vector<double> d_sums(num_classes);
thrust::device_vector<uint32_t> d_counts(num_classes);
thrust::copy(labels.begin(), labels.end(), d_labels.begin());
thrust::copy(weights.begin(), weights.end(), d_weights.begin());
constexpr int num_threads = 128;
const int shmem = num_threads * num_classes * (sizeof(double)+sizeof(uint32_t));
std::cout<<"Num blocks: "<<num_blocks<<std::endl;
std::cout<<"Shared memory: "<<shmem<<std::endl;
group_summer<<<num_blocks,num_threads,shmem>>>(
thrust::raw_pointer_cast(d_labels.data()),
thrust::raw_pointer_cast(d_weights.data()),
labels.size(),
num_classes,
thrust::raw_pointer_cast(d_sums.data()),
thrust::raw_pointer_cast(d_counts.data())
);
if(cudaGetLastError()!=CUDA_SUCCESS){
std::cout<<"Kernel failed to launch!"<<std::endl;
}
cudaDeviceSynchronize();
if(cudaGetLastError()!=CUDA_SUCCESS){
std::cout<<"Error in kernel!"<<std::endl;
}
std::vector<double> h_sums(num_classes);
std::vector<uint32_t> h_counts(num_classes);
thrust::copy(d_sums.begin(), d_sums.end(), h_sums.begin());
thrust::copy(d_counts.begin(), d_counts.end(), h_counts.begin());
std::vector<double> correct_sums(num_classes);
std::vector<uint32_t> correct_counts(num_classes);
group_summer_cpu(labels, weights, correct_sums, correct_counts);
std::cout<<"Sums good? " <<vec_nearly_equal(h_sums,correct_sums)<<std::endl;
std::cout<<"Counts good? "<<(h_counts==correct_counts)<<std::endl;
std::cout<<"GPU Sums: "; for(const auto &x: h_sums) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"CPU Sums: "; for(const auto &x: correct_sums) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"GPU Counts: "; for(const auto &x: h_counts) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"CPU Counts: "; for(const auto &x: correct_counts) std::cout<<x<<" "; std::cout<<std::endl;
}
int main(){
std::mt19937 gen;
//These all work
TestGroupSummer(gen, 1000000, 10, 30);
TestGroupSummer(gen, 1000000, 10, 30);
TestGroupSummer(gen, 1000000, 10, 30);
TestGroupSummer(gen, 1000000, 10, 30);
//This fails
TestGroupSummer(gen, 1000000, 10, 31);
}
当我在 Tesla V100 上 运行 你的代码时,除了第一次测试之外,所有结果都是失败的。
你这里有问题:
for(int i=0;i<num_threads*num_classes;i+=num_threads){
sums_shmem[i] = 0;
counts_shmem[i] = 0;
}
那是不正确zero-ing出共享内存。您需要将 i=0
更改为 i=threadIdx.x
。
当我进行更改时,一切都会过去。
顺便说一句,这是不正确的:
if(cudaGetLastError()!=CUDA_SUCCESS)
CUDA_SUCCESS
不是用于 运行 时间 API 的正确枚举标记。您应该改用 cudaSuccess
(有 2 个这样的实例)。
我也觉得你的错误比较容易出事:
if(std::abs(a[i]-b[i])>1e-4)
但这似乎不是这里的问题。我通常希望在测试前看到一些缩放。
以下 CUDA 代码采用标签列表(0、1、2、3、...)并计算这些标签的权重之和。
为了加速计算,我使用了共享内存,以便每个线程维护自己的 运行 总和。在计算结束时,我执行 CUB 块范围的缩减,然后对全局内存进行原子添加。
如果我使用少于 30 个块,CPU 和 GPU 同意结果,但如果我使用超过这个,则不同意。为什么会这样,我该如何解决?
检查代码中的错误代码不会产生任何结果,cuda-gdb 和 cuda-memcheck 不会显示任何未捕获的错误或内存问题。
我在 Nvidia Quadro P2000 上使用 NVCC v10.1.243 和 运行。
MWE
//Compile with, e.g., nvcc -I /z/downloads/cub-1.8.0/ cuda_reduction.cu -arch=sm_61
#include <algorithm>
#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <random>
__global__ void group_summer(
const int32_t *const labels,
const float *const weights,
const int num_elements,
const int num_classes,
double *const sums,
uint32_t *const counts
){
constexpr int num_threads = 128;
assert(num_threads==blockDim.x);
//Get shared memory
extern __shared__ int s[];
double *const sums_shmem = (double*)s;
uint32_t *const counts_shmem = (uint32_t*)&sums_shmem[num_threads*num_classes];
double *const my_sums = &sums_shmem [num_classes*threadIdx.x];
uint32_t *const my_counts = &counts_shmem[num_classes*threadIdx.x];
for(int i=0;i<num_threads*num_classes;i+=num_threads){
sums_shmem[i] = 0;
counts_shmem[i] = 0;
}
__syncthreads();
for(int i=blockIdx.x * blockDim.x + threadIdx.x;i<num_elements;i+=gridDim.x*blockDim.x){
// printf("Thread %d at %d looking at %d with %f at %ld and %ld\n", threadIdx.x, i, labels[i], weights[i], (long int)&my_counts[i], (long int)&my_sums[i]);
const auto l = labels[i];
// printf("Before thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
my_sums[l] += weights[i];
my_counts[l]++;
// printf("After thread %d at %d now has %d counts and %lf sums\n", threadIdx.x, i, my_counts[l], my_sums[l]);
}
__syncthreads();
__shared__ cub::BlockReduce<double, num_threads>::TempStorage double_temp_storage;
__shared__ cub::BlockReduce<uint32_t, num_threads>::TempStorage uint32_t_temp_storage;
for(int l=0;l<num_classes;l++){
// printf("Thread %d has %d counts with total weight %f for label %d\n", threadIdx.x, my_counts[l], my_sums[l], l);
const auto sums_total = cub::BlockReduce<double,num_threads>(double_temp_storage).Reduce(my_sums[l], cub::Sum());
const auto counts_total = cub::BlockReduce<uint32_t,num_threads>(uint32_t_temp_storage).Reduce(my_counts[l], cub::Sum());
if(threadIdx.x==0){
atomicAdd(&sums[l], sums_total);
atomicAdd(&counts[l], counts_total);
}
}
}
void group_summer_cpu(
const std::vector<int32_t> &labels,
const std::vector<float> &weights,
std::vector<double> &sums,
std::vector<uint32_t> &counts
){
for(int i=0;i<labels.size();i++){
const auto l = labels[i];
sums[l] += weights[i];
counts[l]++;
}
}
template<class T>
bool vec_nearly_equal(const std::vector<T> &a, const std::vector<T> &b){
if(a.size()!=b.size())
return false;
for(size_t i=0;i<a.size();i++){
if(std::abs(a[i]-b[i])>1e-4)
return false;
}
return true;
}
void TestGroupSummer(std::mt19937 &gen, const int N, const int label_max, const int num_blocks){
std::vector<int32_t> labels(N);
std::vector<float> weights(N);
std::uniform_int_distribution<int> label_dist(0, label_max);
std::uniform_real_distribution<float> weight_dist(0, 5000);
for(int i=0;i<N;i++){
labels[i] = label_dist(gen);
weights[i] = weight_dist(gen);
}
// for(const auto &x: labels) std::cout<<x<<" "; std::cout<<std::endl;
// for(const auto &x: weights) std::cout<<x<<" "; std::cout<<std::endl;
const int num_classes = 1 + *std::max_element(labels.begin(), labels.end());
thrust::device_vector<int32_t> d_labels(labels.size());
thrust::device_vector<float> d_weights(labels.size());
thrust::device_vector<double> d_sums(num_classes);
thrust::device_vector<uint32_t> d_counts(num_classes);
thrust::copy(labels.begin(), labels.end(), d_labels.begin());
thrust::copy(weights.begin(), weights.end(), d_weights.begin());
constexpr int num_threads = 128;
const int shmem = num_threads * num_classes * (sizeof(double)+sizeof(uint32_t));
std::cout<<"Num blocks: "<<num_blocks<<std::endl;
std::cout<<"Shared memory: "<<shmem<<std::endl;
group_summer<<<num_blocks,num_threads,shmem>>>(
thrust::raw_pointer_cast(d_labels.data()),
thrust::raw_pointer_cast(d_weights.data()),
labels.size(),
num_classes,
thrust::raw_pointer_cast(d_sums.data()),
thrust::raw_pointer_cast(d_counts.data())
);
if(cudaGetLastError()!=CUDA_SUCCESS){
std::cout<<"Kernel failed to launch!"<<std::endl;
}
cudaDeviceSynchronize();
if(cudaGetLastError()!=CUDA_SUCCESS){
std::cout<<"Error in kernel!"<<std::endl;
}
std::vector<double> h_sums(num_classes);
std::vector<uint32_t> h_counts(num_classes);
thrust::copy(d_sums.begin(), d_sums.end(), h_sums.begin());
thrust::copy(d_counts.begin(), d_counts.end(), h_counts.begin());
std::vector<double> correct_sums(num_classes);
std::vector<uint32_t> correct_counts(num_classes);
group_summer_cpu(labels, weights, correct_sums, correct_counts);
std::cout<<"Sums good? " <<vec_nearly_equal(h_sums,correct_sums)<<std::endl;
std::cout<<"Counts good? "<<(h_counts==correct_counts)<<std::endl;
std::cout<<"GPU Sums: "; for(const auto &x: h_sums) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"CPU Sums: "; for(const auto &x: correct_sums) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"GPU Counts: "; for(const auto &x: h_counts) std::cout<<x<<" "; std::cout<<std::endl;
std::cout<<"CPU Counts: "; for(const auto &x: correct_counts) std::cout<<x<<" "; std::cout<<std::endl;
}
int main(){
std::mt19937 gen;
//These all work
TestGroupSummer(gen, 1000000, 10, 30);
TestGroupSummer(gen, 1000000, 10, 30);
TestGroupSummer(gen, 1000000, 10, 30);
TestGroupSummer(gen, 1000000, 10, 30);
//This fails
TestGroupSummer(gen, 1000000, 10, 31);
}
当我在 Tesla V100 上 运行 你的代码时,除了第一次测试之外,所有结果都是失败的。
你这里有问题:
for(int i=0;i<num_threads*num_classes;i+=num_threads){
sums_shmem[i] = 0;
counts_shmem[i] = 0;
}
那是不正确zero-ing出共享内存。您需要将 i=0
更改为 i=threadIdx.x
。
当我进行更改时,一切都会过去。
顺便说一句,这是不正确的:
if(cudaGetLastError()!=CUDA_SUCCESS)
CUDA_SUCCESS
不是用于 运行 时间 API 的正确枚举标记。您应该改用 cudaSuccess
(有 2 个这样的实例)。
我也觉得你的错误比较容易出事:
if(std::abs(a[i]-b[i])>1e-4)
但这似乎不是这里的问题。我通常希望在测试前看到一些缩放。