Value of sum from thrust::reduce not correct
I have been trying to implement some code that requires calling reduce on a thrust::device_ptr, but when the values are large the result does not agree with a CPU implementation. I have to handle large values, so is there a way around this?
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#define NZ 412//
#define NX 402//
using namespace std;
using real =double;
void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
    // Contiguous allocation of 2D arrays
    preal = new real*[dim1];
    preal[0] = new real[dim1 * dim2];
    for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
    for (int i = 0; i < dim1; i++) {
        for (int j = 0; j < dim2; j++) {
            preal[i][j] = 0;
        }
    }
}
#define cudaCheckError(code)                                         \
    {                                                                 \
        if ((code) != cudaSuccess) {                                  \
            fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__,  \
                    __LINE__, cudaGetErrorString(code));              \
        }                                                             \
    }
int main()
{
    real** a;
    std::cout.precision(30);
    allocate_array_2d(a, NZ, NX);  // input array
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            a[i][j] = 2.14748e+09;
        }
    }

    real* da;
    cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
    cudaCheckError(cudaMemcpy(da, a[0], NZ * NX * sizeof(real), cudaMemcpyHostToDevice));

    ///************************
    // CUDA KERNELS ARE HERE
    // REMOVED FOR CLEAR QUESTION
    ///*************************

    real sum1 = 0;
    thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
    sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
    cout << " \nsum gpu " << sum1 << "\n";

    real sum2 = 0;
    //////// CPU PART DOING THE SAME THING //////
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            sum2 += a[i][j];
        }
    }
    cout << "\nsum cpu " << sum2 << "\n";

    if ((sum2 - sum1) < 0.001)
        std::cout << "\nSUCCESS " << "\n";
    else
        std::cout << "\nFailure & by " << sum2 - sum1 << "\n";
}
I am compiling with nvcc, and my graphics card is an NVIDIA 1650, compute capability 7.5.
According to the documentation, thrust expects the type of your sum to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
                                              ^
Your constant is of integer type, which makes int the accumulation type for the reduction; each element is 2.14748e+09, so an int sum overflows almost immediately. If you change it to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you will get matching results between CPU and GPU, according to my testing. (You could also cast the constant to the real type, i.e. (real)0, and use that. There are other ways to address this as well, such as dropping the use of the init value and the binary op altogether, as sketched below.)
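For reference, here is a minimal, self-contained sketch of that last alternative. It uses the two-iterator overload of thrust::reduce, which defaults the init value to the iterator's value_type(0) (double here) and the operator to plus, so the accumulation stays in double; a thrust::device_vector filled with the same value and element count as the question is used for brevity instead of the raw pointer:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <iostream>

int main()
{
    // Same magnitude and element count as the question: 412 * 402 doubles of 2.14748e+09.
    thrust::device_vector<double> d(412 * 402, 2.14748e+09);
    // Two-argument overload: init defaults to double(0) and the operator to plus,
    // so no int-typed constant can change the accumulation type.
    double sum = thrust::reduce(d.begin(), d.end());
    std::cout.precision(30);
    std::cout << "sum gpu " << sum << "\n";  // expected ~3.5567e+14
    return 0;
}

The same idea applies to the raw-pointer version in the question: thrust::reduce(dev_ptr, dev_ptr+NZ*NX) would likewise accumulate in real.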