__device__ class 成员函数更改设备变量值后从设备复制到主机时出现 cudaMemcpy 错误
cudaMemcpy error when copying from device to host after __device__ class member function alters value of device variable
我对我编写的 CUDA 代码的行为感到困惑。我正在为名为 DimmedGridGPU
的 class 中的 __device__
函数编写测试。这个 class 是在 int DIM
上模板化的,我遇到问题的函数是 return 最接近输入值 x
的网格值。我有这个内核命名空间用于单元测试目的,以隔离调用每个 __device__
函数。
此代码的预期行为是 return 来自 do_get_value(x, grid_)
调用的值 3.0
,并将 d_target[0]
设置为此值,然后将其传回到主机端进行单元测试断言。整个内核似乎运行正常,但是当我做最后的传输回主机端时,我收到一个 cudaErrorInvalidValue
错误,我不明白为什么。
这是代码的最小示例,保留了 class 的结构及其功能:
#include <cuda_runtime.h>
#include <fstream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
template <int DIM>
class DimmedGridGPU{
public:
size_t grid_size_;//total size of grid
int b_derivatives_;//if derivatives are going to be used
int b_interpolate_;//if interpolation should be used on the grid
double* grid_;//the grid values
double* grid_deriv_;//derivatives
double dx_[DIM];//grid spacing
double min_[DIM];//grid minimum
double max_[DIM];//maximum
int grid_number_[DIM];//number of points on grid
int b_periodic_[DIM];//if a dimension is periodic
int* d_b_interpolate_;
int* d_b_derivatives_;
DimmedGridGPU(const double* min,
const double* max,
const double* bin_spacing,
const int* b_periodic,
int b_derivatives,
int b_interpolate) : b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){
size_t i;
for(i = 0; i < DIM; i++) {
min_[i] = min[i];
max_[i] = max[i];
b_periodic_[i] = b_periodic[i];
grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
//add one to grid points if
grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
//increment dx to compensate
if(!b_periodic_[i])
max_[i] += dx_[i];
}
grid_size_ = 1;
for(i = 0; i < DIM; i++)
grid_size_ *= grid_number_[i];
gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
if(b_derivatives_) {
gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
if(!grid_deriv_) {
printf("Out of memory!! gpugrid.cuh:initialize");
}
}
gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
}
~DimmedGridGPU(){
gpuErrchk(cudaDeviceSynchronize());
if(grid_ != NULL){
gpuErrchk(cudaFree(grid_));
grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
}
if(grid_deriv_ != NULL){
gpuErrchk(cudaFree(grid_deriv_));
grid_deriv_ = NULL;
}
gpuErrchk(cudaDeviceReset());
}
//gets the value of the grid closest to x
__host__ __device__ double do_get_value( double* x, double* grid_) {
size_t index[DIM];
get_index(x, index);
printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
printf("but multi2one(index) gives us %d\n", multi2one(index));
double value = grid_[multi2one(index)];
printf("and value to be returned is %f\n", value);
return value;
}
//gets grid's 1D index from an array of coordinates
__host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
size_t i;
double xi;
printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
for(i = 0; i < DIM; i++) {
xi = x[i];
printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
if(b_periodic_[i]){
xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
}
result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
}
}
//takes a multidimensional index to a 1D index
__host__ __device__ size_t multi2one(const size_t index[DIM]) const {
size_t result = index[DIM-1];
size_t i;
for(i = DIM - 1; i > 0; i--) {
result = result * grid_number_[i-1] + index[i-1];
}
return result;
}
};
__host__ __device__ int gpu_int_floor(double number) {
return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}
namespace kernels{
template <int DIM>
__global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM> g){
target_arr[0] = g.do_get_value(x, grid_);
printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
return;
}
}
int main(){
using namespace kernels;
double min[] = {0};
double max[] = {10};
double bin_spacing[] = {1};
int periodic[] = {0};
DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
for(int i = 0; i < 11; i++){
g.grid_[i] = i;
printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
}
gpuErrchk(cudaDeviceSynchronize());
double x[] = {3.5};
double* d_x;
gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
double target[] = {5.0};
double* d_target;
gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());
get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
printf("and after GPU stuff, target[0] is now %f\n", target[0]);
return(0);
}
那么,当我包含的打印语句清楚地表明设备上使用了正确的值时,为什么这一行(最后 cudaMemcpy
)抛出错误“CudaErrorInvalidValue
” ,并且 do_get_value(x, grid_)
调用的 return 值是否正确?
我已经尝试过使用 cudaMemcpyFromSymbol
,认为赋值可能是创建一个符号而不是通过某种方式传递和更改值,但事实并非如此,因为 d_target
不是有效符号。
这是我的代码的示例输出:
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument gpugrid.cu 166
So, why does this line (the last cudaMemcpy) throw an error "CudaErrorInvalidValue"...?
问题围绕着你的析构函数:
~DimmedGridGPU(){
析构函数在您可能意想不到的地方被调用。要让自己相信这一点,请向析构函数添加 printf
语句。注意它在打印输出中出现的位置:
$ ./t955
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
Destructor!
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument t955.cu 167
鉴于此,很明显在该析构函数中调用 cudaDeviceReset()
现在似乎是个坏主意。 cudaDeviceReset()
会清除所有设备分配,因此当您尝试这样做时:
gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
d_target
不再是设备上的有效分配,因此当您尝试将其用作 cudaMemcpy
的设备目标时,运行时会检查此指针值(未被更改设备重置)并确定指针值不再对应于有效分配,并引发运行时错误。
Just like in C++ when you pass an object to a function (or a kernel in this case) as a pass-by-value parameter, the copy constructor for that object gets called. It stands to reason when that object copy goes out of scope, the destructor for it will be called.
我建议将 cudaDeviceReset()
这样的影响全局范围的函数放在对象析构函数中可能是一种脆弱的编程范式,但这也许是一个见仁见智的问题。我假设您现在有足够的信息来解决这个问题。
为了避免下一个可能的问题,简单地注释掉析构函数中对 cudaDeviceReset()
的调用可能不足以使所有问题消失(尽管这个特定的问题会)。既然您知道在该程序的正常执行过程中至少 两次 调用了此析构函数,您可能需要仔细考虑该析构函数中还发生了什么,并且可能会删除更多的东西,或者完全重新构建你的class。
例如,请注意 cudaDeviceReset()
并不是唯一可能在以这种方式使用的对象的析构函数中引起问题的函数。同样,当在对象副本上调用的析构函数中使用时,cudaFree()
可能会对原始对象产生意想不到的后果。
我对我编写的 CUDA 代码的行为感到困惑。我正在为名为 DimmedGridGPU
的 class 中的 __device__
函数编写测试。这个 class 是在 int DIM
上模板化的,我遇到问题的函数是 return 最接近输入值 x
的网格值。我有这个内核命名空间用于单元测试目的,以隔离调用每个 __device__
函数。
此代码的预期行为是 return 来自 do_get_value(x, grid_)
调用的值 3.0
,并将 d_target[0]
设置为此值,然后将其传回到主机端进行单元测试断言。整个内核似乎运行正常,但是当我做最后的传输回主机端时,我收到一个 cudaErrorInvalidValue
错误,我不明白为什么。
这是代码的最小示例,保留了 class 的结构及其功能:
#include <cuda_runtime.h>
#include <fstream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
template <int DIM>
class DimmedGridGPU{
public:
size_t grid_size_;//total size of grid
int b_derivatives_;//if derivatives are going to be used
int b_interpolate_;//if interpolation should be used on the grid
double* grid_;//the grid values
double* grid_deriv_;//derivatives
double dx_[DIM];//grid spacing
double min_[DIM];//grid minimum
double max_[DIM];//maximum
int grid_number_[DIM];//number of points on grid
int b_periodic_[DIM];//if a dimension is periodic
int* d_b_interpolate_;
int* d_b_derivatives_;
DimmedGridGPU(const double* min,
const double* max,
const double* bin_spacing,
const int* b_periodic,
int b_derivatives,
int b_interpolate) : b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){
size_t i;
for(i = 0; i < DIM; i++) {
min_[i] = min[i];
max_[i] = max[i];
b_periodic_[i] = b_periodic[i];
grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
//add one to grid points if
grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
//increment dx to compensate
if(!b_periodic_[i])
max_[i] += dx_[i];
}
grid_size_ = 1;
for(i = 0; i < DIM; i++)
grid_size_ *= grid_number_[i];
gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
if(b_derivatives_) {
gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
if(!grid_deriv_) {
printf("Out of memory!! gpugrid.cuh:initialize");
}
}
gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
}
~DimmedGridGPU(){
gpuErrchk(cudaDeviceSynchronize());
if(grid_ != NULL){
gpuErrchk(cudaFree(grid_));
grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
}
if(grid_deriv_ != NULL){
gpuErrchk(cudaFree(grid_deriv_));
grid_deriv_ = NULL;
}
gpuErrchk(cudaDeviceReset());
}
//gets the value of the grid closest to x
__host__ __device__ double do_get_value( double* x, double* grid_) {
size_t index[DIM];
get_index(x, index);
printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
printf("but multi2one(index) gives us %d\n", multi2one(index));
double value = grid_[multi2one(index)];
printf("and value to be returned is %f\n", value);
return value;
}
//gets grid's 1D index from an array of coordinates
__host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
size_t i;
double xi;
printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
for(i = 0; i < DIM; i++) {
xi = x[i];
printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
if(b_periodic_[i]){
xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
}
result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
}
}
//takes a multidimensional index to a 1D index
__host__ __device__ size_t multi2one(const size_t index[DIM]) const {
size_t result = index[DIM-1];
size_t i;
for(i = DIM - 1; i > 0; i--) {
result = result * grid_number_[i-1] + index[i-1];
}
return result;
}
};
__host__ __device__ int gpu_int_floor(double number) {
return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}
namespace kernels{
template <int DIM>
__global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM> g){
target_arr[0] = g.do_get_value(x, grid_);
printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
return;
}
}
int main(){
using namespace kernels;
double min[] = {0};
double max[] = {10};
double bin_spacing[] = {1};
int periodic[] = {0};
DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
for(int i = 0; i < 11; i++){
g.grid_[i] = i;
printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
}
gpuErrchk(cudaDeviceSynchronize());
double x[] = {3.5};
double* d_x;
gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
double target[] = {5.0};
double* d_target;
gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());
get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
printf("and after GPU stuff, target[0] is now %f\n", target[0]);
return(0);
}
那么,当我包含的打印语句清楚地表明设备上使用了正确的值时,为什么这一行(最后 cudaMemcpy
)抛出错误“CudaErrorInvalidValue
” ,并且 do_get_value(x, grid_)
调用的 return 值是否正确?
我已经尝试过使用 cudaMemcpyFromSymbol
,认为赋值可能是创建一个符号而不是通过某种方式传递和更改值,但事实并非如此,因为 d_target
不是有效符号。
这是我的代码的示例输出:
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument gpugrid.cu 166
So, why does this line (the last cudaMemcpy) throw an error "CudaErrorInvalidValue"...?
问题围绕着你的析构函数:
~DimmedGridGPU(){
析构函数在您可能意想不到的地方被调用。要让自己相信这一点,请向析构函数添加 printf
语句。注意它在打印输出中出现的位置:
$ ./t955
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
Destructor!
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument t955.cu 167
鉴于此,很明显在该析构函数中调用 cudaDeviceReset()
现在似乎是个坏主意。 cudaDeviceReset()
会清除所有设备分配,因此当您尝试这样做时:
gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
d_target
不再是设备上的有效分配,因此当您尝试将其用作 cudaMemcpy
的设备目标时,运行时会检查此指针值(未被更改设备重置)并确定指针值不再对应于有效分配,并引发运行时错误。
Just like in C++ when you pass an object to a function (or a kernel in this case) as a pass-by-value parameter, the copy constructor for that object gets called. It stands to reason when that object copy goes out of scope, the destructor for it will be called.
我建议将 cudaDeviceReset()
这样的影响全局范围的函数放在对象析构函数中可能是一种脆弱的编程范式,但这也许是一个见仁见智的问题。我假设您现在有足够的信息来解决这个问题。
为了避免下一个可能的问题,简单地注释掉析构函数中对 cudaDeviceReset()
的调用可能不足以使所有问题消失(尽管这个特定的问题会)。既然您知道在该程序的正常执行过程中至少 两次 调用了此析构函数,您可能需要仔细考虑该析构函数中还发生了什么,并且可能会删除更多的东西,或者完全重新构建你的class。
例如,请注意 cudaDeviceReset()
并不是唯一可能在以这种方式使用的对象的析构函数中引起问题的函数。同样,当在对象副本上调用的析构函数中使用时,cudaFree()
可能会对原始对象产生意想不到的后果。