device class 成员函数更改设备变量值后从设备复制到主机时出现 cudaMemcpy 错误

Question

我对我编写的 CUDA 代码的行为感到困惑。我正在为名为 DimmedGridGPU 的 class 中的 __device__ 函数编写测试。这个 class 是在 int DIM 上模板化的，我遇到问题的函数是 return 最接近输入值 x 的网格值。我有这个内核命名空间用于单元测试目的，以隔离调用每个 __device__ 函数。

此代码的预期行为是 return 来自 do_get_value(x, grid_) 调用的值 3.0，并将 d_target[0] 设置为此值，然后将其传回到主机端进行单元测试断言。整个内核似乎运行正常，但是当我做最后的传输回主机端时，我收到一个 cudaErrorInvalidValue 错误，我不明白为什么。

这是代码的最小示例，保留了 class 的结构及其功能：

#include <cuda_runtime.h>
#include <fstream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
     fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}


template <int DIM>
class DimmedGridGPU{

public:
  size_t grid_size_;//total size of grid
  int b_derivatives_;//if derivatives are going to be used
  int b_interpolate_;//if interpolation should be used on the grid
  double* grid_;//the grid values
  double* grid_deriv_;//derivatives    
  double dx_[DIM];//grid spacing
  double min_[DIM];//grid minimum
  double max_[DIM];//maximum
  int grid_number_[DIM];//number of points on grid
  int b_periodic_[DIM];//if a dimension is periodic
  int* d_b_interpolate_;
  int* d_b_derivatives_;


  DimmedGridGPU(const double* min, 
        const double* max, 
        const double* bin_spacing, 
        const int* b_periodic, 
        int b_derivatives, 
        int b_interpolate) :   b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){
    
    size_t i;

    for(i = 0; i < DIM; i++) {
      min_[i] = min[i];
      max_[i] = max[i];
      b_periodic_[i] = b_periodic[i];

      grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
      dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
      //add one to grid points if 
      grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
      //increment dx to compensate
      if(!b_periodic_[i])
    max_[i] += dx_[i];
    }

    grid_size_ = 1;
    for(i = 0; i < DIM; i++)
      grid_size_ *= grid_number_[i];
    gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
    if(b_derivatives_) {
      gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
      if(!grid_deriv_) {
    printf("Out of memory!! gpugrid.cuh:initialize");   
      }
    }
    
    gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
  }

  ~DimmedGridGPU(){
    gpuErrchk(cudaDeviceSynchronize());
    if(grid_ != NULL){
      gpuErrchk(cudaFree(grid_));
      grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
    }
    
    if(grid_deriv_ != NULL){
      gpuErrchk(cudaFree(grid_deriv_));
      grid_deriv_ = NULL;
    }
      
    gpuErrchk(cudaDeviceReset());
  }
//gets the value of the grid closest to x
  __host__ __device__ double do_get_value( double* x, double* grid_) {

    size_t index[DIM];
    get_index(x, index);
    printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
    printf("but multi2one(index) gives us %d\n", multi2one(index));
    double value = grid_[multi2one(index)];
    printf("and value to be returned is %f\n", value);
    return value;
  }
//gets grid's 1D index from an array of coordinates
   __host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
    size_t i;
    double xi;
    printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
    for(i = 0; i < DIM; i++) {
      xi = x[i];
      printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
      if(b_periodic_[i]){
    xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
      }
      result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
    }
  }
//takes a multidimensional index to a 1D index
  __host__ __device__ size_t multi2one(const size_t index[DIM]) const {
    size_t result = index[DIM-1];

    size_t i;    
    for(i = DIM - 1; i > 0; i--) {
      result = result * grid_number_[i-1] + index[i-1];
    }
    
    return result;
    
  }

};

__host__ __device__ int gpu_int_floor(double number) {
  return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}


namespace kernels{
  template <int DIM>
  __global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM>  g){
    target_arr[0] = g.do_get_value(x, grid_);
    printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
    return;
  }
}


int main(){
  using namespace kernels;
  double min[] = {0};
  double max[] = {10};
  double bin_spacing[] = {1};
  int periodic[] = {0};
  DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
  for(int i = 0; i < 11; i++){
    g.grid_[i] = i;
    printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
  }
  gpuErrchk(cudaDeviceSynchronize());
  double x[] = {3.5};
  
  double* d_x;
  gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
  double target[] = {5.0};
  double* d_target;
  gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
  gpuErrchk(cudaDeviceSynchronize());
  get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
  gpuErrchk(cudaDeviceSynchronize());
  gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
  printf("and after GPU stuff, target[0] is now %f\n", target[0]);
  return(0);
}

那么，当我包含的打印语句清楚地表明设备上使用了正确的值时，为什么这一行（最后 cudaMemcpy）抛出错误“CudaErrorInvalidValue” ，并且 do_get_value(x, grid_) 调用的 return 值是否正确？

我已经尝试过使用 cudaMemcpyFromSymbol，认为赋值可能是创建一个符号而不是通过某种方式传递和更改值，但事实并非如此，因为 d_target 不是有效符号。

这是我的代码的示例输出：

g.grid_[0] is now 0.000000

g.grid_[1] is now 1.000000

g.grid_[2] is now 2.000000

g.grid_[3] is now 3.000000

g.grid_[4] is now 4.000000

g.grid_[5] is now 5.000000

g.grid_[6] is now 6.000000

g.grid_[7] is now 7.000000

g.grid_[8] is now 8.000000

g.grid_[9] is now 9.000000

g.grid_[10] is now 10.000000

get_index was called on the GPU in 1 dimension(s)

xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000

do_get_value was called on the GPU!, and index[0] is now 3

but multi2one(index) gives us 3

and value to be returned is 3.000000

get_value_kernel has set target[0] to be 3.000000

GPUassert: "cudaErrorInvalidValue": invalid argument gpugrid.cu 166

Answer 1

So, why does this line (the last cudaMemcpy) throw an error "CudaErrorInvalidValue"...?

问题围绕着你的析构函数：

  ~DimmedGridGPU(){

析构函数在您可能意想不到的地方被调用。要让自己相信这一点，请向析构函数添加 printf 语句。注意它在打印输出中出现的位置：

$ ./t955
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
Destructor!
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument t955.cu 167

鉴于此，很明显在该析构函数中调用 cudaDeviceReset() 现在似乎是个坏主意。 cudaDeviceReset() 会清除所有设备分配，因此当您尝试这样做时：

gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));

d_target 不再是设备上的有效分配，因此当您尝试将其用作 cudaMemcpy 的设备目标时，运行时会检查此指针值（未被更改设备重置）并确定指针值不再对应于有效分配，并引发运行时错误。

Just like in C++ when you pass an object to a function (or a kernel in this case) as a pass-by-value parameter, the copy constructor for that object gets called. It stands to reason when that object copy goes out of scope, the destructor for it will be called.

我建议将 cudaDeviceReset() 这样的影响全局范围的函数放在对象析构函数中可能是一种脆弱的编程范式，但这也许是一个见仁见智的问题。我假设您现在有足够的信息来解决这个问题。

为了避免下一个可能的问题，简单地注释掉析构函数中对 cudaDeviceReset() 的调用可能不足以使所有问题消失（尽管这个特定的问题会）。既然您知道在该程序的正常执行过程中至少两次调用了此析构函数，您可能需要仔细考虑该析构函数中还发生了什么，并且可能会删除更多的东西，或者完全重新构建你的class。

例如，请注意 cudaDeviceReset() 并不是唯一可能在以这种方式使用的对象的析构函数中引起问题的函数。同样，当在对象副本上调用的析构函数中使用时，cudaFree() 可能会对原始对象产生意想不到的后果。

device class 成员函数更改设备变量值后从设备复制到主机时出现 cudaMemcpy 错误

cudaMemcpy error when copying from device to host after device class member function alters value of device variable

c++

templates

cuda

gpu

class

__device__ class 成员函数更改设备变量值后从设备复制到主机时出现 cudaMemcpy 错误

cudaMemcpy error when copying from device to host after __device__ class member function alters value of device variable

c++

templates

cuda

gpu

class

device class 成员函数更改设备变量值后从设备复制到主机时出现 cudaMemcpy 错误

cudaMemcpy error when copying from device to host after device class member function alters value of device variable