从设备复制到主机时,cudaMemcpy 抛出 InvalidValue 错误

cudaMemcpy throws InvalidValue error when copying from device to host

我一直在尝试使用 cuFFT 实现一维 FFT,但程序抛出了 InvalidValue 错误,并且没有产生有意义的结果。

我已尝试确保捕获每一个错误,并且我认为问题出在 DeviceToHost 方向的 cudaMemcpy 上,但我不确定原因,也不知道如何修复。cudaMemcpy 的数据大小参数遵循 cuFFT 文档给出的关系。


#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cufft.h>

// cuFFT configuration macros.
// NOTE(review): RANK, IDIST, ISTRIDE, ODIST and OSTRIDE are not referenced by
// the visible code; they look like arguments intended for cufftPlanMany --
// TODO confirm before relying on them.
#define NX            100         // transform length: number of real input samples
#define BATCH         1           // number of FFTs the plan performs
#define RANK          1           // presumably the transform dimensionality (1 = 1D); unused here
#define IDIST         1           // distance between the first elements of consecutive input batches
#define ISTRIDE       1           // stride between consecutive input elements
#define ODIST         1           // distance between the first elements of consecutive output batches
#define OSTRIDE       1           // stride between consecutive output elements

void fft1d() {

  // create plan for performing fft
  cufftHandle plan;
  if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS) {
    printf("Failed to create 1D plan\n");
    return;
  }

  // assemble data
  double temp_data[] = {2.598076211353316, 3.2402830701637395, 3.8494572900049224, 4.419388724529261, 4.944267282795252, 5.41874215947433, 5.837976382931011, 6.197696125093141, 6.494234270429254, 6.724567799874842, 6.886348608602047, 6.97792744346504, 6.998370716093996, 6.9474700202387565, 6.8257442563389, 6.6344343416615565, 6.37549055993378, 6.051552679431957, 5.665923042211819, 5.222532898817316, 4.725902331664744, 4.181094175657916, 3.5936624057845576, 2.9695955178498603, 2.315255479544737, 1.6373128742041732, 0.9426788984240022, 0.23843490677753865, -0.46823977812093664, -1.1701410542749289, -1.8601134815746807, -2.531123226988873, -3.176329770049035, -3.7891556376344524, -4.363353457155562, -4.893069644570959, -5.3729040779788875, -5.797965148448726, -6.163919626883915, -6.467036838555256, -6.704226694973039, -6.873071195387157, -6.971849076777267, -6.999553361041935, -6.955901620504255, -6.84133885708361, -6.657032965782207, -6.404862828733319, -6.0873991611848375, -5.707878304681281, -5.270169234606201, -4.778734118422206, -4.23858282669252, -3.6552218606153755, -3.0345982167228436, -2.383038761007964, -1.707185730522749, -1.0139290199674, -0.31033594356630245, 0.39642081173600463, 1.0991363072871054, 1.7906468025248339, 2.463902784786862, 3.1120408346390414, 3.728453594100783, 4.306857124485735, 4.841354967187034, 5.326498254347925, 5.757341256627454, 6.129491801786784, 6.439156050110601, 6.683177170206378, 6.859067520906216, 6.965034011197066, 6.999996379650895, 6.963598207007518, 6.85621054964381, 6.678928156888352, 6.433558310743566, 6.122602401787424, 5.749230429076629, 5.317248684008804, 4.831060947586139, 4.295623596650021, 3.7163950767501706, 3.0992802567403803, 2.4505702323708074, 1.7768781925409076, 1.0850720020162676, 0.3822041878858906, -0.3245599564963766, -1.0280154171511335, -1.7209909100394047, -2.3964219877733033, -3.0474230571943477, -3.667357573646071, -4.249905696354359, -4.78912871521179, -5.279529592175676, -5.716109000098287};
  cufftReal *idata;
  cudaMalloc((void**) &idata, sizeof(cufftComplex)*NX);
  if (cudaGetLastError() != cudaSuccess) {
    printf("Failed to allocate memory space for input data.\n");
    return;
  }

  cudaMemcpy(idata, temp_data, sizeof(temp_data)/sizeof(double), cudaMemcpyHostToDevice);
  if (cudaGetLastError() != cudaSuccess) {
    printf("Failed to load time data to memory.\n");
    return;
  }

  // prepare memory for return data
  cufftComplex *odata;
  cudaMalloc((void**) &odata, sizeof(cufftComplex)*(NX/2 + 1));
  if (cudaGetLastError() != cudaSuccess) {
    printf("Failed to allocate memory for output data.\n");
  }

  // perform fft
  if (cufftExecR2C(plan, idata, odata) != CUFFT_SUCCESS) {
    printf("Failed to perform fft.\n");
    return;
  }

我认为错误是在这里抛出的,在 cudaMemcpy 处。


  // grab data from graphics and print (memcpy waits until complete) cuda memcopy doesn't complete
  // can return errors from previous cuda calls if they haven't been caught
  cufftComplex *out_temp_data;
  size_t num_bytes = (NX/2 + 1)*sizeof(cufftComplex);
  cudaMemcpy(out_temp_data, odata, num_bytes, cudaMemcpyDeviceToHost);
  int error_value = cudaGetLastError();
  printf("cudaMemcpy from device state: %i\n", error_value);
  if(error_value != cudaSuccess) {
    printf("Failed to pull data from device.\n");
    return;
  }

  for (size_t i = 0; i < (NX/2 + 1); i++) {
    printf("%lu %f %f\n", i, out_temp_data[i].x, out_temp_data[i].y);
  }

  // clean up
  cufftDestroy(plan);
  cudaFree(idata);

}


// Program entry point: runs the cuFFT demo once and reports success to the
// shell regardless of what fft1d printed.
int main() {
  fft1d();

  return EXIT_SUCCESS;
}

必须先为 out_temp_data 分配主机内存,cudaMemcpy 才能向其中写入数据;未初始化的指针不是有效的目标地址。感谢 generic-opto-guy 指出这一点。

在这种情况下:

out_temp_data = new cufftComplex[NX/2 + 1];