将 2D thrust::device_vector 复数矩阵传递给 CUDA 核函数

Pass 2D thrust::device_vector Complex Matrix to CUDA kernel function

我是 Cuda 的新手,我正在尝试使用 Cuda 将我现有的项目移动到 GPU。 我的代码基于复杂的矩阵和复杂的缓冲区。

第一步,我尝试将嵌套的For循环代码移至Cuda(其余类似):

     typedef thrust::complex<double> smp_t;

  uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
  smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
  smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));

 // Create matrix.
 thrust::complex<double> i_unit(0.0, 1.0);
 thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);

  // Fill the Matrix
  for (size_t row = 0; row < 8; row++) {
       for (size_t col = 0; col < 8; col++) {
               std::complex<double> tmp =
                      exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
              tw[row].push_back(tmp);
      }
  }

/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
        for (size_t ch = 0; ch < 8; ch++)
                for (size_t k = 0; k < 8; k++)
                        cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}

这是 .cu 文件中将替换当前嵌套 for 循环的代码:

   __global__ void kernel_func(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
    unsigned int ch = threadIdx.x;
    unsigned int k = blockIdx.x;

     for (int x = 0; x < block_size; ++x) {
            unsigned int sig_index = k*block_size+x;
            unsigned int tw_index = ch*k;
            unsigned int cn_index = ch*block_size+x;


            cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
            cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
     }
}

void kernel_wrap(
            smp_t *cnbuf,
            smp_t *sgbuf,
            thrust::host_vector<thrust::host_vector<smp_t>>tw,
            size_t buffer_size) {
    smp_t *d_sgbuf;
    smp_t *d_cnbuf;
    thrust::device_vector<smp_t> d_tw(8*8);
    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());

    cudaMalloc((void **)&d_sgbuf, buffer_size);
    cudaMalloc((void **)&d_cnbuf, buffer_size);

    cudaMemcpy(d_sgbuf, sgbuf, buffer_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(d_cnbuf, cnbuf, buffer_size, cudaMemcpyDeviceToHost);

    thrust::raw_pointer_cast(d_tw.data());

    kernel_func<<<8, 8>>>(
   reinterpret_cast<cuDoubleComplex*>(d_cnbuf),
                    reinterpret_cast<cuDoubleComplex*>(d_sgbuf),
                    thrust::raw_pointer_cast(d_tw.data()),
                    buffer_size
    );

    cudaError_t varCudaError1 = cudaGetLastError();
    if (varCudaError1 != cudaSuccess)
    {
            std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
            exit(EXIT_FAILURE);
    }

    cudaMemcpy(sgbuf, d_sgbuf, buffer_size, cudaMemcpyHostToDevice);
    cudaMemcpy(cnbuf, d_cnbuf, buffer_size, cudaMemcpyHostToDevice);

}

当我运行代码时,我得到错误:

Failed to launch subDelimiterExamine kernel (error code: invalid argument)!

我认为引起麻烦的说法是'd_tw'。 所以,我的问题是:

  1. > 转换为 >(从 2d 矩阵到一个扁平的 arr)?
  2. 在 CUDA 中处理二维复数有更好的方法吗?
  3. 关于 Cuda 中复杂数组的文档非常少,我在哪里可以阅读大量关于 Cuda 复杂矩阵的工作?

谢谢!!!!

有各种各样的问题。我会列出一些,可能会遗漏一些。所以请参考我给出的示例代码以了解更多差异。

  1. 最直接的问题在这里:

    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
    

这就是导致您看到的无效参数错误的原因。在引擎盖下,thrust 将尝试为此使用 cudaMemcpyAsync 操作,因为这本质上是从主机到设备的副本。我们将通过将其替换为普通的 cudaMemcpy 操作来解决此问题,但要了解如何构建它,有必要了解第 2 项。

  1. 您似乎认为向量的向量意味着连续存储。 It does not 并且该声明并非特定于推力。由于 thrust::host_vector 向量(甚至 std::vector 向量)并不意味着连续存储,我们不能轻易构造单个操作,例如 cudaMemcpythrust::copy 到复制此数据。因此,有必要明确地将其展平。

  2. 您在cudaMemcpy操作上的复制方向普遍落后。你应该有 cudaMemcpyHostToDevice 的地方你有 cudaMemcpyDeviceToHost,反之亦然。

  3. CUDA cuComplex.h 头文件早于 thrust,提供了一种快速的 C 风格方法来处理复数。没有它的文档——你必须阅读文件本身并弄清楚如何使用它,这似乎已经完成了。但是,由于您无论如何都在使用 thrust::complex<>,因此使用该编码范例并编写看起来几乎与主机代码完全一样的设备代码要简单得多。

  4. 您的各种传输大小有误。 cudaMemcpy 传输大小 字节

下面是一个示例,由您展示的部分拼凑而成,并进行了各种“修复”。我并没有声称它以任何方式完美或正确,但它避免了我上面概述的问题。此外,根据您使用或使用 -DUSE_KERNEL 定义进行编译的方式,它将 运行 您的“原始”主机代码并显示输出,或者内核代码并显示输出。根据我的测试,输出匹配。

$ cat t1751.cu
#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <cstdint>
#include <cuComplex.h>

typedef thrust::complex<double> smp_t;
__global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
    unsigned int ch = threadIdx.x;
    unsigned int k = blockIdx.x;

     for (int x = 0; x < block_size; ++x) {
            unsigned int sig_index = k*block_size+x;
            unsigned int tw_index = ch*k;
            unsigned int cn_index = ch*block_size+x;


            cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
            cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
     }
}
__global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
    unsigned row = blockIdx.x;
    unsigned col = threadIdx.x;
    unsigned idx = row*block_size+col;
    for (int k = 0; k < 8; k++)
      cnbuf[idx] += sgbuf[k*block_size+col] * tw[row*block_size+k];
}

void kernel_wrap(
            smp_t *cnbuf,
            smp_t *sgbuf,
            thrust::host_vector<thrust::host_vector<smp_t>>tw,
            size_t buffer_size) {
    smp_t *d_sgbuf;
    smp_t *d_cnbuf;
    thrust::device_vector<smp_t> d_tw(8*8);
//    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
    thrust::host_vector<smp_t> htw(buffer_size*buffer_size);
    for (int i = 0; i < buffer_size; i++)
      for (int j = 0; j < buffer_size; j++)
        htw[i*buffer_size + j] = tw[i][j];

    cudaMemcpy(thrust::raw_pointer_cast(d_tw.data()), &htw[0], 8*8*sizeof(smp_t), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_sgbuf, buffer_size*buffer_size*sizeof(smp_t));
    cudaMalloc((void **)&d_cnbuf, buffer_size*buffer_size*sizeof(smp_t));

    cudaMemcpy(d_sgbuf, sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cnbuf, cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);

    thrust::raw_pointer_cast(d_tw.data());

    kernel_func<<<8, 8>>>(d_cnbuf,d_sgbuf,thrust::raw_pointer_cast(d_tw.data()),buffer_size);

    cudaError_t varCudaError1 = cudaGetLastError();
    if (varCudaError1 != cudaSuccess)
    {
            std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
            exit(EXIT_FAILURE);
    }

//    cudaMemcpy(sgbuf, d_sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
    cudaMemcpy(cnbuf, d_cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 8; i++)
      for (int j = 0; j < 8; j++)
        std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
}

int main(){
  const int bufsize = 8;
  const int decfactor = 8;

  uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
  smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
  smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
  memset(cnbuf, 0, 8*bufsize*sizeof(smp_t));
 // Create matrix.
 thrust::complex<double> i_unit(0.0, 1.0);
#ifndef USE_KERNEL
 std::vector<std::vector<smp_t> > tw(decfactor);
#else
 thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
#endif

  // Fill the Matrix
  for (size_t row = 0; row < 8; row++) {
       for (size_t col = 0; col < 8; col++) {
              std::complex<double> tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
              tw[row].push_back(tmp);
      }
  }
  thrust::complex<double> test(1.0, 1.0);
  for (int i = 0; i < 8*8; i++) sgbuf[i]  = test;
#ifndef USE_KERNEL
/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
        for (size_t ch = 0; ch < 8; ch++)
                for (size_t k = 0; k < 8; k++)
                        cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}
    for (int i = 0; i < 8; i++)
      for (int j = 0; j < 8; j++)
        std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
#else

  kernel_wrap(cnbuf,sgbuf,tw,bufsize);
#endif

}
$ nvcc -o t1751 t1751.cu -std=c++11
$ ./t1751 >out_host.txt
$ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
$ ./t1751 >out_device.txt
$ diff out_host.txt out_device.txt
$

请记住,这主要是您的代码,我并不是说它是正确的、没有缺陷的或适合任何特定目的。使用它需要您自担风险。