递归使用自实现 cuIDFT.cu 导致每次重新运行代码时都会更改输出

Question

我已经实现了反离散余弦 t运行sform (IDCT) 的 CUDA 版本，通过 "translating" MATLAB 内置函数 idct.m 到 CUDA:

我的实现是 cuIDCT.cu，当 m = n 以及 m 和 n[ 时都有效=100=]是偶数。

cuIDCT.cu

#include <stdio.h> #include <stdlib.h> #include <cuda.h> #include <cufft.h> #include <cuComplex.h> // round up n/m inline int iDivUp(int n, int m) { return (n + m - 1) / m; } typedef cufftComplex complex; #define PI 3.1415926535897932384626433832795028841971693993751 __global__ void idct_ComputeWeightsKernel(const int n, complex *ww) { const int pos = threadIdx.x + blockIdx.x * blockDim.x; if (pos >= n) return; ww[pos].x = sqrtf(2*n) * cosf(pos*PI/(2*n)); ww[pos].y = sqrtf(2*n) * sinf(pos*PI/(2*n)); } __global__ void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y) { const int ix = threadIdx.x + blockIdx.x * blockDim.x; const int iy = threadIdx.y + blockIdx.y * blockDim.y; if (ix >= n || iy >= m) return; const int pos = ix + iy*n; // Compute precorrection factor ww[0].x = ww[0].x / sqrtf(2); ww[0].y = ww[0].y / sqrtf(2); y[iy + ix*m].x = ww[iy].x * b[pos]; y[iy + ix*m].y = ww[iy].y * b[pos]; } __global__ void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy) { const int ix = threadIdx.x + blockIdx.x * blockDim.x; const int iy = threadIdx.y + blockIdx.y * blockDim.y; if (ix >= n || iy >= m) return; const int pos = ix + iy*n; yy[iy + ix*n].x = y[pos].x / (float) n; yy[iy + ix*n].y = y[pos].y / (float) n; } __global__ void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a) { const int ix = threadIdx.x + blockIdx.x * blockDim.x; const int iy = threadIdx.y + blockIdx.y * blockDim.y; if (ix >= n || iy >= m) return; const int pos = ix + iy*n; // Re-order elements of each column according to equations (5.93) and (5.94) in Jain if (iy < n/2) { a[ix + 2*iy*n] = yy[pos].x; a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x; } } /** * a = idct(b), where a is of size [n m]. * @param b, input array * @param n, first dimension of a * @param m, second dimension of a * @param a, output array */ void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m] { const int data_size = n * m * sizeof(float); // device memory allocation float *d_in, *d_out; cudaMalloc(&d_in, data_size); cudaMalloc(&d_out, data_size); // transfer data from host to device cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice); // compute IDCT using CUDA // begin============================================ // Compute weights complex *ww; cudaMalloc(&ww, n*sizeof(complex)); dim3 threads(256); dim3 blocks(iDivUp(n, threads.x)); idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww); complex *y; complex *yy; cufftHandle plan; dim3 threads1(32, 6); dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case int Length[1] = {m}; // for each IFFT, the length is m cudaMalloc(&y, n*m*sizeof(complex)); idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y); cufftPlanMany(&plan, 1, Length, Length, 1, m, Length, 1, m, CUFFT_C2C, n); cufftExecC2C(plan, y, y, CUFFT_INVERSE); // y is of size [n m] cudaMalloc(&yy, n*m*sizeof(complex)); Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy); Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out); // end============================================ // transfer result from device to host cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost); // cleanup cufftDestroy(plan); cudaFree(ww); cudaFree(y); cudaFree(yy); cudaFree(d_in); cudaFree(d_out); }

然后我使用以下代码将我的 CUDA IDCT（即 cuIDCT.cu）的结果与 MATLAB idct.m 的结果进行了比较：

一个测试main.cpp函数，并且

一个 MATLAB 主函数 main.m 从 CUDA 读取结果并将其与 MATLAB 进行比较。

main.cpp

#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <helper_functions.h> #include <stdlib.h> #include <stdio.h> // N must equal to M, and both must be even numbers #define N 256 #define M 256 void WriteDataFile(const char *name, int w, int h, const float *in, const float *out) { FILE *stream; stream = fopen(name, "wb"); float data = 202021.25f; fwrite(&data, sizeof(float), 1, stream); fwrite(&w, sizeof(w), 1, stream); fwrite(&h, sizeof(h), 1, stream); for (int i = 0; i < h; i++) for (int j = 0; j < w; j++) { const int pos = j + i * h; fwrite(in + pos, sizeof(float), 1, stream); fwrite(out + pos, sizeof(float), 1, stream); } fclose(stream); } void cuIDCT(float *b, int n, int m, float *a); int main() { // host memory allocation float *h_in = new float [N * M]; float *h_out = new float [N * M]; float *h_temp = new float [N * M]; // input data initialization for (int i = 0; i < N * M; i++) { h_in[i] = (float)rand()/(float)RAND_MAX; h_out[i] = h_in[i]; h_temp[i] = h_in[i]; } // please comment either one case for testing // test case 1: use cuIDCT.cu once // cuIDCT(h_in, N, M, h_out); // test case 2: iteratively use cuIDCT.cu for (int i = 0; i < 4; i++) { if (i % 2 == 0) cuIDCT(h_out, N, M, h_temp); else cuIDCT(h_temp, N, M, h_out); } // write data, for further visualization using MATLAB WriteDataFile("test.flo", N, M, h_in, h_out); // cleanup delete [] h_in; delete [] h_out; delete [] h_temp; cudaDeviceReset(); }

main.m

clc;clear; % read [h_in, h_out] = read_data('test.flo'); % MATLAB result, for test case 1, comment the for-loop matlab_out = h_in; for i = 1:4 matlab_out = idct(matlab_out); end % compare err = matlab_out - h_out; % show figure(1); subplot(221); imshow(h_in, []); title('h\_in'); colorbar subplot(222); imshow(h_out, []); title('h\_out'); colorbar subplot(223); imshow(matlab_out, []); title('matlab\_out'); colorbar subplot(224); imshow(err, []); title('error map'); colorbar disp(['maximum error between CUDA and MATLAB is ' ... num2str(max(max(abs(err))))])

我运行 Visual Studio 11（即 VS2012）在 Windows 7 上的代码使用 Nvidia GPU Tesla K20c，使用 CUDA 工具包版本 7.5，我的 MATLAB 版本是 R2015b。

我的测试步骤:

对于测试用例 1。取消注释测试用例 1 并注释测试用例 2。

运行 main.cpp.

运行 main.m 在 MATLAB 中。

重复步骤 1 和步骤 2（没有任何变化，只需重新运行代码）。

我重复了步骤 3 20 次。输出结果不变，main.m中的结果为：

results of test case 1

最大误差为 7.7152e-07。

对于测试用例 2。取消注释测试用例 2 并注释测试用例 1。

运行 main.cpp.

运行 main.m 在 MATLAB 中。

重复步骤 1 和步骤 2（没有任何变化，只需重新运行代码）。

我重复了步骤 3 20 次。输出结果改了，main.m中的结果是（not enough reputation to put all images, only wrong case is showed below）：

one situation (the wrong one) of test case 2

最大误差为0.45341（2次）、0.44898（1次）、0.26186（1次）、0.26301（1次）、9.5716e-07（15次）。

从测试结果来看，我的结论是：

从测试用例 1：cuIDCT.cu 在数字上是正确的（错误 ~10^-7）到 idct.m。

来自测试用例 2：递归使用 cuIDCT.cu 导致结果不稳定（即每次重新运行代码时输出都会改变，有时可能在数字上是错误的，误差 ~0.1）

我的问题：

从测试用例 1 中我们知道 cuIDCT.cu 在数值上与 idct.m 是正确的。但是为什么每次重新运行代码时递归使用cuIDCT.cu会导致不同的输出结果？

非常感谢任何帮助或建议。

Answer 1

我相信由于您 idct_ComputeEvenKernel:

中的这段代码，您的结果会出现变化

// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);

目前还不完全清楚您的意图是什么，但是这段代码是否可以按照您的意愿行事值得怀疑。您可能对 CUDA 执行模型感到困惑。

以上代码将由您为通过线程检查的内核启动的每个 CUDA 线程执行：

if (ix >= n || iy >= m) return;

我相信这意味着 65536 个线程将在该内核中全部执行这段代码。此外，线程将以 more-or-less 任何顺序执行该代码（并非所有 CUDA 线程都以 lock-step 执行）。他们甚至可能在试图将他们的值写到位置 ww[0] 时互相踩到对方。所以 ww[0] 中的最终结果将是相当不可预测的。

当我注释掉这些代码行时，结果对我来说变得稳定（尽管与那些行的结果不同），与运行运行.[=33 保持不变=]

我想指出其他问题。无论您在哪里计算复数的 .x 和 .y 值，我的建议都是重新编写代码（例如）：

y[iy + ix*m].x = ww[iy].x * b[pos];
y[iy + ix*m].y = ww[iy].y * b[pos];

对此：

complex temp1, temp2;
temp1 = ww[iy];
temp2.x = temp1.x * b[pos];
temp2.y = temp2.y * b[pos];
y[iy + ix*m] = temp2;

至少根据我的测试，编译器似乎没有为您进行这种优化，side-effect 的一个好处是使用 cuda-memcheck --tool initcheck ... 测试您的代码要容易得多。在第一个实现中，编译器将加载 y[iy + ix*m] 作为一个 8 字节的量，修改它的 4 或 8 个字节，然后将 y[iy + ix*m] 作为一个 8 字节的量存储。第二种实现应该更有效（它消除了 y[] 的负载），并消除了未初始化数量 (y[]) 的负载，cuda-memcheck 工具会将其报告为危险。

无论您是运行代码的 1 遍版本还是代码的 4 遍版本，我所描述的这种可变性都应该是可能的。因此，我认为您关于 1-pass 版本正确的说法是值得怀疑的。我认为如果你运行 1-pass 版本足够，你最终会看到可变性（尽管它可能需要不同的初始内存条件，或者运行在不同的 GPU 类型上）。即使在您自己的结果中，我们也看到 4 遍代码的 20 个运行中有 15 个产生 "correct" 结果，即残差为 ~1e-7

这是我修改后的 cuIDCT.cu 文件，根据您发布的版本 here 进行了修改。我在下面做的假设是您只想计算 ww[0] 上的缩放一次，在这种情况下，我们可以轻松地将该算术作为先前 idct_ComputeWeightsKernel:

的附录来处理

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
#include <helper_cuda.h>
#include "assert.h"

// round up n/m
inline int iDivUp(int n, int m)
{
    return (n + m - 1) / m;
}

typedef cufftComplex complex;

#define PI 3.1415926535897932384626433832795028841971693993751

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
            _cudaGetErrorEnum(err)); \
            cudaDeviceReset(); assert(0); \
    }
}


__global__
    void idct_ComputeWeightsKernel(const int n, complex *ww)
{
    const int pos = threadIdx.x + blockIdx.x * blockDim.x;

    if (pos >= n) return;
    complex temp;
    temp.x =  sqrtf(2*n) * cosf(pos*PI/(2*n));
    temp.y =  sqrtf(2*n) * sinf(pos*PI/(2*n));
    if (pos == 0) {
      temp.x /= sqrtf(2);
      temp.y /= sqrtf(2);}
    ww[pos] = temp;
}

__global__
    void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;
/*  handle this in idct_ComputeWeightsKernel
    // Compute precorrection factor
    ww[0].x = ww[0].x / sqrtf(2);
    ww[0].y = ww[0].y / sqrtf(2);
*/
    complex temp1, temp2;
    temp1 = ww[iy];
    temp2.x = temp1.x * b[pos];
    temp2.y = temp1.y * b[pos];
    y[iy + ix*m] = temp2;
}

__global__
    void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;
    complex temp1, temp2;
    temp1 = y[pos];
    temp2.x = temp1.x / (float) n;
    temp2.y = temp1.y / (float) n;
    yy[iy + ix*n] = temp2;
}

__global__
    void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

    // Re-order elements of each column according to equations (5.93) and (5.94) in Jain
    if (iy < n/2)
    {
        a[ix + 2*iy*n]     = yy[pos].x;
        a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
    }
}

/**
* a = idct(b), where a is of size [n m].
* @param b, input array
* @param n, first dimension of a
* @param m, second dimension of a
* @param a, output array
*/
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
    const int data_size   = n * m * sizeof(float);

    // device memory allocation
    float *d_in, *d_out;
    checkCudaErrors(cudaMalloc(&d_in,  data_size));
    checkCudaErrors(cudaMalloc(&d_out, data_size));

    // transfer data from host to device
    checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice));

    // compute IDCT using CUDA
    // begin============================================
    // Compute weights
    complex *ww;
    checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex)));
    dim3 threads(256);
    dim3 blocks(iDivUp(n, threads.x));
    idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);

    complex *y;
    complex *yy;
    cufftHandle plan;

    dim3 threads1(32, 6);
    dim3 blocks2(iDivUp(n,   threads1.x), iDivUp(m, threads1.y)); // for even case

    int Length[1] = {m}; // for each IFFT, the length is m

    checkCudaErrors(cudaMalloc(&y,  n*m*sizeof(complex)));

    idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
    cufftSafeCall(cufftPlanMany(&plan, 1, Length,
                                Length, 1, m,
                                Length, 1, m, CUFFT_C2C, n));
    cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m]

    checkCudaErrors(cudaMalloc(&yy,  n*m*sizeof(complex)));
    Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
    cudaMemset(d_out, 0, data_size);
    Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
    // end============================================

    // transfer result from device to host
    checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost));


    // cleanup
    cufftDestroy(plan);
    checkCudaErrors(cudaFree(ww));
    checkCudaErrors(cudaFree(y));
    checkCudaErrors(cudaFree(yy));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
}

你会注意到我在 d_out 上添加了一个额外的 cudaMemset，因为它帮助我解决了 cuda-memcheck --tool initcheck ... 的问题。应该没必要，想删也可以。

递归使用自实现 cuIDFT.cu 导致每次重新运行代码时都会更改输出

Recursively use of self-implemented cuIDFT.cu leads to changing output every time when re-runing the code

matlab

cuda

gpu

dct

cufft