CUDA error: illegal memory access in cudaDeviceSynchronize and cudaMemcpy

In my code I create the host arrays:

h4_in = (double*)calloc(2 * countlog, sizeof(double));
h4_out = (double*)calloc(23 * countlog, sizeof(double));

countlog is a variable that essentially holds the number of rows of a 2D array (which I have implemented as a 1D array).

//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
    h4_in[count * 2 + 0] = prc[count];
    h4_in[count * 2 + 1] = h_stat1out[count * 6];
}

Below is how I make the CUDA call in the main program:

//free cuda memory from previous call
cudaFree(d3_in);
cudaFree(d3_out);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "cudaDeviceReset Failed :%s\n", cudaGetErrorString(cudaStatus));
}
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
    h4_in[count * 2 + 0] = prc[count];
    h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
//Query device to get parameters
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &threadsPerBlock, calcstats2, 0, countlog);
// Round up according to array size 
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
//allocate memory on gpu
cudaStatus = cudaMalloc((void **)&d4_in, 2 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_in :%s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaMalloc((void **)&d4_out, 23 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_out :%s\n", cudaGetErrorString(cudaStatus));
}
//transfer array to gpu
cudaStatus = cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudaGetErrorString(cudaStatus));
}
//launch threads
calcstats2 <<<blocksPerGrid, threadsPerBlock>>>(d4_out, d4_in, countlog);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "calcstats2 kernel failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Device Sync failed: %s\n", cudaGetErrorString(cudaStatus));
}
//transfer data back to host
cudaStatus = cudaMemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudaGetErrorString(cudaStatus));
}
//free cuda
cudaFree(d4_in);
cudaFree(d4_out);

The kernel is as follows:

__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
    int idx = blockDim.x*blockIdx.x + threadIdx.x;
    double X, A, B, C, D, BX, BA, BB, BC;
    if (idx < 4)
    {
        d4_out[idx * 23 + 0] = -1;
        d4_out[idx * 23 + 1] = -1;
        d4_out[idx * 23 + 2] = -1;
        d4_out[idx * 23 + 3] = -1;
        d4_out[idx * 23 + 4] = -1;
        d4_out[idx * 23 + 5] = -1;
        d4_out[idx * 23 + 6] = -1;
        d4_out[idx * 23 + 7] = -1;
        d4_out[idx * 23 + 8] = -1;
        d4_out[idx * 23 + 9] = -1;
        d4_out[idx * 23 + 10] = -1;
        d4_out[idx * 23 + 11] = -1;
        d4_out[idx * 23 + 12] = -1;
        d4_out[idx * 23 + 13] = -1;
        d4_out[idx * 23 + 14] = -1;
        d4_out[idx * 23 + 15] = -1;
        d4_out[idx * 23 + 16] = -1;
        d4_out[idx * 23 + 17] = -1;
        d4_out[idx * 23 + 18] = -1;
        d4_out[idx * 23 + 19] = -1;
        d4_out[idx * 23 + 20] = -1;
        d4_out[idx * 23 + 21] = -1;
        d4_out[idx * 23 + 22] = -1;
    }
    else
    {
        X = d4_in[idx * 2 - 8];
        A = d4_in[idx * 2 - 6];
        B = d4_in[idx * 2 - 4];
        C = d4_in[idx * 2 - 2];
        D = d4_in[idx * 2 - 0];
        BX = d4_in[idx * 2 - 5];
        BA = d4_in[idx * 2 - 3];
        BB = d4_in[idx * 2 - 1];
        BC = d4_in[idx * 2 + 1];
        //start the stats calcs here
        d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
        d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
        d4_out[idx * 23 + 2] = fabs(B - D) / fabs(C - B);
        d4_out[idx * 23 + 3] = fabs(C - D) / fabs(C - B);
        d4_out[idx * 23 + 4] = fabs(B - D) / fabs(A - B);
        d4_out[idx * 23 + 5] = fabs(A - D) / fabs(A - B);
        d4_out[idx * 23 + 6] = fabs(X - C) / fabs(A - X);
        d4_out[idx * 23 + 7] = fabs(A - C) / fabs(A - X);
        d4_out[idx * 23 + 8] = fabs(C - B) / fabs(A - B);
        d4_out[idx * 23 + 9] = fabs(A - B) / fabs(A - X);
        d4_out[idx * 23 + 10] = fabs(C - D) / fabs(A - B);
        d4_out[idx * 23 + 11] = fabs(C - D) / fabs(A - X);
        d4_out[idx * 23 + 12] = fabs(C - B) / fabs(A - X);
        d4_out[idx * 23 + 13] = BC;
        d4_out[idx * 23 + 14] = BB;
        d4_out[idx * 23 + 15] = BA;
        d4_out[idx * 23 + 16] = BX;
        d4_out[idx * 23 + 17] = BB + BC;
        d4_out[idx * 23 + 18] = BA + BB + BC;
        d4_out[idx * 23 + 19] = BX + BA + BB + BC;
        d4_out[idx * 23 + 20] = BA + BB;
        d4_out[idx * 23 + 21] = BX + BA + BB;
        d4_out[idx * 23 + 22] = BX + BA;
    }
}

I am getting an error in the device-to-host cudaMemcpy and in cudaDeviceSynchronize saying that an illegal memory access was encountered. Following earlier help on Stack Overflow I corrected my code to use 1D arrays, and I have allocated the same amount of memory for the host and device arrays. The strange things are:

  1. The program runs successfully on smaller files (the input is OHLC data) but throws this error on larger files.

  2. Even for the larger files, 3 other kernel calls run successfully without any issue.

Any help would be greatly appreciated.

Thanks in advance,

Abhishek

PS: I am using a single GTX 760 card with 2 GB of memory (ASUS, vendor page: https://www.asus.com/Graphics-Cards/GTX760DC2OC2GD5/). The CUDA version is 7 and the IDE is VS 2013.

You are (potentially) launching more threads than you actually need:

 blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;

and there is no thread check in your kernel to guard against this. Threads with an index at or above countlog will access your arrays out of bounds.

Try changing the else statement in your kernel to:

else if (idx < size)
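
To make the change concrete, here is a minimal sketch of the resulting control flow (only the guard is new; the bodies of both branches are unchanged from the kernel above and are abbreviated to comments):

__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
    int idx = blockDim.x*blockIdx.x + threadIdx.x;
    if (idx < 4)
    {
        // ... the 23 "-1" sentinel writes, exactly as in the original kernel ...
    }
    else if (idx < size)
    {
        // ... the 9 reads from d4_in and the 23 stats writes to d4_out, unchanged ...
    }
    // threads with idx >= size (the extras created by rounding up blocksPerGrid) do nothing
}

Because blocksPerGrid is rounded up, the last block can contain up to threadsPerBlock - 1 threads with idx >= countlog; without the guard each of them reads past the end of d4_in and writes 23 doubles past the end of d4_out. Such out-of-bounds accesses are only reported when they happen to touch memory the driver considers invalid, which can explain why smaller inputs appeared to run cleanly while larger ones failed. And since kernel launches are asynchronous, the fault raised inside calcstats2 only surfaces at the next synchronizing call, which is why you see it in cudaDeviceSynchronize and the device-to-host cudaMemcpy rather than at the launch itself.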