CUDA error: illegal memory access encountered in cudaDeviceSynchronize and cudaMemcpy
In my code I create the host arrays:
h4_in = (double*)calloc(2 * countlog, sizeof(double));
h4_out = (double*)calloc(23 * countlog, sizeof(double));
countlog is a variable that holds the row count of a 2D array, which I have implemented as a flattened 1D array.
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
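To make the flattening explicit: element (row, col) of a row-major N-column matrix lives at index row * N + col. A minimal sketch of that convention (the helper name flat_index is mine, not from the question's code):

__host__ __device__ inline int flat_index(int row, int col, int ncols)
{
    return row * ncols + col;   // row-major: each row is a contiguous block of ncols values
}

With ncols = 2, the fill loop above is equivalent to writing prc[count] to flat_index(count, 0, 2) and h_stat1out[count * 6] to flat_index(count, 1, 2).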
Below is how I call CUDA from my main program:
//free cuda memory from previous call
cudaFree(d3_in);
cudaFree(d3_out);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset Failed :%s\n", cudaGetErrorString(cudaStatus));
}
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
//Query device to get parameters
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &threadsPerBlock, calcstats2, 0, countlog);
// Round up according to array size
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
//allocate memory on gpu
cudaStatus = cudaMalloc((void **)&d4_in, 2 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_in :%s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaMalloc((void **)&d4_out, 23 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_out :%s\n", cudaGetErrorString(cudaStatus));
}
//transfer array to gpu
cudaStatus = cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudaGetErrorString(cudaStatus));
}
//launch threads
calcstats2 <<<blocksPerGrid, threadsPerBlock>>>(d4_out, d4_in, countlog);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "calcstats2 kernel failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Device Sync failed: %s\n", cudaGetErrorString(cudaStatus));
}
//transfer data back to host
cudaStatus = cudaMemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudaGetErrorString(cudaStatus));
}
//free cuda
cudaFree(d4_in);
cudaFree(d4_out);
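As an aside, the repeated status checks around every CUDA call can be collapsed into one helper. A minimal sketch, assuming aborting on the first failure is acceptable (the macro name CHECK_CUDA is my own, not part of the CUDA API):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Evaluate a CUDA runtime call; on failure print file/line plus the error
// string and abort, so each call no longer needs its own if-block.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

Usage would look like CHECK_CUDA(cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice));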
The kernel is as follows:
__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
int idx = blockDim.x*blockIdx.x + threadIdx.x;
double X, A, B, C, D, BX, BA, BB, BC;
if (idx < 4)
{
// first four rows lack a full look-back window; flag all 23 outputs as invalid
for (int k = 0; k < 23; k++)
d4_out[idx * 23 + k] = -1;
}
else
{
X = d4_in[idx * 2 - 8];
A = d4_in[idx * 2 - 6];
B = d4_in[idx * 2 - 4];
C = d4_in[idx * 2 - 2];
D = d4_in[idx * 2 - 0];
BX = d4_in[idx * 2 - 5];
BA = d4_in[idx * 2 - 3];
BB = d4_in[idx * 2 - 1];
BC = d4_in[idx * 2 + 1];
//start the stats calcs here
d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
d4_out[idx * 23 + 2] = fabs(B - D) / fabs(C - B);
d4_out[idx * 23 + 3] = fabs(C - D) / fabs(C - B);
d4_out[idx * 23 + 4] = fabs(B - D) / fabs(A - B);
d4_out[idx * 23 + 5] = fabs(A - D) / fabs(A - B);
d4_out[idx * 23 + 6] = fabs(X - C) / fabs(A - X);
d4_out[idx * 23 + 7] = fabs(A - C) / fabs(A - X);
d4_out[idx * 23 + 8] = fabs(C - B) / fabs(A - B);
d4_out[idx * 23 + 9] = fabs(A - B) / fabs(A - X);
d4_out[idx * 23 + 10] = fabs(C - D) / fabs(A - B);
d4_out[idx * 23 + 11] = fabs(C - D) / fabs(A - X);
d4_out[idx * 23 + 12] = fabs(C - B) / fabs(A - X);
d4_out[idx * 23 + 13] = BC;
d4_out[idx * 23 + 14] = BB;
d4_out[idx * 23 + 15] = BA;
d4_out[idx * 23 + 16] = BX;
d4_out[idx * 23 + 17] = BB + BC;
d4_out[idx * 23 + 18] = BA + BB + BC;
d4_out[idx * 23 + 19] = BX + BA + BB + BC;
d4_out[idx * 23 + 20] = BA + BB;
d4_out[idx * 23 + 21] = BX + BA + BB;
d4_out[idx * 23 + 22] = BX + BA;
}
}
I am getting an error in the device-to-host cudaMemcpy and in cudaDeviceSynchronize saying that an illegal memory access was encountered. Following earlier help on Stack Overflow I corrected my code to use flattened 1D arrays, and I allocate the same amount of memory for the host and device arrays. The strange part is that the program runs successfully on smaller files (the input is OHLC data) but fails with this error on larger files. Even for the larger files, three other kernel calls run successfully without any problem.
Any help would be greatly appreciated.
Thanks in advance,
Abhishek
PS: I am running a single GTX 760 card with 2 GB of memory (ASUS, vendor page: https://www.asus.com/Graphics-Cards/GTX760DC2OC2GD5/). The CUDA version is 7 and the IDE is VS 2013.
You are (probably) launching more threads than you actually need:
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
and there is no corresponding thread check in your kernel. Threads numbered countlog and above index past the end of your arrays. For example, with countlog = 1000 and threadsPerBlock = 256, the round-up gives 4 blocks, i.e. 1024 threads, so the last 24 threads read and write out of bounds. Whether such an access actually faults depends on where the allocations happen to end, which is why smaller inputs can appear to run fine while larger ones crash.
Try changing the else statement in your kernel to:
else if (idx < size)
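A compressed sketch of the kernel with that guard in place; the -1 initialisation is shown as a loop and only the first two statistics are written out, the remaining 21 assignments being identical to the question:

__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < 4)
    {
        for (int k = 0; k < 23; k++)      // first four rows lack a full look-back window
            d4_out[idx * 23 + k] = -1;
    }
    else if (idx < size)                  // guard: surplus threads past the data do nothing
    {
        double X = d4_in[idx * 2 - 8];
        double A = d4_in[idx * 2 - 6];
        double D = d4_in[idx * 2];
        d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
        d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
        // ... remaining loads and assignments exactly as in the question
    }
}

Running the program under cuda-memcheck is a quick way to confirm whether any out-of-bounds accesses remain after the change.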