Cuda GPUassert:遇到非法内存访问
Cuda GPUassert: an illegal memory access was encountered
我试图使用 __device__ 变量(而不是用 cudaMalloc 动态分配内存)来编写一个玩具程序,但它一直报错 "GPUassert: an illegal memory access was encountered"(遇到非法内存访问),出错位置是倒数第三行对 cudaDeviceSynchronize() 的调用。我也试过使用 cudaMalloc 的版本,运行结果完全正常。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and reports any failure together with the source
// location of the call site. The do { ... } while (0) form makes the macro
// expand to exactly one statement, so it is safe inside an un-braced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Inspects the status returned by a CUDA runtime call.
//
// code  - status value to check.
// file  - source file of the call site, printed in the diagnostic.
// line  - source line of the call site, printed in the diagnostic.
// abort - when true (the default) the process exits with `code` as its exit
//         status after the diagnostic is printed; when false the function
//         only prints and returns to the caller.
//
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Matrix dimensions: A is M x K, B is K x N and C is M x N.
#define M 3
#define N 3
#define K 3
using namespace std;
// Statically declared __device__ storage for the three matrices, each kept as
// a flat array. Host code moves data in and out with cudaMemcpyToSymbol /
// cudaMemcpyFromSymbol.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C = A * B for row-major matrices stored as flat arrays:
// A is m x k, B is k x n and C is m x n.
//
// Launch layout: 2D; the thread with global index (x, y) produces the element
// in row x, column y of C. Threads whose index lies outside the m x n output
// return immediately, so the launch may be larger than the matrix.
//
// A, B and C must be pointers that are valid to dereference in device code.
// Each computed element is also printed with device-side printf (debug aid).
__global__ void gemm(double* A, double* B, double* C, int m, int n, int k)
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;

    // Bounds guard: without it any surplus thread would read and write past
    // the end of the matrices.
    if (row >= m || col >= n)
        return;

    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A[row * k + j] * B[j * n + col];
    }

    const int idx = row * n + col;
    C[idx] = sum;
    printf("The value is %f", C[idx]);
}
// Host driver: fills the host matrices, copies them into the __device__
// arrays, launches gemm on a single 3 x 3 block and copies C back to the host.
int main(void)
{
double A_h[M * K];
double B_h[K * N];
double C_h[M * N];
// A single loop bounded by M*K initialises all three arrays; this stays in
// bounds only because M, N and K are all equal here.
for (int i = 0; i < M*K; i++)
{
A_h[i] = (double)i;
B_h[i] = (double)i;
C_h[i] = 0.0;
}
// Copy the host data into the statically declared device arrays.
gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// One block of 3 x 3 threads: one thread per element of C.
dim3 dimGrid(1, 1, 1);
dim3 dimBlock(3, 3, 1);
// NOTE(review): A_dev, B_dev and C_dev are __device__ symbols. Naming them in
// host code does not produce a pointer the kernel can dereference, so this
// launch is what triggers the reported "illegal memory access". Either use the
// symbols directly inside the kernel, or obtain device pointers first with
// cudaGetSymbolAddress() and pass those instead.
gemm <<<dimGrid, dimBlock >>> (A_dev, B_dev, C_dev, 3, 3, 3);
gpuErrchk(cudaPeekAtLastError());
// The kernel fault surfaces here, at the first synchronizing call after the
// launch.
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
return 0;
}
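当使用 __device__ 变量时,它们本质上处于全局作用域内,因此不需要(也不应该)把它们作为内核参数传递。在主机代码中直接写 A_dev 得到的并不是可以在设备上解引用的地址,所以内核一旦解引用这些参数就会触发非法内存访问;如果确实需要以指针形式传递,应先用 cudaGetSymbolAddress() 取得设备地址。您可以直接在内核代码中使用这些变量,而无需为它们设置内核参数。
如果您对代码进行以下更改,它将正常运行而不再报错: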
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and reports any failure together with the source
// location of the call site. The do { ... } while (0) form makes the macro
// expand to exactly one statement, so it is safe inside an un-braced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Inspects the status returned by a CUDA runtime call.
//
// code  - status value to check.
// file  - source file of the call site, printed in the diagnostic.
// line  - source line of the call site, printed in the diagnostic.
// abort - when true (the default) the process exits with `code` as its exit
//         status after the diagnostic is printed; when false the function
//         only prints and returns to the caller.
//
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Matrix dimensions: A is M x K, B is K x N and C is M x N.
#define M 3
#define N 3
#define K 3
using namespace std;
// Statically declared __device__ storage for the three matrices, each kept as
// a flat array. The kernel reads and writes these symbols directly; host code
// moves data in and out with cudaMemcpyToSymbol / cudaMemcpyFromSymbol.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C_dev = A_dev * B_dev, accessing the file-scope __device__ arrays
// directly instead of receiving them as kernel arguments. The matrices are
// row-major flat arrays: A_dev is m x k, B_dev is k x n and C_dev is m x n.
//
// Launch layout: 2D; the thread with global index (x, y) produces the element
// in row x, column y of C_dev. Threads whose index lies outside the m x n
// output return immediately, so the launch may be larger than the matrix.
//
// Precondition: m*k, k*n and m*n must not exceed the declared sizes of A_dev,
// B_dev and C_dev respectively.
// Each computed element is also printed with device-side printf (debug aid).
__global__ void gemm(int m, int n, int k)
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;

    // Bounds guard: without it any surplus thread would index past the end of
    // the fixed-size device arrays.
    if (row >= m || col >= n)
        return;

    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A_dev[row * k + j] * B_dev[j * n + col];
    }

    const int idx = row * n + col;
    C_dev[idx] = sum;
    printf("The value is %f", C_dev[idx]);
}
// Host driver: fills the host matrices, copies them into the __device__
// arrays, launches gemm with one thread per element of C and copies the
// result back into C_h.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];

    // Initialise each array over its own length so the code stays in bounds
    // even when M, N and K are not all equal.
    for (int i = 0; i < M * K; i++)
        A_h[i] = (double)i;
    for (int i = 0; i < K * N; i++)
        B_h[i] = (double)i;
    for (int i = 0; i < M * N; i++)
        C_h[i] = 0.0;

    // Copy the host data into the statically declared device arrays.
    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // One block of M x N threads: thread (x, y) computes row x, column y of C.
    // The launch shape and kernel arguments are derived from the dimension
    // constants rather than repeated as literals.
    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(M, N, 1);
    gemm<<<dimGrid, dimBlock>>>(M, N, K);
    gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
    return 0;
}
我试图使用 __device__ 变量(而不是用 cudaMalloc 动态分配内存)来编写一个玩具程序,但它一直报错 "GPUassert: an illegal memory access was encountered"(遇到非法内存访问),出错位置是倒数第三行对 cudaDeviceSynchronize() 的调用。我也试过使用 cudaMalloc 的版本,运行结果完全正常。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and reports any failure together with the source
// location of the call site. The do { ... } while (0) form makes the macro
// expand to exactly one statement, so it is safe inside an un-braced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Inspects the status returned by a CUDA runtime call.
//
// code  - status value to check.
// file  - source file of the call site, printed in the diagnostic.
// line  - source line of the call site, printed in the diagnostic.
// abort - when true (the default) the process exits with `code` as its exit
//         status after the diagnostic is printed; when false the function
//         only prints and returns to the caller.
//
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Matrix dimensions: A is M x K, B is K x N and C is M x N.
#define M 3
#define N 3
#define K 3
using namespace std;
// Statically declared __device__ storage for the three matrices, each kept as
// a flat array. Host code moves data in and out with cudaMemcpyToSymbol /
// cudaMemcpyFromSymbol.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C = A * B for row-major matrices stored as flat arrays:
// A is m x k, B is k x n and C is m x n.
//
// Launch layout: 2D; the thread with global index (x, y) produces the element
// in row x, column y of C. Threads whose index lies outside the m x n output
// return immediately, so the launch may be larger than the matrix.
//
// A, B and C must be pointers that are valid to dereference in device code.
// Each computed element is also printed with device-side printf (debug aid).
__global__ void gemm(double* A, double* B, double* C, int m, int n, int k)
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;

    // Bounds guard: without it any surplus thread would read and write past
    // the end of the matrices.
    if (row >= m || col >= n)
        return;

    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A[row * k + j] * B[j * n + col];
    }

    const int idx = row * n + col;
    C[idx] = sum;
    printf("The value is %f", C[idx]);
}
// Host driver: fills the host matrices, copies them into the __device__
// arrays, launches gemm on a single 3 x 3 block and copies C back to the host.
int main(void)
{
double A_h[M * K];
double B_h[K * N];
double C_h[M * N];
// A single loop bounded by M*K initialises all three arrays; this stays in
// bounds only because M, N and K are all equal here.
for (int i = 0; i < M*K; i++)
{
A_h[i] = (double)i;
B_h[i] = (double)i;
C_h[i] = 0.0;
}
// Copy the host data into the statically declared device arrays.
gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// One block of 3 x 3 threads: one thread per element of C.
dim3 dimGrid(1, 1, 1);
dim3 dimBlock(3, 3, 1);
// NOTE(review): A_dev, B_dev and C_dev are __device__ symbols. Naming them in
// host code does not produce a pointer the kernel can dereference, so this
// launch is what triggers the reported "illegal memory access". Either use the
// symbols directly inside the kernel, or obtain device pointers first with
// cudaGetSymbolAddress() and pass those instead.
gemm <<<dimGrid, dimBlock >>> (A_dev, B_dev, C_dev, 3, 3, 3);
gpuErrchk(cudaPeekAtLastError());
// The kernel fault surfaces here, at the first synchronizing call after the
// launch.
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
return 0;
}
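当使用 __device__ 变量时,它们本质上处于全局作用域内,因此不需要(也不应该)把它们作为内核参数传递。在主机代码中直接写 A_dev 得到的并不是可以在设备上解引用的地址,所以内核一旦解引用这些参数就会触发非法内存访问;如果确实需要以指针形式传递,应先用 cudaGetSymbolAddress() 取得设备地址。您可以直接在内核代码中使用这些变量,而无需为它们设置内核参数。
如果您对代码进行以下更改,它将正常运行而不再报错: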
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and reports any failure together with the source
// location of the call site. The do { ... } while (0) form makes the macro
// expand to exactly one statement, so it is safe inside an un-braced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Inspects the status returned by a CUDA runtime call.
//
// code  - status value to check.
// file  - source file of the call site, printed in the diagnostic.
// line  - source line of the call site, printed in the diagnostic.
// abort - when true (the default) the process exits with `code` as its exit
//         status after the diagnostic is printed; when false the function
//         only prints and returns to the caller.
//
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Matrix dimensions: A is M x K, B is K x N and C is M x N.
#define M 3
#define N 3
#define K 3
using namespace std;
// Statically declared __device__ storage for the three matrices, each kept as
// a flat array. The kernel reads and writes these symbols directly; host code
// moves data in and out with cudaMemcpyToSymbol / cudaMemcpyFromSymbol.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C_dev = A_dev * B_dev, accessing the file-scope __device__ arrays
// directly instead of receiving them as kernel arguments. The matrices are
// row-major flat arrays: A_dev is m x k, B_dev is k x n and C_dev is m x n.
//
// Launch layout: 2D; the thread with global index (x, y) produces the element
// in row x, column y of C_dev. Threads whose index lies outside the m x n
// output return immediately, so the launch may be larger than the matrix.
//
// Precondition: m*k, k*n and m*n must not exceed the declared sizes of A_dev,
// B_dev and C_dev respectively.
// Each computed element is also printed with device-side printf (debug aid).
__global__ void gemm(int m, int n, int k)
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;

    // Bounds guard: without it any surplus thread would index past the end of
    // the fixed-size device arrays.
    if (row >= m || col >= n)
        return;

    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A_dev[row * k + j] * B_dev[j * n + col];
    }

    const int idx = row * n + col;
    C_dev[idx] = sum;
    printf("The value is %f", C_dev[idx]);
}
// Host driver: fills the host matrices, copies them into the __device__
// arrays, launches gemm with one thread per element of C and copies the
// result back into C_h.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];

    // Initialise each array over its own length so the code stays in bounds
    // even when M, N and K are not all equal.
    for (int i = 0; i < M * K; i++)
        A_h[i] = (double)i;
    for (int i = 0; i < K * N; i++)
        B_h[i] = (double)i;
    for (int i = 0; i < M * N; i++)
        C_h[i] = 0.0;

    // Copy the host data into the statically declared device arrays.
    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // One block of M x N threads: thread (x, y) computes row x, column y of C.
    // The launch shape and kernel arguments are derived from the dimension
    // constants rather than repeated as literals.
    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(M, N, 1);
    gemm<<<dimGrid, dimBlock>>>(M, N, K);
    gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
    return 0;
}