curandState 的 PyCUDA 非法内存访问*
PyCUDA illegal memory access of curandState*
我正在研究入侵物种的传播,并尝试使用 XORWOW 随机数生成器在 PyCUDA 内核中生成随机数。我需要能够在研究中用作输入的矩阵非常大(高达 8,000 x 8,000)。
索引 XORWOW 生成器的 curandState*
时,错误似乎发生在 get_random_number
内部。代码在较小的矩阵上执行时没有错误,并产生正确的结果。我 运行 我的代码在 2 个 NVidia Tesla K20X GPU 上。
内核代码和设置:
kernel_code = '''
#include <curand_kernel.h>
#include <math.h>
extern "C" {
__device__ float get_random_number(curandState* global_state, int thread_id) {
curandState local_state = global_state[thread_id];
float num = curand_uniform(&local_state);
global_state[thread_id] = local_state;
return num;
}
__global__ void survival_of_the_fittest(float* grid_a, float* grid_b, curandState* global_state, int grid_size, float* survival_probabilities) {
int x = threadIdx.x + blockIdx.x * blockDim.x; // column index of cell
int y = threadIdx.y + blockIdx.y * blockDim.y; // row index of cell
// make sure this cell is within bounds of grid
if (x < grid_size && y < grid_size) {
int thread_id = y * grid_size + x; // thread index
grid_b[thread_id] = grid_a[thread_id]; // copy current cell
float num;
// ignore cell if it is not already populated
if (grid_a[thread_id] > 0.0) {
num = get_random_number(global_state, thread_id);
// agents in this cell die
if (num < survival_probabilities[thread_id]) {
grid_b[thread_id] = 0.0; // cell dies
//printf("Cell (%d,%d) died (probability of death was %f)\n", x, y, survival_probabilities[thread_id]);
}
}
}
}
mod = SourceModule(kernel_code, no_extern_c = True)
survival = mod.get_function('survival_of_the_fittest')
数据设置:
matrix_size = 2000
block_dims = 32
grid_dims = (matrix_size + block_dims - 1) // block_dims
grid_a = gpuarray.to_gpu(np.ones((matrix_size,matrix_size)).astype(np.float32))
grid_b = gpuarray.to_gpu(np.zeros((matrix_size,matrix_size)).astype(np.float32))
generator = curandom.XORWOWRandomNumberGenerator()
grid_size = np.int32(matrix_size)
survival_probabilities = gpuarray.to_gpu(np.random.uniform(0,1,(matrix_size,matrix_size)))
内核调用:
survival(grid_a, grid_b, generator.state, grid_size, survival_probabilities,
grid = (grid_dims, grid_dims), block = (block_dims, block_dims, 1))
我希望能够为高达 (8,000 x 8,000) 的矩阵生成 (0,1] 范围内的随机数,但是在大型矩阵上执行我的代码会导致非法内存访问错误。
pycuda._driver.LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
我是否在 get_random_number
中错误地索引了 curandState*
?如果不是,还有什么可能导致此错误?
这里的问题是 this code 决定了 PyCUDA curandom
接口为其内部状态分配的状态大小与您 post 中的这段代码之间的脱节:
matrix_size = 2000
block_dims = 32
grid_dims = (matrix_size + block_dims - 1) // block_dims
您似乎假设 PyCUDA 会神奇地为您在代码中 select 的任何块和网格维度分配足够的状态。这显然不太可能,尤其是在大网格尺寸下。你要么需要
- 修改您的代码以使用与
curandom
模块内部用于您选择使用的任何生成器相同的块和网格大小,或者
- 分配和管理您自己的状态暂存 space 以便您分配足够的状态来服务您 select
的块和网格大小
关于这两种方法中哪一种更适合您的应用程序,我将其留作 reader 的练习。
我正在研究入侵物种的传播,并尝试使用 XORWOW 随机数生成器在 PyCUDA 内核中生成随机数。我需要能够在研究中用作输入的矩阵非常大(高达 8,000 x 8,000)。
索引 XORWOW 生成器的 curandState*
时,错误似乎发生在 get_random_number
内部。代码在较小的矩阵上执行时没有错误,并产生正确的结果。我 运行 我的代码在 2 个 NVidia Tesla K20X GPU 上。
内核代码和设置:
kernel_code = '''
#include <curand_kernel.h>
#include <math.h>
extern "C" {
__device__ float get_random_number(curandState* global_state, int thread_id) {
curandState local_state = global_state[thread_id];
float num = curand_uniform(&local_state);
global_state[thread_id] = local_state;
return num;
}
__global__ void survival_of_the_fittest(float* grid_a, float* grid_b, curandState* global_state, int grid_size, float* survival_probabilities) {
int x = threadIdx.x + blockIdx.x * blockDim.x; // column index of cell
int y = threadIdx.y + blockIdx.y * blockDim.y; // row index of cell
// make sure this cell is within bounds of grid
if (x < grid_size && y < grid_size) {
int thread_id = y * grid_size + x; // thread index
grid_b[thread_id] = grid_a[thread_id]; // copy current cell
float num;
// ignore cell if it is not already populated
if (grid_a[thread_id] > 0.0) {
num = get_random_number(global_state, thread_id);
// agents in this cell die
if (num < survival_probabilities[thread_id]) {
grid_b[thread_id] = 0.0; // cell dies
//printf("Cell (%d,%d) died (probability of death was %f)\n", x, y, survival_probabilities[thread_id]);
}
}
}
}
mod = SourceModule(kernel_code, no_extern_c = True)
survival = mod.get_function('survival_of_the_fittest')
数据设置:
matrix_size = 2000
block_dims = 32
grid_dims = (matrix_size + block_dims - 1) // block_dims
grid_a = gpuarray.to_gpu(np.ones((matrix_size,matrix_size)).astype(np.float32))
grid_b = gpuarray.to_gpu(np.zeros((matrix_size,matrix_size)).astype(np.float32))
generator = curandom.XORWOWRandomNumberGenerator()
grid_size = np.int32(matrix_size)
survival_probabilities = gpuarray.to_gpu(np.random.uniform(0,1,(matrix_size,matrix_size)))
内核调用:
survival(grid_a, grid_b, generator.state, grid_size, survival_probabilities,
grid = (grid_dims, grid_dims), block = (block_dims, block_dims, 1))
我希望能够为高达 (8,000 x 8,000) 的矩阵生成 (0,1] 范围内的随机数,但是在大型矩阵上执行我的代码会导致非法内存访问错误。
pycuda._driver.LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
我是否在 get_random_number
中错误地索引了 curandState*
?如果不是,还有什么可能导致此错误?
这里的问题是 this code 决定了 PyCUDA curandom
接口为其内部状态分配的状态大小与您 post 中的这段代码之间的脱节:
matrix_size = 2000
block_dims = 32
grid_dims = (matrix_size + block_dims - 1) // block_dims
您似乎假设 PyCUDA 会神奇地为您在代码中 select 的任何块和网格维度分配足够的状态。这显然不太可能,尤其是在大网格尺寸下。你要么需要
- 修改您的代码以使用与
curandom
模块内部用于您选择使用的任何生成器相同的块和网格大小,或者 - 分配和管理您自己的状态暂存 space 以便您分配足够的状态来服务您 select 的块和网格大小
关于这两种方法中哪一种更适合您的应用程序,我将其留作 reader 的练习。