CUDA 内核指针参数变为 NULL
CUDA kernel pointer arguments become NULL
我的 CUDA 内核需要用到大量数组,这些数组都以指针的形式作为参数传递给内核。问题是:就在内核启动之前,所有指针都持有有效地址,而且 cudaMalloc 和 cudaMemcpy 调用始终返回 cudaSuccess,但内核一旦启动,所有这些指针参数就都变成了 NULL!
我完全不清楚发生了什么。下面是我用 cuda-gdb 运行代码时得到的结果:
CUDA Exception: Device Illegal Address
The exception was triggered in device 0.
Program received signal CUDA_EXCEPTION_10, Device Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (64,0,0), device 0, sm 1, warp 2, lane 0]
0x00000000062a3dd8 in compute_data_and_match_kernel<<<(2,1,1),(512,1,1)>>> (a11=0x0, a12=0x0, a22=0x0, b1=0x0, b2=0x0, mask=0x0, wx=0x0, wy=0x0, du=0x0, dv=0x0, uu=0x0,
vv=0x0, Ix_c1=0x0, Ix_c2=0x0, Ix_c3=0x0, Iy_c1=0x0, Iy_c2=0x0, Iy_c3=0x0, Iz_c1=0x0, Iz_c2=0x0, Iz_c3=0x0, Ixx_c1=0x0, Ixx_c2=0x0, Ixx_c3=0x0, Ixy_c1=0x0, Ixy_c2=0x0,
Ixy_c3=0x0, Iyy_c1=0x0, Iyy_c2=0x0, Iyy_c3=0x0, Ixz_c1=0x0, Ixz_c2=0x0, Ixz_c3=0x0, Iyz_c1=0x0, Iyz_c2=0x0, Iyz_c3=0x0, desc_weight=0x0, desc_flow_x=0x0,
desc_flow_y=0x0, half_delta_over3=0.0833333358, half_beta=0, half_gamma_over3=0.833333313, width=59, height=26, stride=60) at opticalflow_aux.cu:441
441 ix_c1_val = Ix_c1[index]; iy_c1_val = Iy_c1[index]; iz_c1_val = Iz_c1[index];
(cuda-gdb)
有什么非常明显的东西是我遗漏的吗?
提前致谢。
编辑 1:
正如 Gilles 所建议的,我试图将主机指针和数据复制到一个结构中,然后再复制到设备上。为了简单起见(MCVE),我在结构中只使用了一个指针:
#include <cuda.h>
#include <stdio.h>
// Single-field wrapper that carries an array pointer into test_kernel.
struct test {
    float *ptr;  // start of the float array that test_kernel writes through
};
typedef struct test test_t;
// Sets the first ten floats reachable through s->ptr to 100.
//
// s is dereferenced in device code, so both s and s->ptr must be addresses the
// GPU can access. There is no thread indexing: every launched thread performs
// the same ten stores.
__global__ void test_kernel(test_t *s) {
    float *out = s->ptr;
    for (int i = 0; i < 10; ++i) {
        out[i] = 100;
    }
}
// Host driver of the MCVE as posted in the question: uploads ten floats, stores the
// device buffer address in a test_t, copies that struct to the device, launches
// test_kernel on it, then tries to read the result back and print it.
// NOTE(review): after h_struct->ptr is redirected to d_data below, the copy-back and
// the printf loop still go through h_struct->ptr, i.e. through a device address.
int main() {
float arr[] = {0,1,2,3,4,5,6,7,8,9};
test_t *h_struct;
// Host-side staging copy of the struct; it is never freed before main returns.
h_struct = (test_t *)malloc(sizeof(test_t));
// ptr first refers to the host array so it can be used as the upload source.
h_struct->ptr = arr;
test_t *d_struct;
float *d_data;
// The return codes of the CUDA calls in this function are not checked.
cudaMalloc((void **)&d_struct, sizeof(test_t));
cudaMalloc((void **)&d_data, sizeof(float)*10);
// Copy the data from host to device
cudaMemcpy(d_data, h_struct->ptr, sizeof(float)*10, cudaMemcpyHostToDevice);
// Point the host struct ptr to device memory
h_struct->ptr = d_data;
// Copy the host struct (which now holds the device address) to the device
cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
// Kernel launch: one block of one thread
test_kernel<<<1,1>>>(d_struct);
// Intended to copy the device array back to the host.
// NOTE(review): h_struct->ptr equals d_data at this point, so the destination is the
// device buffer itself rather than arr; arr is never updated with the kernel's output.
cudaMemcpy(h_struct->ptr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_struct);
// Verifying if all the values have been set to 100
// NOTE(review): h_struct->ptr still holds the device address d_data (already passed to
// cudaFree above), so the loop below dereferences a device pointer in host code.
int i;
for(i=0 ; i<10 ; i++)
printf("%f\t", h_struct->ptr[i]);
return 0;
}
当我在内核启动之前检查 d_struct->ptr 的值时,它显示为 0x0。(我是在调试模式下使用 Nsight 检查这些值的。)
不确定这是否是问题所在,但我相信用于向内核传递参数的堆栈大小是有限的。您可能需要创建一个存储参数的结构,将其复制到设备并仅将指向它的指针作为参数传递给内核。然后,在内核中,您从结构中检索参数...
编辑:添加了提交代码的更正版本。
这对我有用,并举例说明了我描述的原则。
#include <cuda.h>
#include <stdio.h>
// Single-field wrapper that carries an array pointer into test_kernel.
struct test {
    float *ptr;  // start of the float array that test_kernel writes through
};
typedef struct test test_t;
// Sets the first ten floats reachable through s->ptr to 100.
//
// s is dereferenced in device code, so both s and s->ptr must be addresses the
// GPU can access. There is no thread indexing: every launched thread performs
// the same ten stores.
__global__ void test_kernel(test_t *s) {
    float *out = s->ptr;
    for (int i = 0; i < 10; ++i) {
        out[i] = 100;
    }
}
// Host driver: uploads ten floats, wraps the device buffer address in a test_t,
// copies that struct to the device, runs test_kernel on it, copies the floats
// back into arr and prints them.
//
// Returns 0 on success. If any CUDA call or the kernel launch reports an error,
// the failing step is printed to stderr and 1 is returned.
int main() {
    float arr[] = {0,1,2,3,4,5,6,7,8,9};
    const int count = (int)(sizeof(arr) / sizeof(arr[0]));

    // Host-side staging copy of the struct. It only has to live until it has
    // been copied to the device, so a stack object replaces the unchecked,
    // never-freed malloc.
    test_t h_struct;
    test_t *d_struct = NULL;
    float *d_data = NULL;

    // Each step runs only if every previous one succeeded; `step` names the
    // operation in progress so a failure can be reported precisely.
    const char *step = "cudaMalloc(d_struct)";
    cudaError_t err = cudaMalloc((void **)&d_struct, sizeof(test_t));

    if (err == cudaSuccess) {
        step = "cudaMalloc(d_data)";
        err = cudaMalloc((void **)&d_data, sizeof(arr));
    }
    if (err == cudaSuccess) {
        // Copy the data from host to device
        step = "cudaMemcpy(arr -> d_data)";
        err = cudaMemcpy(d_data, arr, sizeof(arr), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        // The struct handed to the kernel must hold the DEVICE address of the
        // data; the host keeps using arr for its own reads and writes.
        h_struct.ptr = d_data;
        step = "cudaMemcpy(h_struct -> d_struct)";
        err = cudaMemcpy(d_struct, &h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        step = "test_kernel launch";
        test_kernel<<<1,1>>>(d_struct);
        // A launch does not return a status; launch failures must be queried.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Copy the device array back into the host array. This blocking copy
        // also reports errors raised while the kernel was executing.
        step = "cudaMemcpy(d_data -> arr)";
        err = cudaMemcpy(arr, d_data, sizeof(arr), cudaMemcpyDeviceToHost);
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(d_data);
    cudaFree(d_struct);

    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
        return 1;
    }

    // Verifying if all the values have been set to 100
    for (int i = 0; i < count; i++)
        printf("%f\t", arr[i]);
    return 0;
}
我的 CUDA 内核需要用到大量数组,这些数组都以指针的形式作为参数传递给内核。问题是:就在内核启动之前,所有指针都持有有效地址,而且 cudaMalloc 和 cudaMemcpy 调用始终返回 cudaSuccess,但内核一旦启动,所有这些指针参数就都变成了 NULL!
我完全不清楚发生了什么。下面是我用 cuda-gdb 运行代码时得到的结果:
CUDA Exception: Device Illegal Address
The exception was triggered in device 0.
Program received signal CUDA_EXCEPTION_10, Device Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (64,0,0), device 0, sm 1, warp 2, lane 0]
0x00000000062a3dd8 in compute_data_and_match_kernel<<<(2,1,1),(512,1,1)>>> (a11=0x0, a12=0x0, a22=0x0, b1=0x0, b2=0x0, mask=0x0, wx=0x0, wy=0x0, du=0x0, dv=0x0, uu=0x0,
vv=0x0, Ix_c1=0x0, Ix_c2=0x0, Ix_c3=0x0, Iy_c1=0x0, Iy_c2=0x0, Iy_c3=0x0, Iz_c1=0x0, Iz_c2=0x0, Iz_c3=0x0, Ixx_c1=0x0, Ixx_c2=0x0, Ixx_c3=0x0, Ixy_c1=0x0, Ixy_c2=0x0,
Ixy_c3=0x0, Iyy_c1=0x0, Iyy_c2=0x0, Iyy_c3=0x0, Ixz_c1=0x0, Ixz_c2=0x0, Ixz_c3=0x0, Iyz_c1=0x0, Iyz_c2=0x0, Iyz_c3=0x0, desc_weight=0x0, desc_flow_x=0x0,
desc_flow_y=0x0, half_delta_over3=0.0833333358, half_beta=0, half_gamma_over3=0.833333313, width=59, height=26, stride=60) at opticalflow_aux.cu:441
441 ix_c1_val = Ix_c1[index]; iy_c1_val = Iy_c1[index]; iz_c1_val = Iz_c1[index];
(cuda-gdb)
有什么非常明显的东西是我遗漏的吗? 提前致谢。
编辑 1: 正如 Gilles 所建议的,我试图将主机指针和数据复制到一个结构中,然后再复制到设备上。为了简单起见(MCVE),我在结构中只使用了一个指针:
#include <cuda.h>
#include <stdio.h>
// Single-field wrapper that carries an array pointer into test_kernel.
struct test {
    float *ptr;  // start of the float array that test_kernel writes through
};
typedef struct test test_t;
// Sets the first ten floats reachable through s->ptr to 100.
//
// s is dereferenced in device code, so both s and s->ptr must be addresses the
// GPU can access. There is no thread indexing: every launched thread performs
// the same ten stores.
__global__ void test_kernel(test_t *s) {
    float *out = s->ptr;
    for (int i = 0; i < 10; ++i) {
        out[i] = 100;
    }
}
// Host driver of the MCVE as posted in the question: uploads ten floats, stores the
// device buffer address in a test_t, copies that struct to the device, launches
// test_kernel on it, then tries to read the result back and print it.
// NOTE(review): after h_struct->ptr is redirected to d_data below, the copy-back and
// the printf loop still go through h_struct->ptr, i.e. through a device address.
int main() {
float arr[] = {0,1,2,3,4,5,6,7,8,9};
test_t *h_struct;
// Host-side staging copy of the struct; it is never freed before main returns.
h_struct = (test_t *)malloc(sizeof(test_t));
// ptr first refers to the host array so it can be used as the upload source.
h_struct->ptr = arr;
test_t *d_struct;
float *d_data;
// The return codes of the CUDA calls in this function are not checked.
cudaMalloc((void **)&d_struct, sizeof(test_t));
cudaMalloc((void **)&d_data, sizeof(float)*10);
// Copy the data from host to device
cudaMemcpy(d_data, h_struct->ptr, sizeof(float)*10, cudaMemcpyHostToDevice);
// Point the host struct ptr to device memory
h_struct->ptr = d_data;
// Copy the host struct (which now holds the device address) to the device
cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
// Kernel launch: one block of one thread
test_kernel<<<1,1>>>(d_struct);
// Intended to copy the device array back to the host.
// NOTE(review): h_struct->ptr equals d_data at this point, so the destination is the
// device buffer itself rather than arr; arr is never updated with the kernel's output.
cudaMemcpy(h_struct->ptr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_struct);
// Verifying if all the values have been set to 100
// NOTE(review): h_struct->ptr still holds the device address d_data (already passed to
// cudaFree above), so the loop below dereferences a device pointer in host code.
int i;
for(i=0 ; i<10 ; i++)
printf("%f\t", h_struct->ptr[i]);
return 0;
}
当我在内核启动之前检查 d_struct->ptr 的值时,它显示为 0x0。(我是在调试模式下使用 Nsight 检查这些值的。)
不确定这是否是问题所在,但我相信用于向内核传递参数的堆栈大小是有限的。您可能需要创建一个存储参数的结构,将其复制到设备并仅将指向它的指针作为参数传递给内核。然后,在内核中,您从结构中检索参数...
编辑:添加了提交代码的更正版本。 这对我有用,并举例说明了我描述的原则。
#include <cuda.h>
#include <stdio.h>
// Single-field wrapper that carries an array pointer into test_kernel.
struct test {
    float *ptr;  // start of the float array that test_kernel writes through
};
typedef struct test test_t;
// Sets the first ten floats reachable through s->ptr to 100.
//
// s is dereferenced in device code, so both s and s->ptr must be addresses the
// GPU can access. There is no thread indexing: every launched thread performs
// the same ten stores.
__global__ void test_kernel(test_t *s) {
    float *out = s->ptr;
    for (int i = 0; i < 10; ++i) {
        out[i] = 100;
    }
}
// Host driver: uploads ten floats, wraps the device buffer address in a test_t,
// copies that struct to the device, runs test_kernel on it, copies the floats
// back into arr and prints them.
//
// Returns 0 on success. If any CUDA call or the kernel launch reports an error,
// the failing step is printed to stderr and 1 is returned.
int main() {
    float arr[] = {0,1,2,3,4,5,6,7,8,9};
    const int count = (int)(sizeof(arr) / sizeof(arr[0]));

    // Host-side staging copy of the struct. It only has to live until it has
    // been copied to the device, so a stack object replaces the unchecked,
    // never-freed malloc.
    test_t h_struct;
    test_t *d_struct = NULL;
    float *d_data = NULL;

    // Each step runs only if every previous one succeeded; `step` names the
    // operation in progress so a failure can be reported precisely.
    const char *step = "cudaMalloc(d_struct)";
    cudaError_t err = cudaMalloc((void **)&d_struct, sizeof(test_t));

    if (err == cudaSuccess) {
        step = "cudaMalloc(d_data)";
        err = cudaMalloc((void **)&d_data, sizeof(arr));
    }
    if (err == cudaSuccess) {
        // Copy the data from host to device
        step = "cudaMemcpy(arr -> d_data)";
        err = cudaMemcpy(d_data, arr, sizeof(arr), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        // The struct handed to the kernel must hold the DEVICE address of the
        // data; the host keeps using arr for its own reads and writes.
        h_struct.ptr = d_data;
        step = "cudaMemcpy(h_struct -> d_struct)";
        err = cudaMemcpy(d_struct, &h_struct, sizeof(test_t), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        step = "test_kernel launch";
        test_kernel<<<1,1>>>(d_struct);
        // A launch does not return a status; launch failures must be queried.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Copy the device array back into the host array. This blocking copy
        // also reports errors raised while the kernel was executing.
        step = "cudaMemcpy(d_data -> arr)";
        err = cudaMemcpy(arr, d_data, sizeof(arr), cudaMemcpyDeviceToHost);
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(d_data);
    cudaFree(d_struct);

    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
        return 1;
    }

    // Verifying if all the values have been set to 100
    for (int i = 0; i < count; i++)
        printf("%f\t", arr[i]);
    return 0;
}