Segmentation Fault while executing CUDA program
I am new to NVIDIA CUDA programming and I am getting a 'Segmentation Fault' while executing my program, which uses the CUBLAS library. I have installed NVIDIA CUDA Toolkit 6.5.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
int main(int argc, char **argv)
{
cublasStatus_t status;
float *h_A;
float *h_B;
float *h_C;
float *d_A = 0;
float *d_B = 0;
float *d_C = 0;
int n2 = 5;
float *h_T;
cublasHandle_t handle;
int dev = findCudaDevice(argc, (const char **)argv);
if (dev == -1)
{
return EXIT_FAILURE;
}
/* Initialize CUBLAS */
printf("simpleCUBLAS test running..\n");
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
printf("Allocating A\n");
/* Allocate host memory for the matrices */
h_A = (float *)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0)
{
fprintf(stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
printf("Allocated A\n");
h_B = (float *)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0)
{
fprintf(stderr, "!!!! host memory allocation error (B)\n");
return EXIT_FAILURE;
}
printf("Allocated B\n");
h_C = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0)
{
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
printf("Allocated C\n");
h_T = (float *)malloc(n2 * sizeof(h_T[0]));
if (h_T == 0)
{
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Fill the matrices with test data */
int i;
for (i = 0; i < n2; i++)
{
h_A[i] = i;
h_B[i] = i;
//h_A[i] = rand() / (float)RAND_MAX;
//h_B[i] = rand() / (float)RAND_MAX;
h_C[i] = 0;
}
printf("Filled A,, B, C\n");
/* Allocate device memory for the matrices */
if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
printf("Allocated d_A\n");
if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
return EXIT_FAILURE;
}
printf("Allocated d_B\n");
if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
return EXIT_FAILURE;
}
printf("Allocated d_C\n");
status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write B)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
fprintf(stderr, "!!!! error test\n");
printf("Vectors set.\n");
status = cublasGetVector(n2, sizeof(h_T[0]), d_A, 1, h_T, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (read T)\n");
return EXIT_FAILURE;
}
int f;
for (f = 0; f < n2; f++)
{
printf("T[%d]=%f\n", f, h_T[f]);
}
status = cublasSdot(handle, n2, d_A, 1, d_B, 1, d_C);
printf("Dot product done.\n");
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_A) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_B) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (B)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_C) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (C)\n");
return EXIT_FAILURE;
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
return 0;
}
The program runs fine up to the call to 'cublasSdot'; I was able to work this out by inserting print statements.
The output of my program:
GPU Device 0: "GRID K520" with compute capability 3.0
simpleCUBLAS test running..
Allocating A
Allocated A
Allocated B
Allocated C
Filled A, B, C
Allocated d_A
Allocated d_B
Allocated d_C
!!!! error test
Vectors set.
T[0]=0.000000
T[1]=1.000000
T[2]=2.000000
T[3]=3.000000
T[4]=4.000000
Segmentation fault
I think I may be calling the method cublasSdot() incorrectly. Please point out what is wrong.
Note: I created the above program by referring to the CUDA Toolkit 6.5 sample file 'simpleCUBLAS.cpp'. I printed the values of the array 'T' to test the method 'cublasGetVector'. 'cublasSdot' computes the dot product of two vectors [More information].
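For reference, the cuBLAS documentation gives the following prototype for cublasSdot; how the final argument is interpreted depends on the handle's pointer mode:

cublasStatus_t cublasSdot(cublasHandle_t handle, int n,
                          const float *x, int incx,
                          const float *y, int incy,
                          float *result);
/* With the default CUBLAS_POINTER_MODE_HOST, `result` must be a host
   pointer; with CUBLAS_POINTER_MODE_DEVICE it must point to device memory. */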
I was able to resolve the 'Segmentation Fault' by adding the following line right after cublasCreate():
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
This tells the CUBLAS library that the scalar result argument (d_C here) is a device pointer, so cublasSdot writes the dot product into device memory instead of dereferencing the device address on the host.
Source: Retaining dot product on GPGPU using CUBLAS routine
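Below is a minimal sketch of the corrected call sequence, reusing handle, status, n2 and the device buffers d_A, d_B, d_C from the program above (the local variable `dot` is added only for illustration):

/* Put the handle into device pointer mode right after creating it, so
   cublasSdot writes its scalar result into device memory (d_C) instead of
   dereferencing the device address on the host, which is what caused the
   segmentation fault. */
status = cublasCreate(&handle);
status = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);

/* ... allocate and fill d_A, d_B, d_C exactly as in the program above ... */

status = cublasSdot(handle, n2, d_A, 1, d_B, 1, d_C); /* result stays on the GPU */

/* Copy the single float back to the host to inspect it. */
float dot = 0.0f;
cudaMemcpy(&dot, d_C, sizeof(float), cudaMemcpyDeviceToHost);
printf("Dot product = %f\n", dot); /* 0*0 + 1*1 + 2*2 + 3*3 + 4*4 = 30 */

An alternative is to keep the default CUBLAS_POINTER_MODE_HOST and pass an ordinary host variable as the result (float dot; cublasSdot(handle, n2, d_A, 1, d_B, 1, &dot);), in which case no copy back is needed. Note that in device pointer mode the scalar arguments of other cuBLAS routines (for example alpha and beta) are also expected to be device pointers.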