将 CUDA-gdb 与 NVRTC 结合使用
Using CUDA-gdb with NVRTC
我有一个生成 CUDA C++ 源代码的应用程序,在运行时使用 NVRTC
将其编译成 PTX,然后使用 CUDA 驱动程序从中创建 CUDA 模块 API。
如果我使用 cuda-gdb
调试此应用程序,它会在回溯中显示内核(发生错误的位置),但不显示行号。
我将生成的源代码导出到一个文件中,并使用 --directory
选项将目录提供给 cuda-gdb
。我还尝试将其文件名传递给 nvrtcCreateProgram()
(name
参数)。我将编译选项 --device-debug
和 --generate-line-info
与 NVRTC 一起使用。
有没有办法让cuda-gdb
知道生成的源代码文件的位置,并在其回溯中显示行号信息?
我能够使用 cuda-gdb
在 nvrtc
生成的内核上进行内核源代码级调试,如下所示:
- 从vectorAdd_nvrtc示例代码开始
- 修改
compileFileToPTX
例程(由nvrtc_helper.h
提供),在compile-cu-to-ptx步骤中添加--device-debug
开关。
- 修改
loadPTX
例程(由nvrtc_helper.h
提供)为cuModuleLoadDataEx
load/JIT PTX-添加CU_JIT_GENERATE_DEBUG_INFO
选项(设置为1)到二进制步骤。
- 使用
-g
选项编译主函数 (vectorAdd.cpp)。
这是一个完整的测试case/session。我只显示项目中的 vectorAdd.cpp 文件,因为这是我修改的唯一文件。其他项目文件与示例项目中的相同:
$ cat vectorAdd.cpp
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/
#include <stdio.h>
#include <cmath>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <iostream>
#include <fstream>
/**
* Host main routine
*/
void my_compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
size_t *ptxResultSize, int requiresCGheaders) {
std::ifstream inputFile(filename,
std::ios::in | std::ios::binary | std::ios::ate);
if (!inputFile.is_open()) {
std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
exit(1);
}
std::streampos pos = inputFile.tellg();
size_t inputSize = (size_t)pos;
char *memBlock = new char[inputSize + 1];
inputFile.seekg(0, std::ios::beg);
inputFile.read(memBlock, inputSize);
inputFile.close();
memBlock[inputSize] = '\x0';
int numCompileOptions = 0;
char *compileParams[2];
std::string compileOptions;
if (requiresCGheaders) {
char HeaderNames[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#else
snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#endif
compileOptions = "--include-path=";
std::string path = sdkFindFilePath(HeaderNames, argv[0]);
if (!path.empty()) {
std::size_t found = path.find(HeaderNames);
path.erase(found);
} else {
printf(
"\nCooperativeGroups headers not found, please install it in %s "
"sample directory..\n Exiting..\n",
argv[0]);
}
compileOptions += path.c_str();
compileParams[0] = reinterpret_cast<char *>(
malloc(sizeof(char) * (compileOptions.length() + 1)));
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(compileParams[0], sizeof(char) * (compileOptions.length() + 1),
"%s", compileOptions.c_str());
#else
snprintf(compileParams[0], compileOptions.size(), "%s",
compileOptions.c_str());
#endif
numCompileOptions++;
}
compileOptions = "--device-debug ";
compileParams[numCompileOptions] = reinterpret_cast<char *>(malloc(sizeof(char) * (compileOptions.length() + 1)));
snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s", compileOptions.c_str());
numCompileOptions++;
// compile
nvrtcProgram prog;
NVRTC_SAFE_CALL("nvrtcCreateProgram",
nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL));
nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams);
// dump log
size_t logSize;
NVRTC_SAFE_CALL("nvrtcGetProgramLogSize",
nvrtcGetProgramLogSize(prog, &logSize));
char *log = reinterpret_cast<char *>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (strlen(log) >= 2) {
std::cerr << "\n compilation log ---\n";
std::cerr << log;
std::cerr << "\n end log ---\n";
}
free(log);
NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
// fetch PTX
size_t ptxSize;
NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = reinterpret_cast<char *>(malloc(sizeof(char) * ptxSize));
NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
*ptxResult = ptx;
*ptxResultSize = ptxSize;
#ifdef DUMP_PTX
std::ofstream my_f;
my_f.open("vectorAdd.ptx");
for (int i = 0; i < ptxSize; i++)
my_f << ptx[i];
my_f.close();
#endif
if (requiresCGheaders) free(compileParams[0]);
}
CUmodule my_loadPTX(char *ptx, int argc, char **argv) {
CUmodule module;
CUcontext context;
int major = 0, minor = 0;
char deviceName[256];
// Picks the best CUDA device available
CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
CUjit_option opt[1];
opt[0] = CU_JIT_GENERATE_DEBUG_INFO;
void **vals = new void *[1];
vals[0] = (void *)(size_t)1;
checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 1, opt, vals));
free(ptx);
return module;
}
int main(int argc, char **argv) {
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
my_compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 0);
CUmodule module = my_loadPTX(ptx, argc, argv);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));
// Print the vector length to be used, and compute its size
int numElements = 50000;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A
float *h_A = reinterpret_cast<float *>(malloc(size));
// Allocate the host input vector B
float *h_B = reinterpret_cast<float *>(malloc(size));
// Allocate the host output vector C
float *h_C = reinterpret_cast<float *>(malloc(size));
// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / static_cast<float>(RAND_MAX);
h_B[i] = rand() / static_cast<float>(RAND_MAX);
}
// Allocate the device input vector A
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, size));
// Allocate the device input vector B
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, size));
// Allocate the device output vector C
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, size));
// Copy the host input vectors A and B in host memory to the device input
// vectors in device memory
printf("Copy input data from the host memory to the CUDA device\n");
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
dim3 cudaGridSize(blocksPerGrid, 1, 1);
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
reinterpret_cast<void *>(&d_C),
reinterpret_cast<void *>(&numElements)};
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y,
cudaBlockSize.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Free host memory
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
}
$ nvcc -g -I/usr/local/cuda/samples/common/inc -o test vectorAdd.cpp -lnvrtc -lcuda
$ cuda-gdb ./test
NVIDIA (R) CUDA Debugger
10.0 release
Portions Copyright (C) 2007-2018 NVIDIA Corporation
GNU gdb (GDB) 7.12
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./test...done.
(cuda-gdb) break vectorAdd
Function "vectorAdd" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (vectorAdd) pending.
(cuda-gdb) r
Starting program: /home/user2/misc/junk/vectorAdd_nvrtc/test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffedc00700 (LWP 16789)]
> Using CUDA Device [1]: Tesla K40m
> GPU Device has SM 3.5 compute capability
[New Thread 0x7fffed3ff700 (LWP 16790)]
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Thread 1 "test" hit Breakpoint 1, vectorAdd<<<(196,1,1),(256,1,1)>>> (A=0x7fffce800000, B=0x7fffce830e00, C=0x7fffce861c00, numElements=50000) at ./vectorAdd_kernel.cu:21
21 int i = blockDim.x * blockIdx.x + threadIdx.x;
(cuda-gdb) step
23 if (i < numElements) {
(cuda-gdb) step
24 C[i] = A[i] + B[i];
(cuda-gdb) step
26 }
(cuda-gdb) quit
A debugging session is active.
Inferior 1 [process 16777] will be killed.
Quit anyway? (y or n) y
$
我有一个生成 CUDA C++ 源代码的应用程序,在运行时使用 NVRTC
将其编译成 PTX,然后使用 CUDA 驱动程序从中创建 CUDA 模块 API。
如果我使用 cuda-gdb
调试此应用程序,它会在回溯中显示内核(发生错误的位置),但不显示行号。
我将生成的源代码导出到一个文件中,并使用 --directory
选项将目录提供给 cuda-gdb
。我还尝试将其文件名传递给 nvrtcCreateProgram()
(name
参数)。我将编译选项 --device-debug
和 --generate-line-info
与 NVRTC 一起使用。
有没有办法让cuda-gdb
知道生成的源代码文件的位置,并在其回溯中显示行号信息?
我能够使用 cuda-gdb
在 nvrtc
生成的内核上进行内核源代码级调试,如下所示:
- 从vectorAdd_nvrtc示例代码开始
- 修改
compileFileToPTX
例程(由nvrtc_helper.h
提供),在compile-cu-to-ptx步骤中添加--device-debug
开关。 - 修改
loadPTX
例程(由nvrtc_helper.h
提供)为cuModuleLoadDataEx
load/JIT PTX-添加CU_JIT_GENERATE_DEBUG_INFO
选项(设置为1)到二进制步骤。 - 使用
-g
选项编译主函数 (vectorAdd.cpp)。
这是一个完整的测试case/session。我只显示项目中的 vectorAdd.cpp 文件,因为这是我修改的唯一文件。其他项目文件与示例项目中的相同:
$ cat vectorAdd.cpp
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/
#include <stdio.h>
#include <cmath>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <iostream>
#include <fstream>
/**
* Host main routine
*/
void my_compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
size_t *ptxResultSize, int requiresCGheaders) {
std::ifstream inputFile(filename,
std::ios::in | std::ios::binary | std::ios::ate);
if (!inputFile.is_open()) {
std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
exit(1);
}
std::streampos pos = inputFile.tellg();
size_t inputSize = (size_t)pos;
char *memBlock = new char[inputSize + 1];
inputFile.seekg(0, std::ios::beg);
inputFile.read(memBlock, inputSize);
inputFile.close();
memBlock[inputSize] = '\x0';
int numCompileOptions = 0;
char *compileParams[2];
std::string compileOptions;
if (requiresCGheaders) {
char HeaderNames[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#else
snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#endif
compileOptions = "--include-path=";
std::string path = sdkFindFilePath(HeaderNames, argv[0]);
if (!path.empty()) {
std::size_t found = path.find(HeaderNames);
path.erase(found);
} else {
printf(
"\nCooperativeGroups headers not found, please install it in %s "
"sample directory..\n Exiting..\n",
argv[0]);
}
compileOptions += path.c_str();
compileParams[0] = reinterpret_cast<char *>(
malloc(sizeof(char) * (compileOptions.length() + 1)));
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(compileParams[0], sizeof(char) * (compileOptions.length() + 1),
"%s", compileOptions.c_str());
#else
snprintf(compileParams[0], compileOptions.size(), "%s",
compileOptions.c_str());
#endif
numCompileOptions++;
}
compileOptions = "--device-debug ";
compileParams[numCompileOptions] = reinterpret_cast<char *>(malloc(sizeof(char) * (compileOptions.length() + 1)));
snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s", compileOptions.c_str());
numCompileOptions++;
// compile
nvrtcProgram prog;
NVRTC_SAFE_CALL("nvrtcCreateProgram",
nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL));
nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams);
// dump log
size_t logSize;
NVRTC_SAFE_CALL("nvrtcGetProgramLogSize",
nvrtcGetProgramLogSize(prog, &logSize));
char *log = reinterpret_cast<char *>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (strlen(log) >= 2) {
std::cerr << "\n compilation log ---\n";
std::cerr << log;
std::cerr << "\n end log ---\n";
}
free(log);
NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
// fetch PTX
size_t ptxSize;
NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = reinterpret_cast<char *>(malloc(sizeof(char) * ptxSize));
NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
*ptxResult = ptx;
*ptxResultSize = ptxSize;
#ifdef DUMP_PTX
std::ofstream my_f;
my_f.open("vectorAdd.ptx");
for (int i = 0; i < ptxSize; i++)
my_f << ptx[i];
my_f.close();
#endif
if (requiresCGheaders) free(compileParams[0]);
}
CUmodule my_loadPTX(char *ptx, int argc, char **argv) {
CUmodule module;
CUcontext context;
int major = 0, minor = 0;
char deviceName[256];
// Picks the best CUDA device available
CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
CUjit_option opt[1];
opt[0] = CU_JIT_GENERATE_DEBUG_INFO;
void **vals = new void *[1];
vals[0] = (void *)(size_t)1;
checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 1, opt, vals));
free(ptx);
return module;
}
int main(int argc, char **argv) {
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
my_compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 0);
CUmodule module = my_loadPTX(ptx, argc, argv);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));
// Print the vector length to be used, and compute its size
int numElements = 50000;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A
float *h_A = reinterpret_cast<float *>(malloc(size));
// Allocate the host input vector B
float *h_B = reinterpret_cast<float *>(malloc(size));
// Allocate the host output vector C
float *h_C = reinterpret_cast<float *>(malloc(size));
// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / static_cast<float>(RAND_MAX);
h_B[i] = rand() / static_cast<float>(RAND_MAX);
}
// Allocate the device input vector A
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, size));
// Allocate the device input vector B
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, size));
// Allocate the device output vector C
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, size));
// Copy the host input vectors A and B in host memory to the device input
// vectors in device memory
printf("Copy input data from the host memory to the CUDA device\n");
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
dim3 cudaGridSize(blocksPerGrid, 1, 1);
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
reinterpret_cast<void *>(&d_C),
reinterpret_cast<void *>(&numElements)};
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y,
cudaBlockSize.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Free host memory
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
}
$ nvcc -g -I/usr/local/cuda/samples/common/inc -o test vectorAdd.cpp -lnvrtc -lcuda
$ cuda-gdb ./test
NVIDIA (R) CUDA Debugger
10.0 release
Portions Copyright (C) 2007-2018 NVIDIA Corporation
GNU gdb (GDB) 7.12
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./test...done.
(cuda-gdb) break vectorAdd
Function "vectorAdd" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (vectorAdd) pending.
(cuda-gdb) r
Starting program: /home/user2/misc/junk/vectorAdd_nvrtc/test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffedc00700 (LWP 16789)]
> Using CUDA Device [1]: Tesla K40m
> GPU Device has SM 3.5 compute capability
[New Thread 0x7fffed3ff700 (LWP 16790)]
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Thread 1 "test" hit Breakpoint 1, vectorAdd<<<(196,1,1),(256,1,1)>>> (A=0x7fffce800000, B=0x7fffce830e00, C=0x7fffce861c00, numElements=50000) at ./vectorAdd_kernel.cu:21
21 int i = blockDim.x * blockIdx.x + threadIdx.x;
(cuda-gdb) step
23 if (i < numElements) {
(cuda-gdb) step
24 C[i] = A[i] + B[i];
(cuda-gdb) step
26 }
(cuda-gdb) quit
A debugging session is active.
Inferior 1 [process 16777] will be killed.
Quit anyway? (y or n) y
$