CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）

Question

不知何故，当我在下面的代码中修改 d_updated_water_flow_map 时，d_terrain_height_map 也被修改了。

更改两个数组的分配顺序可以解决问题，但我认为这只是掩盖了问题的根本原因。

cudaCheck(cudaMalloc((void **)&d_water_flow_map, SIZE * 4)); 
cudaCheck(cudaMalloc((void **)&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc((void **)&d_terrain_height_map, SIZE));

我正在将内核编译成一个 DLL，并在 Blender 3D python 解释器中的 python 文件下方调用它。所有值都是 32 位浮点数。

cu_include.h

#pragma once  

#ifdef MATHLIBRARY_EXPORTS  
#define MATHLIBRARY_API __declspec(dllexport)   
#else  
#define MATHLIBRARY_API __declspec(dllimport)   
#endif  


extern "C" __declspec(dllexport)
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);

extern "C" __declspec(dllexport)
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);

extern "C" __declspec(dllexport)
void free_mem();

extern "C" __declspec(dllexport)
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

erosion_kernel.dll

#include "cu_include.h"

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>

// includes CUDA
#include <cuda_runtime.h>

using namespace std;

#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col

// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
#define cudaCheck(error) \
  if (error != cudaSuccess) { \
    printf("Fatal error: %s at %s:%d\n", \
      cudaGetErrorString(error), \
      __FILE__, __LINE__); \
    exit(1); \
              }


__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int col = index % SIZE_X;
    int row = index / SIZE_X; 

    index = row * (SIZE_X * 4) + col * 4;   // 3D index
    d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
    d_updated_water_flow_map[index + FLOW_UP] = 0;
    d_updated_water_flow_map[index + FLOW_LEFT] = 0;
    d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;

void init(float *t_height_map,
    float *w_height_map,
    float *s_height_map,
    int SIZE_X,
    int SIZE_Y)
{
    /* set vars HOST*/
    terrain_height_map = t_height_map;
    water_height_map = w_height_map;
    sediment_height_map = s_height_map;
}

void run_hydro_erosion(int cycles,
    float t_step,
    float min_tilt_angle,
    float SEDIMENT_CAP,
    float DISSOLVE_CONST,
    float DEPOSIT_CONST,
    int SIZE_X,
    int SIZE_Y,
    float PIPE_LENGTH,
    float ADJACENT_LENGTH,
    float TIME_STEP,
    float MIN_TILT_ANGLE)
{ 
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
    int SIZE = SIZE_X * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;

    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    float *temp;
    while (cycles--) {
        update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y); 
        temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;        
    }
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost)); 


    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    } 
}

Python 文件

import bpy
import numpy
import ctypes
import random

width = 4
height = 4

size_x = width
size_y = height
N = size_x * size_y

scrpt_cycles = 1
kernel_cycles = 1
time_step = 0.005 
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01

# initialize arrays
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)


# load terrain height from image
for i in range(0, len(ter_height_map)):
    ter_height_map[i] = 1


# import DLL
E = ctypes.cdll.LoadLibrary("E:/Programming/CUDA/erosion/Release/erosion_kernel.dll")

# initialize device memory
E.init( ctypes.c_void_p(ter_height_map.ctypes.data), 
        ctypes.c_void_p(water_height_map.ctypes.data),
        ctypes.c_void_p(sed_height_map.ctypes.data),
        ctypes.c_int(size_x),
        ctypes.c_int(size_y))


# run erosion
while(scrpt_cycles):
    scrpt_cycles = scrpt_cycles - 1  
    E.run_hydro_erosion(ctypes.c_int(kernel_cycles),
                        ctypes.c_float(time_step),
                        ctypes.c_float(min_tilt_angle), 
                        ctypes.c_float(sediment_cap), 
                        ctypes.c_float(dissolve_const), 
                        ctypes.c_float(deposit_const),
                        ctypes.c_int(size_x),
                        ctypes.c_int(size_y),
                        ctypes.c_float(pipe_length),
                        ctypes.c_float(adjacent_length),
                        ctypes.c_float(time_step),
                        ctypes.c_float(min_tilt_angle))

错误输出：

预期输出（在我注释掉update_water_flow之后）：

//update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

显卡：GTX460M

Answer 1

（请注意，此答案中的代码还提供了完整的 recipe/example，说明如何在与 python 应用程序共享的库中使用 CUDA 代码（例如 CUDA 设备内核）python ctypes。如果您想使用 CUDA 库功能，答案提供了一个示例，使用 python ctypes。）

这里的问题是内核越界写入，显然 compiler/runtime 将分配定位在设备内存中足够近的位置，第一次分配时超出范围导致代码写入进入第二个分配：

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

越界访问的出现是因为内核启动涉及到足够多的线程（在这种情况下它启动了 1024 个线程）而我们实际上只有 "need" SIZE_X*SIZE_Y 个线程（即本例中的 16):

#define blockSize 1024
...
int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
...
update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

这当然是 "typical" 在 CUDA 编程中，为了启动足够多的线程，但是在执行此操作时在内核中包含一个 "thread check" 很重要，以防止任何 "extra" 线程进行任何非法的越界访问。在这种情况下，一种可能的内核线程检查可能是这样的：

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

这是一个基于所提供代码的完整示例（尽管在 linux 上，并删除了 python 代码中的搅拌机依赖项），显示了前后效果。请注意，我们甚至可以运行使用 cuda-memcheck 这样的代码，在这种情况下它会指出越界访问（为清楚起见，从下面的第一个示例中省略）：

$ cat t383.cu
extern "C"
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);

extern "C"
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);

extern "C"
void free_mem();

extern "C"
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>

// includes CUDA
#include <cuda_runtime.h>

using namespace std;

#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col

// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
#define cudaCheck(error) \
  if (error != cudaSuccess) { \
    printf("Fatal error: %s at %s:%d\n", \
      cudaGetErrorString(error), \
      __FILE__, __LINE__); \
    exit(1); \
              }


__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int col = index % SIZE_X;
    int row = index / SIZE_X;

    index = row * (SIZE_X * 4) + col * 4;   // 3D index
#ifdef FIX
    if ((row >= SIZE_Y) || (col >= SIZE_X)) return;
#endif
    d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
    d_updated_water_flow_map[index + FLOW_UP] = 0;
    d_updated_water_flow_map[index + FLOW_LEFT] = 0;
    d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;

void init(float *t_height_map,
    float *w_height_map,
    float *s_height_map,
    int SIZE_X,
    int SIZE_Y)
{
    /* set vars HOST*/
    terrain_height_map = t_height_map;
    water_height_map = w_height_map;
    sediment_height_map = s_height_map;
}

void run_hydro_erosion(int cycles,
    float t_step,
    float min_tilt_angle,
    float SEDIMENT_CAP,
    float DISSOLVE_CONST,
    float DEPOSIT_CONST,
    int SIZE_X,
    int SIZE_Y,
    float PIPE_LENGTH,
    float ADJACENT_LENGTH,
    float TIME_STEP,
    float MIN_TILT_ANGLE)
{
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
    int SIZE = SIZE_X * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;

    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    float *temp;
    while (cycles--) {
        update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);
        temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;
    }
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));


    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }
}
$ cat t383.py
import numpy
import ctypes
import random

width = 4
height = 4

size_x = width
size_y = height
N = size_x * size_y

scrpt_cycles = 1
kernel_cycles = 1
time_step = 0.005
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01

# initialize arrays
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)


# load terrain height from image
for i in range(0, len(ter_height_map)):
    ter_height_map[i] = 1


# import DLL
E = ctypes.cdll.LoadLibrary("./t383.so")

# initialize device memory
E.init( ctypes.c_void_p(ter_height_map.ctypes.data),
        ctypes.c_void_p(water_height_map.ctypes.data),
        ctypes.c_void_p(sed_height_map.ctypes.data),
        ctypes.c_int(size_x),
        ctypes.c_int(size_y))


# run erosion
while(scrpt_cycles):
    scrpt_cycles = scrpt_cycles - 1
    E.run_hydro_erosion(ctypes.c_int(kernel_cycles),
                        ctypes.c_float(time_step),
                        ctypes.c_float(min_tilt_angle),
                        ctypes.c_float(sediment_cap),
                        ctypes.c_float(dissolve_const),
                        ctypes.c_float(deposit_const),
                        ctypes.c_int(size_x),
                        ctypes.c_int(size_y),
                        ctypes.c_float(pipe_length),
                        ctypes.c_float(adjacent_length),
                        ctypes.c_float(time_step),
                        ctypes.c_float(min_tilt_angle))
$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu
$ python t383.py
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, updated terrain
0, 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 
$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu -DFIX
$ cuda-memcheck python t383.py
========= CUDA-MEMCHECK
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, updated terrain
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 
========= ERROR SUMMARY: 0 errors
$

如果我们在没有修复的情况下编译前面的示例，但是运行它带有 cuda-memcheck 我们将得到指示越界访问的输出：

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu
$ cuda-memcheck python t383.py
========= CUDA-MEMCHECK
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
========= Invalid __global__ write of size 4
=========     at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x1050d6009f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t383.so [0x1c291]
=========     Host Frame:./t383.so [0x39e33]
=========     Host Frame:./t383.so [0x6879]
=========     Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]
=========     Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]
=========     Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]
=========     Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]
=========     Host Frame:python [0x167d14]
=========     Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]
=========     Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
========= Invalid __global__ write of size 4
=========     at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)
=========     by thread (30,0,0) in block (0,0,0)
=========     Address 0x1050d6009e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t383.so [0x1c291]
=========     Host Frame:./t383.so [0x39e33]
=========     Host Frame:./t383.so [0x6879]
=========     Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]
=========     Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]
=========     Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]
=========     Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]
=========     Host Frame:python [0x167d14]
=========     Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]
=========     Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
... (output truncated for brevity of presentation)
========= ERROR SUMMARY: 18 errors
$

CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）

CUDA shared memory issue (and using CUDA with python/ctypes)

c++

python

ctypes

cuda