PyCUDA：从主机传递到设备的值不正确

Question

我想用 PyCUDA 编写一个内核来生成二维高斯图块（Gaussian patch）。但是，我在主机端定义的值在复制到设备之后发生了变化。代码如下。

import numpy as np
import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# CUDA kernel source, handed to SourceModule when this statement runs.
# gaussian2D declares `output` as a float pointer, while x, y, sigma, n_rows
# and n_cols are plain by-value parameters (float / int).
# Each thread derives a column index i and a row index j from its thread and
# block indices and prints n_cols (debug output). Inside the bounds check only
# the flat index idx is computed; nothing is written to `output`.
kernel = SourceModule("""
#include <stdio.h>
__global__ void gaussian2D(float *output, float x, float y, float sigma, int 
n_rows, int n_cols)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
printf("%d ", n_cols);
if (i < n_cols && j < n_rows) {
   size_t idx = j*n_cols +i;
//printf("%d ", idx);
}
}
""")
# host code
def gpu_gaussian2D(point, sigma, shape):
    """Launch the gaussian2D kernel over a grid covering `shape`.

    Args:
        point: Pair (x, y); both values are converted to float32.
        sigma: Scalar converted to float32.
        shape: Pair (n_rows, n_cols) describing the patch size.

    Returns:
        The float32 host array of shape (1, n_rows * n_cols) that was
        passed to the kernel through cuda.Out.
    """
    # Convert parameters into NumPy scalars
    x, y = np.array(point, dtype=np.float32)
    sigma = np.float32(sigma)
    # NOTE(review): np.int is an alias of the builtin int (deprecated in
    # NumPy 1.20, removed in 1.24); the kernel declares 32-bit `int`.
    n_rows, n_cols = np.array(shape, dtype=np.int)
    print(n_rows)
    output = np.empty((1, shape[0]*shape[1]), dtype= np.float32)
    # Get kernel function
    gaussian2D = kernel.get_function("gaussian2D")
    # Define block, grid and compute
    blockDim = (32, 32, 1) # 1024 threads in total
    # Number of blocks per axis, rounded up so a partial block covers the
    # remainder (columns map to the x axis, rows to the y axis).
    dx, mx = divmod(shape[1], blockDim[0])
    dy, my = divmod(shape[0], blockDim[1])
    gridDim = ((dx + (mx>0)), (dy + (my>0)), 1)
    # Kernel function
    # NOTE(review): every scalar below is wrapped in cuda.In, although the
    # kernel declares x, y, sigma, n_rows and n_cols as by-value parameters
    # rather than pointers. cuda.In/cuda.Out are argument handlers for host
    # buffers bound to pointer parameters, so only `output` matches the
    # kernel signature here. This is the call for which the printed n_cols
    # is reported as wrong.
    gaussian2D (
        cuda.Out(output), cuda.In(x), cuda.In(y), cuda.In(sigma), 
        cuda.In(n_rows), cuda.In(n_cols),
        block=blockDim, grid=gridDim)
    return output

# Example call: point (5, 5), sigma 3.0, shape (10, 10) = (n_rows, n_cols).
point = (5, 5)
sigma = 3.0
shape = (10, 10)
result = gpu_gaussian2D(point, sigma, shape)

检查打印出来的 n_cols 后发现，它并不是预期的 10。有人能帮帮我吗？我不知道这里出了什么问题。

Answer 1

.In() 和 .Out() 仅用于通过内核的指针参数传递的缓冲区（因此在这里只适用于 output）。普通的按值传递的参数可以直接传入，无需包装：

$ cat t7.py
import numpy as np
# import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# CUDA kernel source, handed to SourceModule when this statement runs.
# gaussian2D declares `output` as a float pointer, while x, y, sigma, n_rows
# and n_cols are plain by-value parameters (float / int).
# Each thread derives a column index i and a row index j from its thread and
# block indices and prints n_cols (debug output). Inside the bounds check only
# the flat index idx is computed; nothing is written to `output`.
kernel = SourceModule("""
#include <stdio.h>
__global__ void gaussian2D(float *output, float x, float y, float sigma, int
n_rows, int n_cols)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
printf("%d ", n_cols);
if (i < n_cols && j < n_rows) {
   size_t idx = j*n_cols +i;
//printf("%d ", idx);
}
}
""")
# host code
def gpu_gaussian2D(point, sigma, shape):
    """Launch the gaussian2D kernel over a grid covering `shape`.

    Args:
        point: Pair (x, y); both values are converted to float32.
        sigma: Scalar converted to float32.
        shape: Pair (n_rows, n_cols) describing the patch size.

    Returns:
        The float32 host array of shape (1, n_rows * n_cols) that was
        passed to the kernel through cuda.Out.
    """
    # By-value kernel arguments are passed as sized NumPy scalars whose
    # dtype must match the kernel's declared parameter type:
    # float32 for `float`, int32 for `int`.
    x, y = np.array(point, dtype=np.float32)
    sigma = np.float32(sigma)
    # np.int was only an alias of the builtin int (removed in NumPy 1.24)
    # and is 64-bit on most platforms; the kernel expects a 32-bit int.
    n_rows, n_cols = np.array(shape, dtype=np.int32)
    print(n_rows)
    output = np.empty((1, shape[0] * shape[1]), dtype=np.float32)

    gaussian2D = kernel.get_function("gaussian2D")

    # 32 x 32 = 1024 threads per block. The grid size is the ceiling of
    # extent / block size, so a partial block covers any remainder
    # (columns map to the x axis, rows to the y axis).
    block_dim = (32, 32, 1)
    grid_dim = (
        -(-shape[1] // block_dim[0]),
        -(-shape[0] // block_dim[1]),
        1,
    )

    # Only `output` is a buffer bound to a pointer parameter, so it is the
    # only argument wrapped in an argument handler (cuda.Out); the scalars
    # are passed directly.
    gaussian2D(
        cuda.Out(output), x, y, sigma,
        n_rows, n_cols,
        block=block_dim, grid=grid_dim)
    return output

# Example call: point (5, 5), sigma 3.0, shape (10, 10) = (n_rows, n_cols).
point = (5, 5)
sigma = 3.0
shape = (10, 10)
result = gpu_gaussian2D(point, sigma, shape)
$ python t7.py
10
10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 
10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10

PyCUDA：从主机传递到设备的值不正确

PyCUDA value from host to device not get the correct value

python

cuda

gpu

pycuda