cupy variables passed to kernel ignored
I have modified a cupy example to test a simple function, but some of the variables do not seem to take the correct values. Here is the code:
import cupy as cp
import numpy as np
import sys
from cupy import prof
from timeit import default_timer as timer

_cupy_preprocessing_src = r"""
extern "C"
{
    __global__ void _cupy_preprocessing(
        const float * __restrict__ toNormalize,
        float * __restrict__ normalized,
        const int w,
        const int h,
        const float B,
        const float G,
        const float R)
    {
        const int tx { static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x) };
        const int stride { static_cast<int>(blockDim.x * gridDim.x) };

        for(int tid = tx; tid < (w * h); tid += stride)
        {
            normalized[tid] = toNormalize[tid + w * h * 2] * 255.0 - B;
            normalized[tid + w * h] = toNormalize[tid + w * h] * 255.0 - G;
            normalized[tid + w * h * 2] = toNormalize[tid] * 255.0 - R;
        }
    }
}
"""

def _preprocessing(toNorm, norm, w, h, B, G, R):
    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    threadsperblock = (128, )
    blockspergrid = (numSM * 20, )
    module = cp.RawModule(code=_cupy_preprocessing_src, options=("-std=c++11"))
    kernel = module.get_function("_cupy_preprocessing")
    kernel_args = (toNorm, norm, w, h, B, G, R)
    kernel(blockspergrid, threadsperblock, kernel_args)
    cp.cuda.runtime.deviceSynchronize()

def gpu_preprocessing(toNorm, w, h, B, G, R):
    norm = cp.empty(toNorm.shape, dtype=toNorm.dtype)
    _preprocessing(toNorm, norm, w, h, B, G, R)
    return norm

def cpu_preprocessing(toNorm, w, h, B, G, R):
    norm = np.empty(toNorm.shape, dtype=toNorm.dtype)
    for i in range(w * h):
        norm[i] = toNorm[i + w * h * 2] * 255.0 - B
        norm[i + w * h] = toNorm[i + w * h] * 255.0 - G
        norm[i + w * h * 2] = toNorm[i] * 255.0 - R
    return norm

if __name__ == "__main__":
    w = 512
    h = 512
    B = 1.0
    G = 1.0
    R = 1.0

    x = np.zeros((w * h * 3, ), dtype=np.float32)
    x[:w * h] = np.ones((w * h, ), dtype=np.float32)
    x[w * h:w * h * 2] = np.ones((w * h, ), dtype=np.float32) + 1.0
    x[w * h * 2:] = np.ones((w * h, ), dtype=np.float32) + 2.0

    d_x = cp.array(x)

    start = timer()
    cpu_ppre = cpu_preprocessing(x, w, h, B, G, R)
    end = timer()
    print("CPU time: {:f}".format(end - start))

    start = timer()
    gpu_ppre = gpu_preprocessing(d_x, w, h, B, G, R)
    end = timer()
    print("GPU time: {:f}".format(end - start))

    gpu_ppre = cp.asnumpy(gpu_ppre)

    print(cpu_ppre)
    print(gpu_ppre)
If B = G = R = 0.0, cpu_preprocessing and gpu_preprocessing return the same array. But when B, G and R are non-zero, cpu_preprocessing returns the expected values while gpu_preprocessing seems to ignore B, G and R.
Am I missing something?
Try changing the line

kernel_args = (toNorm, norm, w, h, B, G, R)

to

kernel_args = (toNorm, norm, w, h, cp.float32(B), cp.float32(G), cp.float32(R))

and see whether that fixes the result. You are passing plain Python floats, and I do not think CuPy can infer the correct bit width to convert them to.
Also, you are missing a comma (options should be a tuple): options=("-std=c++11", ).
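To make both fixes concrete, here is a sketch of the launcher with the two changes applied, reusing the _cupy_preprocessing_src string from the question. Wrapping w and h in cp.int32 is an extra precaution on my part, not something the question strictly requires; the essential changes are the cp.float32 wrappers and the trailing comma in options.

import cupy as cp

# assumes _cupy_preprocessing_src is the kernel source string from the question
def _preprocessing(toNorm, norm, w, h, B, G, R):
    numSM = cp.cuda.Device().attributes["MultiProcessorCount"]
    threadsperblock = (128, )
    blockspergrid = (numSM * 20, )
    # note the trailing comma: options must be a tuple of strings
    module = cp.RawModule(code=_cupy_preprocessing_src,
                          options=("-std=c++11", ))
    kernel = module.get_function("_cupy_preprocessing")
    # wrap every scalar so its width matches the kernel signature
    # (int -> cp.int32, float -> cp.float32); bare Python numbers may be
    # converted to 64-bit values that the kernel then misreads
    kernel_args = (toNorm, norm,
                   cp.int32(w), cp.int32(h),
                   cp.float32(B), cp.float32(G), cp.float32(R))
    kernel(blockspergrid, threadsperblock, kernel_args)
    cp.cuda.runtime.deviceSynchronize()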