What's the correct way to assign values to one GPU memory buffer from another GPU memory buffer, with some arithmetic on each source element?
I'm new to GPU programming with the CUDA toolkit, and I have to write some code that provides the functionality I mentioned in the title.
I'd like to paste the code to show exactly what I want to do.
void CTrtModelWrapper::forward(void **bindings,
                               unsigned height,
                               unsigned width,
                               short channel,
                               ColorSpaceFmt colorFmt,
                               PixelDataType pixelType) {
    uint16_t *devInRawBuffer_ptr = (uint16_t *) bindings[0];
    uint16_t *devOutRawBuffer_ptr = (uint16_t *) bindings[1];
    const unsigned short bit = 16;
    float *devInputBuffer_ptr = nullptr;
    float *devOutputBuffer_ptr = nullptr;
    unsigned volume = height * width * channel;
    common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
    common::cudaCheck(cudaMalloc((void **) &devOutputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
    unsigned short npos = 0;
    switch (pixelType) {
        case PixelDataType::PDT_INT8: // high 8bit
            npos = bit - 8;
            break;
        case PixelDataType::PDT_INT10: // high 10bit
            npos = bit - 10;
            break;
        default:
            break;
    }
    switch (colorFmt) {
        case CFMT_RGB: {
            for (unsigned i = 0; i < volume; ++i) {
                devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
            }
        } break;
        default:
            break;
    }
    void *rtBindings[2] = {devInputBuffer_ptr, devOutputBuffer_ptr};
    // forward
    this->_forward(rtBindings);
    // convert output
    unsigned short ef_bit = bit - npos;
    switch (colorFmt) {
        case CFMT_RGB: {
            for (unsigned i = 0; i < volume; ++i) {
                devOutRawBuffer_ptr[i] = clip<uint16_t>((uint16_t) devOutputBuffer_ptr[i],
                                                        0,
                                                        (uint16_t) pow(2, ef_bit)) << npos;
            }
        } break;
        default:
            break;
    }
}
bindings is a pointer to an array. The first element of the array is a device pointer to a buffer allocated on the GPU with cudaMalloc, and each element of that buffer is a 16-bit integer. The second element is the same kind of pointer and is used to hold the output data.
height, width, channel, colorFmt (RGB here) and pixelType (PDT_INT8, i.e. 8-bit) correspond to the image's height, width, number of channels, color space, and the number of bits used to store one pixel value, respectively.
The _forward function expects a pointer to an array similar to bindings, except that each element of the buffers should be a 32-bit float.
So I use a loop to do some conversion:
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); // SEGMENTATION Fault at this line
}
The >> operation is there because the actual 8-bit data is stored in the high 8 bits of each 16-bit value. (For PDT_INT8, npos is 16 - 8 = 8, so a raw value such as 0xAB00 becomes 0xAB after the shift.)
The SEGMENTATION FAULT happens at the line devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos); and i is equal to 0.
I tried splitting that line into several lines:
uint16_t value = devInRawBuffer_ptr[i];
float transferd = float(value >> npos);
devInputBuffer_ptr[i] = transferd;
and the SEGMENTATION FAULT occurred at the line uint16_t value = devInRawBuffer_ptr[i];
I'd like to know: is this a valid way to assign values to an allocated GPU memory buffer?
PS: The buffers passed in via bindings are perfectly fine. They were filled from host memory with cudaMemcpy before forward was called, but I'll paste that code below anyway:
nvinfer1::DataType type = nvinfer1::DataType::kHALF;
HostBuffer hostInputBuffer(volume, type);
DeviceBuffer deviceInputBuffer(volume, type);
HostBuffer hostOutputBuffer(volume, type);
DeviceBuffer deviceOutputBuffer(volume, type);
// HxWxC --> WxHxC
auto *hostInputDataBuffer = static_cast<unsigned short *>(hostInputBuffer.data());
for (unsigned w = 0; w < W; ++w) {
    for (unsigned h = 0; h < H; ++h) {
        for (unsigned c = 0; c < C; ++c) {
            hostInputDataBuffer[w * H * C + h * C + c] =
                (unsigned short) (*(ppm.buffer.get() + h * W * C + w * C + c));
        }
    }
}
auto ret = cudaMemcpy(deviceInputBuffer.data(), hostInputBuffer.data(), volume * getElementSize(type),
                      cudaMemcpyHostToDevice);
if (ret != 0) {
    std::cout << "CUDA failure: " << ret << std::endl;
    return EXIT_FAILURE;
}
void *bindings[2] = {deviceInputBuffer.data(), deviceOutputBuffer.data()};
model->forward(bindings, H, W, C, sbsisr::ColorSpaceFmt::CFMT_RGB, sbsisr::PixelDataType::PDT_INT8);
In CUDA, it's generally not advisable to dereference a device pointer in host code. For example, you are creating a "device pointer" when you use cudaMalloc:
common::cudaCheck(cudaMalloc((void **) &devInputBuffer_ptr, volume * getElementSize(nvinfer1::DataType::kFLOAT)));
From the code you have posted it isn't possible to deduce that for devInRawBuffer_ptr, but I'll assume it is also a device pointer.
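As an aside: if you ever need to look at the contents of a device buffer from host code (for example, while debugging), the supported route is to copy the data back to the host first. A minimal sketch:

uint16_t first_element;
cudaMemcpy(&first_element, devInRawBuffer_ptr, sizeof(first_element), cudaMemcpyDeviceToHost);

Dereferencing the device pointer directly in host code, as your loops do, is what produces the segmentation fault.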
In that case, to perform this operation:
for (unsigned i = 0; i < volume; ++i) {
devInputBuffer_ptr[i] = float((devInRawBuffer_ptr[i]) >> npos);
}
you would launch a CUDA kernel, something like this:
// put this function definition at file scope
__global__ void shift_kernel(float *dst, uint16_t *src, size_t sz, unsigned short npos) {
    // grid-stride loop: each thread strides through the buffer
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < sz; idx += gridDim.x * blockDim.x)
        dst[idx] = (float) (src[idx] >> npos);
}
// call it like this in your code:
shift_kernel<<<160, 1024>>>(devInputBuffer_ptr, devInRawBuffer_ptr, volume, npos);
(coded in browser, not tested)
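Since kernel launches are asynchronous and don't return a status themselves, it may also be worth checking for errors right after the launch. A minimal sketch, reusing the cudaCheck helper from your code (assuming it accepts the cudaError_t these runtime calls return, as its use with cudaMalloc suggests):

common::cudaCheck(cudaGetLastError());      // reports launch-configuration errors
common::cudaCheck(cudaDeviceSynchronize()); // reports errors raised while the kernel ran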
If you'd like to learn more about what's going on here, you may wish to study CUDA. For example, you can get most of the basic concepts here and by studying the CUDA sample code vectorAdd. The grid-stride loop is discussed here.
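Note that the output-conversion loop in your forward (the one applying clip and << to devOutRawBuffer_ptr) dereferences device pointers in exactly the same way, so it needs the same treatment. A sketch of a possible counterpart kernel, assuming clip<T>(v, lo, hi) simply clamps v to [lo, hi] (the clamp is written out explicitly here, since the original clip helper is host code):

__global__ void clip_shift_kernel(uint16_t *dst, float *src, size_t sz, unsigned short npos, uint16_t maxval) {
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < sz; idx += gridDim.x * blockDim.x) {
        uint16_t v = (uint16_t) src[idx];
        if (v > maxval) v = maxval;        // clamp to [0, maxval]
        dst[idx] = (uint16_t) (v << npos); // move the payload back into the high bits
    }
}
// mirroring your host loop, maxval would be (uint16_t) pow(2, ef_bit):
clip_shift_kernel<<<160, 1024>>>(devOutRawBuffer_ptr, devOutputBuffer_ptr, volume, npos, (uint16_t) pow(2, ef_bit));

(also coded in browser, not tested)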