CUDA only processed half of total columns in an OpenCV 16-bit greyscale Mat
I'm making a beginner CUDA program which basically performs downsampling of an input greyscale image using OpenCV. When tested, it works fine on 8-bit greyscale images, but when a 16-bit greyscale image is given as input it produces a noisy reduced image whose right half is blank. Below is the code I wrote.
Sample input and output images are provided.
My main.cpp code, where the image is loaded into a Mat:
#include <opencv2/opencv.hpp>
#include <string>

void binFilter(const cv::Mat input, std::string output_file);

int main()
{
    cv::Mat im1 = cv::imread("test.png", -1);
    std::string output_file = "resultout.png";
    binFilter(im1, output_file);
    return 0;
}
My CUDA kernel code:
__global__ void binCUDAKernel(unsigned char *input, unsigned char *output, int binDim, int outputWidth, int outputHeight, int inputWstep, int outputWstep, int nChannels)
{
    int outXind = blockIdx.x * blockDim.x + threadIdx.x;
    int outYind = blockIdx.y * blockDim.y + threadIdx.y;
    if ((outXind < outputWidth) && (outYind < outputHeight)) // Only run threads in output image coordinate range
    {
        if (nChannels == 1) // Test only for greyscale images
        {
            // Calculate x & y index of input binned pixels corresponding to current output pixel
            int inXstart = outXind * binDim;
            int inYstart = outYind * binDim;
            // Perform binning on identified input pixels
            float sum = 0;
            for (int binY = inYstart; binY < (inYstart + binDim); binY++) {
                for (int binX = inXstart; binX < (inXstart + binDim); binX++) {
                    int input_tid = binY * inputWstep + binX;
                    sum += input[input_tid];
                }
            }
            // Establish output thread index in current output pixel index
            int output_tid = outYind * outputWstep + outXind;
            // Assign binned pixel value to output pixel
            output[output_tid] = static_cast<unsigned short>(sum / (binDim*binDim));
        }
    }
}
My CPU code:
void binFilter(const cv::Mat input, std::string output_file)
{
    // 2X2 binning
    int binDim = 2;
    // Create blank output image & calculate size of input and output
    cv::Size outsize(input.size().width / binDim, input.size().height / binDim);
    cv::Mat output(outsize, input.type());
    const int inputBytes = input.step * input.rows;
    const int outputBytes = output.step * output.rows;
    // Allocate memory in device
    unsigned char *d_input, *d_output;
    gpuErrchk(cudaMalloc<unsigned char>(&d_input, inputBytes));
    gpuErrchk(cudaMalloc<unsigned char>(&d_output, outputBytes));
    // Copy input image to device
    gpuErrchk(cudaMemcpy(d_input, input.ptr(), inputBytes, cudaMemcpyHostToDevice));
    // Configure size of block and grid
    const dim3 block(16, 16);
    const dim3 grid((output.cols + block.x - 1) / block.x, (output.rows + block.y - 1) / block.y); // Additional block for rounding up
    // Execute kernel
    binCUDAKernel <<<grid, block>>> (d_input, d_output, binDim, output.cols, output.rows, input.step, output.step, input.channels());
    gpuErrchk(cudaPeekAtLastError());
    // Wait for all threads to finish
    //gpuErrchk(cudaDeviceSynchronize());
    // Copy output image from device back to host (cudaMemcpy is a blocking instruction)
    gpuErrchk(cudaMemcpy(output.ptr(), d_output, outputBytes, cudaMemcpyDeviceToHost));
    // Free device memory
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));
    // Write image to specified output_file path
    cv::imwrite(output_file, output);
}
I suspect it is some kind of data type mismatch, but I can't figure it out.
First of all, to process 16-bit images, the pixel data has to be interpreted as a 16-bit wide data type, either unsigned short or short. Keep in mind that we only need to interpret the image data as unsigned short; not to convert (cast the values of) it. To do so, we simply reinterpret the image data pointer to the required type, as in the following example:
unsigned short* ptr16 = reinterpret_cast<unsigned short*>(im1.ptr());
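By contrast, converting would change the stored pixel values themselves. A minimal sketch of the difference, assuming im1 is the CV_16UC1 Mat loaded in main.cpp (the scaling factor is just illustrative):

// Interpreting: the same bytes, viewed as 16-bit elements; no pixel values change
unsigned short* ptr16 = reinterpret_cast<unsigned short*>(im1.ptr());
unsigned short firstPixel = ptr16[0];   // reads the two bytes of pixel (0,0) as one value

// Converting: allocates a new Mat and rescales every value to 8 bits (not what we want here)
cv::Mat im8;
im1.convertTo(im8, CV_8U, 255.0 / 65535.0);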
As a consequence of the above step, we also have to create a separate kernel for the 16-bit data type. We can do that neatly by defining the kernel as a C++ template, so the kernel may look like this:
template<typename T>
__global__ void binCUDAKernel(T *input, T *output, int binDim, int outputWidth, int outputHeight, int inputWstep, int outputWstep, int nChannels)
{
    int outXind = blockIdx.x * blockDim.x + threadIdx.x;
    int outYind = blockIdx.y * blockDim.y + threadIdx.y;
    if ((outXind < outputWidth) && (outYind < outputHeight)) // Only run threads in output image coordinate range
    {
        if (nChannels == 1) // Test only for greyscale images
        {
            // Calculate x & y index of input binned pixels corresponding to current output pixel
            int inXstart = outXind * binDim;
            int inYstart = outYind * binDim;
            // Perform binning on identified input pixels
            float sum = 0;
            for (int binY = inYstart; binY < (inYstart + binDim); binY++) {
                for (int binX = inXstart; binX < (inXstart + binDim); binX++) {
                    int input_tid = binY * inputWstep + binX;
                    sum += float(input[input_tid]);
                }
            }
            // Establish output thread index in current output pixel index
            int output_tid = outYind * outputWstep + outXind;
            // Assign binned pixel value to output pixel
            output[output_tid] = static_cast<T>(sum / (binDim*binDim));
        }
    }
}
Another important issue when processing an OpenCV Mat with a custom CUDA kernel is that the image step has to be divided by the size of the data type in bytes. For a 16-bit image, a single pixel is 16 bits (2 bytes) wide, so the step used inside the kernel must be divided by 2. Keep in mind that the original step should not be modified; only the step value passed as the kernel argument should be divided.
Incorporating the above fixes, the final CPU code may look like this:
void binFilter(const cv::Mat input, std::string output_file)
{
    // 2X2 binning
    int binDim = 2;
    // Create blank output image & calculate size of input and output
    cv::Size outsize(input.size().width / binDim, input.size().height / binDim);
    cv::Mat output(outsize, input.type());
    const int inputBytes = input.step * input.rows;
    const int outputBytes = output.step * output.rows;
    // Allocate memory in device
    unsigned char *d_input, *d_output;
    gpuErrchk(cudaMalloc<unsigned char>(&d_input, inputBytes));
    gpuErrchk(cudaMalloc<unsigned char>(&d_output, outputBytes));
    // Copy input image to device
    gpuErrchk(cudaMemcpy(d_input, input.ptr(), inputBytes, cudaMemcpyHostToDevice));
    // Configure size of block and grid
    const dim3 block(16, 16);
    const dim3 grid((output.cols + block.x - 1) / block.x, (output.rows + block.y - 1) / block.y); // Additional block for rounding up
    int depth = input.depth();
    // Execute kernel
    if (input.depth() == CV_16U)
    {
        typedef unsigned short t16;
        t16* input16 = reinterpret_cast<t16*>(d_input);
        t16* output16 = reinterpret_cast<t16*>(d_output);
        int inputStep16 = input.step / sizeof(t16);
        int outputStep16 = output.step / sizeof(t16);
        binCUDAKernel <t16> <<<grid, block>>> (input16, output16, binDim, output.cols, output.rows, inputStep16, outputStep16, input.channels());
    }
    else
    {
        binCUDAKernel <unsigned char> <<<grid, block>>> (d_input, d_output, binDim, output.cols, output.rows, input.step, output.step, input.channels());
    }
    gpuErrchk(cudaPeekAtLastError());
    // Wait for all threads to finish
    //gpuErrchk(cudaDeviceSynchronize());
    // Copy output image from device back to host (cudaMemcpy is a blocking instruction)
    gpuErrchk(cudaMemcpy(output.ptr(), d_output, outputBytes, cudaMemcpyDeviceToHost));
    // Free device memory
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));
    // Write image to specified output_file path
    cv::imwrite(output_file, output);
}
The noise in the output image appears to be aliasing caused by the logic of the binning algorithm. For example, it looks very similar to resampling the image with the nearest-neighbour method.
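One way to check that impression on the host is to downsample the same input with OpenCV itself and compare the area-averaged and nearest-neighbour variants side by side; a quick sketch (the output file names are just placeholders):

cv::Mat im1 = cv::imread("test.png", -1);
cv::Mat areaDown, nearestDown;
// INTER_AREA averages the source pixels covered by each output pixel, i.e. the same idea as 2x2 binning
cv::resize(im1, areaDown, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
// INTER_NEAREST picks a single source pixel per output pixel, which produces the aliased look
cv::resize(im1, nearestDown, cv::Size(), 0.5, 0.5, cv::INTER_NEAREST);
cv::imwrite("area_down.png", areaDown);
cv::imwrite("nearest_down.png", nearestDown);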
UPDATE:
The approach for calculating the pixel memory address mentioned above is not documented anywhere and is just the result of intuition, so it may look a bit unconventional. Another approach, used by OpenCV and other libraries, avoids the mess of dividing the image step. Given the x and y indices of a pixel, it proceeds as follows:
- Reinterpret the image data pointer as its byte representation (unsigned char*).
- Calculate the start address of the image row using the y index and the image step.
- Reinterpret the row start address as the required type (unsigned short*).
- Index the row start pointer with the x index.
Using this approach, we can calculate the pixel memory address of a greyscale image as follows:
template<typename T>
T* getPixelAddress(unsigned char* data, int x, int y, int step)
{
    T* row = (T*)((unsigned char*)(data) + y * step);
    return row + x;
}
In the above approach, the step value is the original one; it is not divided.
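To illustrate how such a helper could be used on the device with the unmodified byte step, a possible sketch of the binning kernel written around it is shown below. The names getPixelAddressDev and binCUDAKernelRowPtr are hypothetical, and the helper needs the __device__ qualifier to be callable from a kernel:

// Hypothetical device-side variant of the helper above
template<typename T>
__device__ T* getPixelAddressDev(unsigned char* data, int x, int y, int stepBytes)
{
    // stepBytes is the original Mat step in bytes; no division by sizeof(T) anywhere
    return reinterpret_cast<T*>(data + y * stepBytes) + x;
}

template<typename T>
__global__ void binCUDAKernelRowPtr(unsigned char* input, unsigned char* output, int binDim,
                                    int outputWidth, int outputHeight,
                                    int inputStepBytes, int outputStepBytes)
{
    int outX = blockIdx.x * blockDim.x + threadIdx.x;
    int outY = blockIdx.y * blockDim.y + threadIdx.y;
    if (outX < outputWidth && outY < outputHeight)
    {
        // Average the binDim x binDim block of input pixels covered by this output pixel
        float sum = 0;
        for (int binY = outY * binDim; binY < outY * binDim + binDim; binY++)
            for (int binX = outX * binDim; binX < outX * binDim + binDim; binX++)
                sum += float(*getPixelAddressDev<T>(input, binX, binY, inputStepBytes));
        *getPixelAddressDev<T>(output, outX, outY, outputStepBytes) = static_cast<T>(sum / (binDim * binDim));
    }
}

With this variant the launch site simply passes input.step and output.step unchanged, e.g. binCUDAKernelRowPtr<unsigned short><<<grid, block>>>(d_input, d_output, binDim, output.cols, output.rows, input.step, output.step);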