使用共享内存在cuda内核中应用高斯掩码
Applying gaussian mask in cuda kernel using shared memory
我正在尝试完成 udacity "intro to parallel programming" 课程的作业,但我被困在第二个作业中,该作业基本上是使用 CUDA 对图像应用高斯模糊蒙版。
我想通过利用共享内存来有效地做到这一点。
我解决 "pixels at the border problem" 的想法是启动比块中实际像素数更多的线程:例如,如果我将输入图像分成大小为 16x16 的活动像素块,并且我有一个遮罩大小为 9x9 那么我的实际块尺寸将是(对于 x 和 y):16+2*(9/2) = 24。这样我在一个块中启动 24 个线程,以便 "outer"线程将仅用于将像素从输入 img 加载到共享内存,而 "inner" 线程对应于实际执行计算的活动像素(此外还在共享内存中缓存)。
由于某种原因,它不起作用。正如您从附加代码中看到的那样,我能够将像素缓存到共享内存中,但在计算过程中出现了严重错误,我附上了一张我得到的糟糕结果的图像。
__global__ void gaussian_blur(const unsigned char* const inputChannel,
unsigned char* const outputChannel,
int numRows, int numCols,
const float* const filter, const int filterWidth)
{
int filter_radius = (int)(filterWidth / 2); //getting the filter "radius"
int x = blockDim.x*blockIdx.x+threadIdx.x;
int y = blockDim.y*blockIdx.y+threadIdx.y;
if(x>=(numCols+filter_radius) || y>=(numRows+filter_radius))
return;
int px = x-filter_radius;
int py = y-filter_radius;
//clamping
if(px<0) px = 0;
if(py<0) py = 0;
//if(px>=numCols) px = numCols-1;
// if(py>=numRows) py = numRows-1;
__shared__ unsigned char tile[(16+8)*(16+8)]; //16 active pixels + 2*filter_radius
tile[threadIdx.y*24+threadIdx.x] = inputChannel[py*numCols+px];
__syncthreads();
//Here everything is working fine: if I do
// outputChannel[py*numCols+px] = tile[threadIdx.y*24+threadIdx.x];
//then I am able to see the perfect reconstruction of the input image.
//caching the filter
__shared__ float t_filter[81]; //9x9 conv mask
if(threadIdx.x==0 && threadIdx.y==0)
{
for(int i=0; i<81; i++)
t_filter[i] = filter[i];
}
__syncthreads();
//I am checking the threadIdx of the threads and I am performing the mask computation
//only to those threads that are pointing to active pixels:
//i.e. all the threads whose id is greater or equal to the filter radius,
//but smaller than the whole block of active pixels will perform the computation.
//filter_radius = filterWidth/2 = 9/2 = 4
//blockDim.x or y = 16 + filterWidth*2 = 16+8 = 24
//active pixel index limit = filter_radius+16 = 4+16 = 20
//is that correct?
if(
threadIdx.y>=filter_radius && threadIdx.x>=filter_radius &&
threadIdx.x < 20 && threadIdx.y < 20
)
{
float value = 0.0;
for(int i=-filter_radius; i<=filter_radius; i++)
for(int j=-filter_radius; j<=filter_radius; j++)
{
int fx = i+filter_radius;
int fy = j+filter_radius;
int ty = threadIdx.y+i;
int tx = threadIdx.x+j;
value += ((float)tile[ty*24+tx])*t_filter[fy*filterWidth+fx];
}
outputChannel[py*numCols+px] = (unsigned char) value;
}
输出图像:http://i.stack.imgur.com/EMu5M.png
编辑:添加内核调用:
int filter_radius = (int) (filterWidth / 2);
blockSize.x = 16 + 2*filter_radius;
blockSize.y = 16 + 2*filter_radius;
gridSize.x = numCols/16+1;
gridSize.y = numRows/16+1;
printf("\n grx %d gry %d \n", blockSize.x, blockSize.y );
gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, numRows,numCols, d_filter, filterWidth);
gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, numRows,numCols, d_filter, filterWidth);
gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, numRows,numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
blockSize.x = 32; gridSize.x = numCols/32+1;
blockSize.y = 32; gridSize.y = numRows/32+1;
// Now we recombine your results. We take care of launching this kernel for you.
//
// NOTE: This kernel launch depends on the gridSize and blockSize variables,
// which you must set yourself.
recombineChannels<<<gridSize, blockSize>>>(d_redBlurred,
d_greenBlurred,
d_blueBlurred,
d_outputImageRGBA,
numRows,
numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
编辑之二:
为了编译和 运行 所有其他必要的代码可以在这里找到:
https://github.com/udacity/cs344/tree/master/Problem%20Sets/Problem%20Set%202
上面的内核应该在 student_func.cu 文件中编码。
在您的实现中,每个块永远不会计算边界(在边缘的一个过滤器半径内)像素的模糊。这意味着您希望您的块重叠以便覆盖边界。如果您查看每个块
的 x
索引的域
int x = blockDim.x*blockIdx.x+threadIdx.x;
鉴于您上面的特定内核执行,我们将
blockIdx.x = 0: x = [0,23]
blockIdx.x = 1: x = [24,46]
... etc
如您所见,每个块都会考虑图像的独特部分,但您已告诉每个块不要在边界上计算。这意味着您的计算中忽略了每个块的边界(因此图像中的黑色网格)。
你需要用类似
的东西来计算你的指数
int x = (blockDim.x-2*filter_radius)*blockIdx.x+threadIdx.x;
以便块重叠。现在我们 x
索引的域看起来像
blockIdx.x = 0: x = [0,23]
blockIdx.x = 1: x = [16,39]
... etc
我正在尝试完成 udacity "intro to parallel programming" 课程的作业,但我被困在第二个作业中,该作业基本上是使用 CUDA 对图像应用高斯模糊蒙版。 我想通过利用共享内存来有效地做到这一点。 我解决 "pixels at the border problem" 的想法是启动比块中实际像素数更多的线程:例如,如果我将输入图像分成大小为 16x16 的活动像素块,并且我有一个遮罩大小为 9x9 那么我的实际块尺寸将是(对于 x 和 y):16+2*(9/2) = 24。这样我在一个块中启动 24 个线程,以便 "outer"线程将仅用于将像素从输入 img 加载到共享内存,而 "inner" 线程对应于实际执行计算的活动像素(此外还在共享内存中缓存)。
由于某种原因,它不起作用。正如您从附加代码中看到的那样,我能够将像素缓存到共享内存中,但在计算过程中出现了严重错误,我附上了一张我得到的糟糕结果的图像。
__global__ void gaussian_blur(const unsigned char* const inputChannel,
unsigned char* const outputChannel,
int numRows, int numCols,
const float* const filter, const int filterWidth)
{
int filter_radius = (int)(filterWidth / 2); //getting the filter "radius"
int x = blockDim.x*blockIdx.x+threadIdx.x;
int y = blockDim.y*blockIdx.y+threadIdx.y;
if(x>=(numCols+filter_radius) || y>=(numRows+filter_radius))
return;
int px = x-filter_radius;
int py = y-filter_radius;
//clamping
if(px<0) px = 0;
if(py<0) py = 0;
//if(px>=numCols) px = numCols-1;
// if(py>=numRows) py = numRows-1;
__shared__ unsigned char tile[(16+8)*(16+8)]; //16 active pixels + 2*filter_radius
tile[threadIdx.y*24+threadIdx.x] = inputChannel[py*numCols+px];
__syncthreads();
//Here everything is working fine: if I do
// outputChannel[py*numCols+px] = tile[threadIdx.y*24+threadIdx.x];
//then I am able to see the perfect reconstruction of the input image.
//caching the filter
__shared__ float t_filter[81]; //9x9 conv mask
if(threadIdx.x==0 && threadIdx.y==0)
{
for(int i=0; i<81; i++)
t_filter[i] = filter[i];
}
__syncthreads();
//I am checking the threadIdx of the threads and I am performing the mask computation
//only to those threads that are pointing to active pixels:
//i.e. all the threads whose id is greater or equal to the filter radius,
//but smaller than the whole block of active pixels will perform the computation.
//filter_radius = filterWidth/2 = 9/2 = 4
//blockDim.x or y = 16 + filterWidth*2 = 16+8 = 24
//active pixel index limit = filter_radius+16 = 4+16 = 20
//is that correct?
if(
threadIdx.y>=filter_radius && threadIdx.x>=filter_radius &&
threadIdx.x < 20 && threadIdx.y < 20
)
{
float value = 0.0;
for(int i=-filter_radius; i<=filter_radius; i++)
for(int j=-filter_radius; j<=filter_radius; j++)
{
int fx = i+filter_radius;
int fy = j+filter_radius;
int ty = threadIdx.y+i;
int tx = threadIdx.x+j;
value += ((float)tile[ty*24+tx])*t_filter[fy*filterWidth+fx];
}
outputChannel[py*numCols+px] = (unsigned char) value;
}
输出图像:http://i.stack.imgur.com/EMu5M.png
编辑:添加内核调用:
int filter_radius = (int) (filterWidth / 2);
blockSize.x = 16 + 2*filter_radius;
blockSize.y = 16 + 2*filter_radius;
gridSize.x = numCols/16+1;
gridSize.y = numRows/16+1;
printf("\n grx %d gry %d \n", blockSize.x, blockSize.y );
gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, numRows,numCols, d_filter, filterWidth);
gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, numRows,numCols, d_filter, filterWidth);
gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, numRows,numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
blockSize.x = 32; gridSize.x = numCols/32+1;
blockSize.y = 32; gridSize.y = numRows/32+1;
// Now we recombine your results. We take care of launching this kernel for you.
//
// NOTE: This kernel launch depends on the gridSize and blockSize variables,
// which you must set yourself.
recombineChannels<<<gridSize, blockSize>>>(d_redBlurred,
d_greenBlurred,
d_blueBlurred,
d_outputImageRGBA,
numRows,
numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
编辑之二:
为了编译和 运行 所有其他必要的代码可以在这里找到: https://github.com/udacity/cs344/tree/master/Problem%20Sets/Problem%20Set%202 上面的内核应该在 student_func.cu 文件中编码。
在您的实现中,每个块永远不会计算边界(在边缘的一个过滤器半径内)像素的模糊。这意味着您希望您的块重叠以便覆盖边界。如果您查看每个块
的x
索引的域
int x = blockDim.x*blockIdx.x+threadIdx.x;
鉴于您上面的特定内核执行,我们将
blockIdx.x = 0: x = [0,23]
blockIdx.x = 1: x = [24,46]
... etc
如您所见,每个块都会考虑图像的独特部分,但您已告诉每个块不要在边界上计算。这意味着您的计算中忽略了每个块的边界(因此图像中的黑色网格)。
你需要用类似
的东西来计算你的指数int x = (blockDim.x-2*filter_radius)*blockIdx.x+threadIdx.x;
以便块重叠。现在我们 x
索引的域看起来像
blockIdx.x = 0: x = [0,23]
blockIdx.x = 1: x = [16,39]
... etc