使用 mex-cuda 在图像中查找局部均值
finding local mean in an image using mex-cuda
我有一个名为 HSIImage 的图像,大小为 565x585,我在其中找到了每个像素的局部均值和标准差。为此,我使用大小为 9x9 的 window W,如果我们要重新找到 x(i,j) 的平均值,我们需要 W 中的值,其中 x(i ,j) 位于其中心。
为了处理角和边缘像素,我填充了 HSIImage 并将其命名为 HSIImage2。
MATLAB 代码
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
显示为图像的序列平均值。
我用于查找均值和 sd 的 cuda 代码是
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
}
当图像充满 1 时,它工作正常。但是当我给出常规图像时,串行(MATLAB)和并行(CUDA)输出有很多差异(比较 mean1 和 mean2 时)。请告诉我错误。
我用
启动
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
TILEWIDTH 为 32。行=565,列=584。
显示为图像的平行平均值
重要的是要注意 Matlab 的 c api 是按列优先排序的,但是正如评论中提到的那样,OP 已经确保了一致性。问题是用于访问数据的步幅不包括图像的垫。从一行到另一行需要 col+8 的步长(8 是每边 4 的填充。
改变
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
到
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*(col+8)+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*(col+8)+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
应该可以,但是,我已经包含了一个可编译的示例,我在一个小样本上进行了验证,应该很容易扩展。
它没有优化,但这不是你问题的一部分。使用共享内存进行优化会大大提高性能。
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void phi(double *img, int row, int col, double *d_mean){
int X=blockDim.x*blockIdx.x+threadIdx.x+4;
int Y=blockDim.y*blockIdx.y+threadIdx.y+4;
double sum = 0;
if(Y<row+4 && X<col+4){
for(int i=-4; i<=4; ++i){
for(int j=-4; j<=4; ++j){
sum+=img[ (Y+j)*(col+8)+X+i];
}
}
sum/=81;
d_mean[(Y-4)*col+X-4]=sum;
}
}
int main(int argc, char * argv[]) {
int width=10, height=10;
double *h_img=new double[(width+8)*(height+8)];
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
h_img[i*(width+8)+j]=0.0;
}
}
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int index = (i+4)*(width+8)+j+4;
h_img[index]=i*width+j;
}
}
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
cout<<h_img[i*(width+8)+j]<<" ";
}cout<<endl;
}
double *d_img;
size_t size=sizeof(double)*(height+8)*(width*8);
cudaMalloc(&d_img, size);
cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice);
size = sizeof(double)*height*width;
double *d_avg;
cudaMalloc(&d_avg, size);
dim3 block(32, 32, 1);
dim3 grid(width/32+1, height/32+1, 1);
phi<<<grid, block>>>(d_img, height, width, d_avg);
cudaDeviceSynchronize();
double *h_avg=new double[width*height];
cudaMemcpy(h_avg, d_avg, size, cudaMemcpyDeviceToHost);
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
cout<<h_avg[i*width+j]<<" ";
}cout<<endl;
}
return 0;
}
这是我关于局部均值和局部标准差的 2 美分。
您应该检查使用 matlab 的优化内置函数(conv2
和 stdfilt
以及它们的 gpu 支持)是否比 "simple" mex 版本提供更好的性能。例如,要取局部均值,最快的方法是使用 conv2
,如下所示:
local_mean_image=conv2(image,normalized_window,'same');
你的情况 normalized_window=ones(9)./9^2;
对于本地标准使用 stdfilt :
local_std_image = stdfilt(image, ones(9));
两个选项都可用于更快的 GPU 性能,我经常使用 conv2
和 Jacket
,我看到 stdfilt
支持 gpuarray 变量。
通过观察@Christian Sarofeen 和@bla 的回答,我对我的代码做了一些更改,现在我能够找到与 MATLAB 完全相同的均值。我发布了这个想法,认为将来有人可能会使用它(我正在发送来自 MATLAB 的图像)。 仍然找到标准偏差是小问题。
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
int index = (X-4)*row+Y-4;
for(i=-4;i<=4;i++){
for(j=-4;j<=4;j++){
sum= sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];
}
}
d_mean[index] = sum/81;
double mean = 0;
double temp_std[9] = {0} ;
for(j=-4;j<=4;j++){
sum = 0;
for(i=-4;i<=4;i++){
sum = sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];//vector mean
}
mean = sum/9;
sum =0 ;
for(i=-4;i<=4;i++){
int index = (X+j)*(row+8)+(Y+i);
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
temp_std[j+4] = (sqrt(sum/9));//vector std
}
sum =0 ;
for(j=-4;j<=4;j++){
sum = sum + temp_std[j+4];//mean of vectors
}
mean = sum/9;
sum = 0 ;
for(j=-4;j<=4;j++){
sum = sum + (temp_std[j+4]-mean) * (temp_std[j+4]-mean);
}
d_std[index] = sqrt(sum);//std of vectors
d_phi_x[index] = 1.0/(1.0+exp((d_mean[index]-d_HSIImage[index])/d_std[index]));
}
}
我有一个名为 HSIImage 的图像,大小为 565x585,我在其中找到了每个像素的局部均值和标准差。为此,我使用大小为 9x9 的 window W,如果我们要重新找到 x(i,j) 的平均值,我们需要 W 中的值,其中 x(i ,j) 位于其中心。
为了处理角和边缘像素,我填充了 HSIImage 并将其命名为 HSIImage2。
MATLAB 代码
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
显示为图像的序列平均值。
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
}
当图像充满 1 时,它工作正常。但是当我给出常规图像时,串行(MATLAB)和并行(CUDA)输出有很多差异(比较 mean1 和 mean2 时)。请告诉我错误。
我用
启动dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
TILEWIDTH 为 32。行=565,列=584。
显示为图像的平行平均值
重要的是要注意 Matlab 的 c api 是按列优先排序的,但是正如评论中提到的那样,OP 已经确保了一致性。问题是用于访问数据的步幅不包括图像的垫。从一行到另一行需要 col+8 的步长(8 是每边 4 的填充。
改变
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
到
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*(col+8)+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*(col+8)+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
应该可以,但是,我已经包含了一个可编译的示例,我在一个小样本上进行了验证,应该很容易扩展。
它没有优化,但这不是你问题的一部分。使用共享内存进行优化会大大提高性能。
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void phi(double *img, int row, int col, double *d_mean){
int X=blockDim.x*blockIdx.x+threadIdx.x+4;
int Y=blockDim.y*blockIdx.y+threadIdx.y+4;
double sum = 0;
if(Y<row+4 && X<col+4){
for(int i=-4; i<=4; ++i){
for(int j=-4; j<=4; ++j){
sum+=img[ (Y+j)*(col+8)+X+i];
}
}
sum/=81;
d_mean[(Y-4)*col+X-4]=sum;
}
}
int main(int argc, char * argv[]) {
int width=10, height=10;
double *h_img=new double[(width+8)*(height+8)];
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
h_img[i*(width+8)+j]=0.0;
}
}
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int index = (i+4)*(width+8)+j+4;
h_img[index]=i*width+j;
}
}
for(int i=0; i<height+8; i++){
for(int j=0; j<width+8; j++){
cout<<h_img[i*(width+8)+j]<<" ";
}cout<<endl;
}
double *d_img;
size_t size=sizeof(double)*(height+8)*(width*8);
cudaMalloc(&d_img, size);
cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice);
size = sizeof(double)*height*width;
double *d_avg;
cudaMalloc(&d_avg, size);
dim3 block(32, 32, 1);
dim3 grid(width/32+1, height/32+1, 1);
phi<<<grid, block>>>(d_img, height, width, d_avg);
cudaDeviceSynchronize();
double *h_avg=new double[width*height];
cudaMemcpy(h_avg, d_avg, size, cudaMemcpyDeviceToHost);
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
cout<<h_avg[i*width+j]<<" ";
}cout<<endl;
}
return 0;
}
这是我关于局部均值和局部标准差的 2 美分。
您应该检查使用 matlab 的优化内置函数(conv2
和 stdfilt
以及它们的 gpu 支持)是否比 "simple" mex 版本提供更好的性能。例如,要取局部均值,最快的方法是使用 conv2
,如下所示:
local_mean_image=conv2(image,normalized_window,'same');
你的情况 normalized_window=ones(9)./9^2;
对于本地标准使用 stdfilt :
local_std_image = stdfilt(image, ones(9));
两个选项都可用于更快的 GPU 性能,我经常使用 conv2
和 Jacket
,我看到 stdfilt
支持 gpuarray 变量。
通过观察@Christian Sarofeen 和@bla 的回答,我对我的代码做了一些更改,现在我能够找到与 MATLAB 完全相同的均值。我发布了这个想法,认为将来有人可能会使用它(我正在发送来自 MATLAB 的图像)。 仍然找到标准偏差是小问题。
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
int index = (X-4)*row+Y-4;
for(i=-4;i<=4;i++){
for(j=-4;j<=4;j++){
sum= sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];
}
}
d_mean[index] = sum/81;
double mean = 0;
double temp_std[9] = {0} ;
for(j=-4;j<=4;j++){
sum = 0;
for(i=-4;i<=4;i++){
sum = sum + d_HSIImage2[(X+j)*(row+8)+(Y+i)];//vector mean
}
mean = sum/9;
sum =0 ;
for(i=-4;i<=4;i++){
int index = (X+j)*(row+8)+(Y+i);
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
temp_std[j+4] = (sqrt(sum/9));//vector std
}
sum =0 ;
for(j=-4;j<=4;j++){
sum = sum + temp_std[j+4];//mean of vectors
}
mean = sum/9;
sum = 0 ;
for(j=-4;j<=4;j++){
sum = sum + (temp_std[j+4]-mean) * (temp_std[j+4]-mean);
}
d_std[index] = sqrt(sum);//std of vectors
d_phi_x[index] = 1.0/(1.0+exp((d_mean[index]-d_HSIImage[index])/d_std[index]));
}
}