OpenGL 着色器与 CUDA
OpenGL Shader vs CUDA
我在我的 OpenGL 程序中使用 this FXAA 着色器来消除锯齿。现在我在 CUDA 中重新实现了这段代码并进行了测试。生成的图像是相同的,但 CUDA 版本要慢得多。 (着色器在垂直同步时以 60 FPS 运行,而 CUDA 下降到 ~40 FPS)
这是 CUDA 代码:
__device__ uchar4 readChar(int x, int y){
return surf2Dread<uchar4>( surfaceRead, (x)*sizeof(uchar4), (y),cudaBoundaryModeClamp);
}
__device__ uchar4 readFloatBilin2(float x, float y){
int x1 = floor(x);
int y1 = floor(y);
uchar4 z11 = readChar(x1,y1);
uchar4 z12 = readChar(x1,y1+1);
uchar4 z21 = readChar(x1+1,y1);
uchar4 z22 = readChar(x1+1,y1+1);
float u_ratio = x - x1;
float v_ratio = y - y1;
float u_opposite = 1 - u_ratio;
float v_opposite = 1 - v_ratio;
uchar4 result = (z11 * u_opposite + z21 * u_ratio) * v_opposite +
(z12 * u_opposite + z22 * u_ratio) * v_ratio;
return result;
}
__device__ float fluma(const uchar4 &c){
return c.x*0.299 * (1.0/255) + c.y *0.587 * (1.0/255) + c.z*0.114 * (1.0/255);
}
__global__ void filter_fxaa_opt(TextureData data)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= data.w || y >= data.h)
{
return;
}
uchar4 out_color;
const float FXAA_SPAN_MAX = 8.0;
const float FXAA_REDUCE_MUL = 1.0/8.0;
const float FXAA_REDUCE_MIN = (1.0/128.0);
float lumaNW = fluma(readChar(x-1,y-1));
float lumaNE = fluma(readChar(x+1,y-1));
float lumaSW = fluma(readChar(x-1,y+1));
float lumaSE = fluma(readChar(x+1,y+1));
float lumaM = fluma(readChar(x,y));
float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));
float2 dir;
dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));
float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25 * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);
float rcpDirMin = 1.0/(min(abs(dir.x), abs(dir.y)) + dirReduce);
// float2 test = dir * rcpDirMin;
dir = clamp(dir * rcpDirMin,-FXAA_SPAN_MAX,FXAA_SPAN_MAX);
uchar4 rgbA = (
readFloatBilin2(x+ dir.x * (1.0/3.0 - 0.5),y+ dir.y * (1.0/3.0 - 0.5))*0.5f+
readFloatBilin2(x+ dir.x * (2.0/3.0 - 0.5),y+ dir.y * (2.0/3.0 - 0.5))*0.5f);
uchar4 rgbB = rgbA * (1.0/2.0) + (
readFloatBilin2(x+ dir.x * (0.0/3.0 - 0.5),y+ dir.y * (0.0/3.0 - 0.5))*0.25f+
readFloatBilin2(x+ dir.x * (3.0/3.0 - 0.5),y+ dir.y * (3.0/3.0 - 0.5))*0.25f);
float lumaB = fluma(rgbB);
if((lumaB < lumaMin) || (lumaB > lumaMax)){
out_color=rgbA;
} else {
out_color=rgbB;
}
surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
}
设置:
//called for the 'src' and 'dst' texture once at the beginning
checked_cuda( cudaGraphicsGLRegisterImage(&res, gl_buffer,gl_target, cudaGraphicsRegisterFlagsSurfaceLoadStore));
//called for the 'src' and 'dst' texture every frame
checked_cuda( cudaGraphicsMapResources(1, &res, 0));
checked_cuda( cudaGraphicsSubResourceGetMappedArray(&array, res, 0,0));
//kernel call every frame
dim3 block_size(8, 8);
dim3 grid_size;
grid_size.x = (src->w) / (block_size.x) ;
grid_size.y = (src->h) / (block_size.y) ;
checked_cuda(cudaBindSurfaceToArray(surfaceRead, (cudaArray *)src->d_data));
checked_cuda(cudaBindSurfaceToArray(surfaceWrite, (cudaArray *)dst->d_data));
filter_fxaa_opt<<<grid_size, block_size>>>(*src);
系统:
Ubuntu 14.04
Opengl version: 4.4.0 NVIDIA 331.113
Renderer version: GeForce GTX 760M/PCIe/SSE2
CUDA 5.5
问题:
OpenGL 着色器在哪些方面做得更好,为什么它更快?
正如 njuffa 所指出的,主要问题是手动插值和归一化。在使用 CUDA texture
而不是 CUDA surface
之后,可以通过调用 tex2D(..)
而不是 surf2Dread(...)
来使用内置插值。
修改后的 CUDA 代码现在几乎与 OpenGL 着色器完全相同,并且确实表现同样出色。
__global__ void filter_fxaa2(TextureData data)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= data.w || y >= data.h)
{
return;
}
uchar4 out_color;
const float FXAA_SPAN_MAX = 8.0f;
const float FXAA_REDUCE_MUL = 1.0f/8.0f;
const float FXAA_REDUCE_MIN = (1.0f/128.0f);
float u = x + 0.5f;
float v = y + 0.5f;
float4 rgbNW = tex2D( texRef, u-1.0f,v-1.0f);
float4 rgbNE = tex2D( texRef, u+1.0f,v-1.0f);
float4 rgbSW = tex2D( texRef, u-1.0f,v+1.0f);
float4 rgbSE = tex2D( texRef, u+1.0f,v+1.0f);
float4 rgbM = tex2D( texRef, u,v);
const float4 luma = make_float4(0.299f, 0.587f, 0.114f,0.0f);
float lumaNW = dot(rgbNW, luma);
float lumaNE = dot(rgbNE, luma);
float lumaSW = dot(rgbSW, luma);
float lumaSE = dot(rgbSE, luma);
float lumaM = dot( rgbM, luma);
float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));
float2 dir;
dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));
float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25f * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);
float rcpDirMin = 1.0f/(min(abs(dir.x), abs(dir.y)) + dirReduce);
float2 test = dir * rcpDirMin;
dir = clamp(test,-FXAA_SPAN_MAX,FXAA_SPAN_MAX);
float4 rgbA = (1.0f/2.0f) * (
tex2D( texRef,u+ dir.x * (1.0f/3.0f - 0.5f),v+ dir.y * (1.0f/3.0f - 0.5f))+
tex2D( texRef,u+ dir.x * (2.0f/3.0f - 0.5f),v+ dir.y * (2.0f/3.0f - 0.5f)));
float4 rgbB = rgbA * (1.0f/2.0f) + (1.0f/4.0f) * (
tex2D( texRef,u+ dir.x * (0.0f/3.0f - 0.5f),v+ dir.y * (0.0f/3.0f - 0.5f))+
tex2D( texRef,u+ dir.x * (3.0f/3.0f - 0.5f),v+ dir.y * (3.0f/3.0f - 0.5f)));
float lumaB = dot(rgbB, luma);
if((lumaB < lumaMin) || (lumaB > lumaMax)){
out_color=toChar(rgbA);
} else {
out_color=toChar(rgbB);
}
surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
}
更新:
使用 cudaEvents
测量的性能:
- 旧版本:~12.8ms
- 新版本:~1.2ms
结论:
仅将 CUDA 表面用于写入而不用于读取纹理!
我在我的 OpenGL 程序中使用 this FXAA 着色器来消除锯齿。现在我在 CUDA 中重新实现了这段代码并进行了测试。生成的图像是相同的,但 CUDA 版本要慢得多。 (着色器在垂直同步时以 60 FPS 运行,而 CUDA 下降到 ~40 FPS)
这是 CUDA 代码:
__device__ uchar4 readChar(int x, int y){
return surf2Dread<uchar4>( surfaceRead, (x)*sizeof(uchar4), (y),cudaBoundaryModeClamp);
}
__device__ uchar4 readFloatBilin2(float x, float y){
int x1 = floor(x);
int y1 = floor(y);
uchar4 z11 = readChar(x1,y1);
uchar4 z12 = readChar(x1,y1+1);
uchar4 z21 = readChar(x1+1,y1);
uchar4 z22 = readChar(x1+1,y1+1);
float u_ratio = x - x1;
float v_ratio = y - y1;
float u_opposite = 1 - u_ratio;
float v_opposite = 1 - v_ratio;
uchar4 result = (z11 * u_opposite + z21 * u_ratio) * v_opposite +
(z12 * u_opposite + z22 * u_ratio) * v_ratio;
return result;
}
__device__ float fluma(const uchar4 &c){
return c.x*0.299 * (1.0/255) + c.y *0.587 * (1.0/255) + c.z*0.114 * (1.0/255);
}
__global__ void filter_fxaa_opt(TextureData data)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= data.w || y >= data.h)
{
return;
}
uchar4 out_color;
const float FXAA_SPAN_MAX = 8.0;
const float FXAA_REDUCE_MUL = 1.0/8.0;
const float FXAA_REDUCE_MIN = (1.0/128.0);
float lumaNW = fluma(readChar(x-1,y-1));
float lumaNE = fluma(readChar(x+1,y-1));
float lumaSW = fluma(readChar(x-1,y+1));
float lumaSE = fluma(readChar(x+1,y+1));
float lumaM = fluma(readChar(x,y));
float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));
float2 dir;
dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));
float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25 * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);
float rcpDirMin = 1.0/(min(abs(dir.x), abs(dir.y)) + dirReduce);
// float2 test = dir * rcpDirMin;
dir = clamp(dir * rcpDirMin,-FXAA_SPAN_MAX,FXAA_SPAN_MAX);
uchar4 rgbA = (
readFloatBilin2(x+ dir.x * (1.0/3.0 - 0.5),y+ dir.y * (1.0/3.0 - 0.5))*0.5f+
readFloatBilin2(x+ dir.x * (2.0/3.0 - 0.5),y+ dir.y * (2.0/3.0 - 0.5))*0.5f);
uchar4 rgbB = rgbA * (1.0/2.0) + (
readFloatBilin2(x+ dir.x * (0.0/3.0 - 0.5),y+ dir.y * (0.0/3.0 - 0.5))*0.25f+
readFloatBilin2(x+ dir.x * (3.0/3.0 - 0.5),y+ dir.y * (3.0/3.0 - 0.5))*0.25f);
float lumaB = fluma(rgbB);
if((lumaB < lumaMin) || (lumaB > lumaMax)){
out_color=rgbA;
} else {
out_color=rgbB;
}
surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
}
设置:
//called for the 'src' and 'dst' texture once at the beginning
checked_cuda( cudaGraphicsGLRegisterImage(&res, gl_buffer,gl_target, cudaGraphicsRegisterFlagsSurfaceLoadStore));
//called for the 'src' and 'dst' texture every frame
checked_cuda( cudaGraphicsMapResources(1, &res, 0));
checked_cuda( cudaGraphicsSubResourceGetMappedArray(&array, res, 0,0));
//kernel call every frame
dim3 block_size(8, 8);
dim3 grid_size;
grid_size.x = (src->w) / (block_size.x) ;
grid_size.y = (src->h) / (block_size.y) ;
checked_cuda(cudaBindSurfaceToArray(surfaceRead, (cudaArray *)src->d_data));
checked_cuda(cudaBindSurfaceToArray(surfaceWrite, (cudaArray *)dst->d_data));
filter_fxaa_opt<<<grid_size, block_size>>>(*src);
系统:
Ubuntu 14.04
Opengl version: 4.4.0 NVIDIA 331.113
Renderer version: GeForce GTX 760M/PCIe/SSE2
CUDA 5.5
问题: OpenGL 着色器在哪些方面做得更好,为什么它更快?
正如 njuffa 所指出的,主要问题是手动插值和归一化。在使用 CUDA texture
而不是 CUDA surface
之后,可以通过调用 tex2D(..)
而不是 surf2Dread(...)
来使用内置插值。
修改后的 CUDA 代码现在几乎与 OpenGL 着色器完全相同,并且确实表现同样出色。
__global__ void filter_fxaa2(TextureData data)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= data.w || y >= data.h)
{
return;
}
uchar4 out_color;
const float FXAA_SPAN_MAX = 8.0f;
const float FXAA_REDUCE_MUL = 1.0f/8.0f;
const float FXAA_REDUCE_MIN = (1.0f/128.0f);
float u = x + 0.5f;
float v = y + 0.5f;
float4 rgbNW = tex2D( texRef, u-1.0f,v-1.0f);
float4 rgbNE = tex2D( texRef, u+1.0f,v-1.0f);
float4 rgbSW = tex2D( texRef, u-1.0f,v+1.0f);
float4 rgbSE = tex2D( texRef, u+1.0f,v+1.0f);
float4 rgbM = tex2D( texRef, u,v);
const float4 luma = make_float4(0.299f, 0.587f, 0.114f,0.0f);
float lumaNW = dot(rgbNW, luma);
float lumaNE = dot(rgbNE, luma);
float lumaSW = dot(rgbSW, luma);
float lumaSE = dot(rgbSE, luma);
float lumaM = dot( rgbM, luma);
float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));
float2 dir;
dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));
float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25f * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);
float rcpDirMin = 1.0f/(min(abs(dir.x), abs(dir.y)) + dirReduce);
float2 test = dir * rcpDirMin;
dir = clamp(test,-FXAA_SPAN_MAX,FXAA_SPAN_MAX);
float4 rgbA = (1.0f/2.0f) * (
tex2D( texRef,u+ dir.x * (1.0f/3.0f - 0.5f),v+ dir.y * (1.0f/3.0f - 0.5f))+
tex2D( texRef,u+ dir.x * (2.0f/3.0f - 0.5f),v+ dir.y * (2.0f/3.0f - 0.5f)));
float4 rgbB = rgbA * (1.0f/2.0f) + (1.0f/4.0f) * (
tex2D( texRef,u+ dir.x * (0.0f/3.0f - 0.5f),v+ dir.y * (0.0f/3.0f - 0.5f))+
tex2D( texRef,u+ dir.x * (3.0f/3.0f - 0.5f),v+ dir.y * (3.0f/3.0f - 0.5f)));
float lumaB = dot(rgbB, luma);
if((lumaB < lumaMin) || (lumaB > lumaMax)){
out_color=toChar(rgbA);
} else {
out_color=toChar(rgbB);
}
surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
}
更新:
使用 cudaEvents
测量的性能:
- 旧版本:~12.8ms
- 新版本:~1.2ms
结论:
仅将 CUDA 表面用于写入而不用于读取纹理!