AMD 上的慢模板纹理

Slow stencil texture on AMD

我正在尝试为一个修改版的 Doom3 引擎添加软阴影:使用 FBO 加模板纹理附件,并在光照交互片段着色器中绑定和使用该模板纹理。 它运行良好,但在 Radeon 460 上存在严重的性能问题(我没有其他 AMD GPU,但怀疑其他型号的情况相同或更糟,因为这块显卡已经算比较新的了)。

我正在使用最新的驱动程序。

fps 下降非常严重,以至于先用 qglCopyTexImage2D 复制到另一张纹理(每个光源都要复制一次!)再绑定该纹理,实际上都比直接绑定 FBO 中使用的模板纹理更快。

另一个问题是,当我尝试用 qglCopyTexSubImage2D 代替 qglCopyTexImage2D 来做优化时,画面开始闪烁。

其他程序员对模板纹理有什么实际使用的建议吗?

nVidia 和 Intel 在速度方面似乎都表现不错。

        globalImages->currentRenderImage->Bind();
        globalImages->currentRenderImage->uploadWidth = curWidth; // used as a shader param
        globalImages->currentRenderImage->uploadHeight = curHeight;
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );
        qglTexImage2D( GL_TEXTURE_2D, 0, r_fboColorBits.GetInteger() == 15 ? GL_RGB5_A1 : GL_RGBA, curWidth, curHeight, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL ); //NULL means reserve texture memory, but texels are undefined

        globalImages->currentRenderFbo->Bind();
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );
        qglTexImage2D( GL_TEXTURE_2D, 0, r_fboColorBits.GetInteger() == 15 ? GL_RGB5_A1 : GL_RGBA, curWidth, curHeight, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL ); //NULL means reserve texture memory, but texels are undefined

        if ( glConfig.vendor != glvAny ) { 
            globalImages->currentStencilFbo->Bind();
            globalImages->currentStencilFbo->uploadWidth = curWidth;
            globalImages->currentStencilFbo->uploadHeight = curHeight;
            qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
            qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
            qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );
            qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );
            qglTexImage2D( GL_TEXTURE_2D, 0, GL_STENCIL_INDEX8, curWidth, curHeight, 0, GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, 0 );
        }

        globalImages->currentDepthImage->Bind();
        globalImages->currentDepthImage->uploadWidth = curWidth; // used as a shader param
        globalImages->currentDepthImage->uploadHeight = curHeight;
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );
        qglTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );
        if ( glConfig.vendor == glvIntel ) { // FIXME allow 24-bit depth for low-res monitors
            qglTexImage2D( GL_TEXTURE_2D, 0, GL_DEPTH_COMPONENT16, curWidth, curHeight, 0, GL_DEPTH_COMPONENT, GL_FLOAT, 0 );
        } else {
            qglTexImage2D( GL_TEXTURE_2D, 0, GL_DEPTH_STENCIL, curWidth, curHeight, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0 );
        }
    }

    // (re-)attach textures to FBO
    if ( !fboId || r_fboSharedColor.IsModified() || r_fboSharedDepth.IsModified() ) {
        // create a framebuffer object, you need to delete them when program exits.
        if ( !fboId )
            qglGenFramebuffers( 1, &fboId );
        qglBindFramebuffer( GL_FRAMEBUFFER_EXT, fboId );
        // attach a texture to FBO color attachement point
        qglFramebufferTexture2D( GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, globalImages->currentRenderImage->texnum, 0 );
        // attach a renderbuffer to depth attachment point
        GLuint depthTex = r_fboSharedDepth.GetBool() ? globalImages->currentDepthImage->texnum : globalImages->currentDepthFbo->texnum;
        qglFramebufferTexture2D( GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, depthTex, 0 );
        if ( glConfig.vendor == glvIntel ) // separate stencil, thank God
            qglFramebufferTexture2D( GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, globalImages->currentStencilFbo->texnum, 0 );
        else
            qglFramebufferTexture2D( GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, depthTex, 0 );
        int status = qglCheckFramebufferStatus( GL_FRAMEBUFFER );
        if ( GL_FRAMEBUFFER_COMPLETE != status ) { // something went wrong, fall back to default
            common->Printf( "glCheckFramebufferStatus %d\n", status );
            qglDeleteFramebuffers( 1, &fboId );
            fboId = 0; // try from scratch next time
            r_useFbo.SetBool( false );
        }
        qglBindFramebuffer( GL_FRAMEBUFFER, 0 ); // not obvious, but let it be 
    }
    qglBindFramebuffer( GL_FRAMEBUFFER, fboId );
    qglClear( GL_COLOR_BUFFER_BIT ); // otherwise transparent skybox blends with previous frame
    fboUsed = true;
    GL_CheckErrors();
}

/*
 Soft shadows vendor specific implementation
 Intel: separate stencil, direct access, fastest
 nVidia: combined stencil & depth, direct access, fast
 AMD: combined stencil & depth, direct access very slow, resorting to stencil copy
 */

// AMD-only workaround used by soft shadows: instead of sampling the FBO's
// depth/stencil attachment directly, take a copy of it into the
// currentStencilFbo texture. Does nothing for any other vendor or when
// r_softShadows is off.
void FB_CopyStencil() {
    const bool copyNeeded = ( glConfig.vendor == glvAMD ) && r_softShadows.GetBool();
    if ( !copyNeeded ) {
        return;
    }

    globalImages->currentStencilFbo->Bind();
    // Re-specify level 0 of the bound texture as a GL_DEPTH_STENCIL copy of
    // the vidWidth x vidHeight region starting at the origin of the read buffer.
    qglCopyTexImage2D(
        GL_TEXTURE_2D, 0, GL_DEPTH_STENCIL,
        0, 0, glConfig.vidWidth, glConfig.vidHeight,
        0 );

    // Disabled experiment, kept for reference: copy only the current scissor
    // rectangle into currentDepthFbo instead of the whole screen.
    //   globalImages->currentDepthFbo->Bind();
    //   idScreenRect& r = backEnd.currentScissor;
    //   qglCopyTexSubImage2D( GL_TEXTURE_2D, 0, r.x1, r.y1, r.x1, r.y1, r.x2 - r.x1 + 1, r.y2 - r.y1 + 1 );

    GL_CheckErrors();
}

void FB_BindStencilTexture() {
    const GLenum GL_DEPTH_STENCIL_TEXTURE_MODE = 0x90EA;
    idImage* stencil = glConfig.vendor != glvAny ? globalImages->currentStencilFbo : globalImages->currentDepthImage;
    stencil->Bind();
    if ( glConfig.vendor != glvIntel )
        glTexParameteri( GL_TEXTURE_2D, GL_DEPTH_STENCIL_TEXTURE_MODE, GL_STENCIL_INDEX );
}

我最终使用了两个帧缓冲区:一个仅用于阴影,另一个用于其余所有内容。 阴影纹理在前者中作为 FBO 附件,在后者中则作为 texture2D 绑定。