更快地绘制具有比例和 alpha 通道的 tbitmap

Draw tbitmap with scale and alpha channel faster

以下代码复制一个大位图,将其与正确的背景混合,然后绘制一个带有裁剪区域的半透明图像以节省绘制时间...图像在数组中并预先缩放...

基于我对 C++ 和 Builder 图形的有限了解,这已通过多个级别的优化...

编辑: 更新代码... blend();

// Alpha-blends src onto dst with the top-left corner of src placed at dst pixel (x,y).
//   dst   - destination bitmap, modified in place
//   x,y   - position of src inside dst [pixels], may be negative
//   src   - source bitmap, only read
//   alpha - opacity of src: 0 leaves dst untouched, 255 copies src
// NOTE(review): this variant does not touch HandleType/PixelFormat; it assumes both
// bitmaps are already 24-bit DIBs (n=3 bytes per pixel) - confirm at the call site.
void blend(Graphics::TBitmap *dst,int x,int y,Graphics::TBitmap *src,BYTE alpha)
{
const int n=3;          // pixel align [Bytes]
int dx0,dy0,dx1,dy1,    // dst BBOX (after clipping)
    sx0,sy0,            // src pixel that maps to (dx0,dy0)
    dy,sy,i;
BYTE *dp,*sp;
WORD a,_a,da[256],sa[256];

// compute BBOX (handle clipping) so ScanLine[] is never indexed outside dst
dx0=x; sx0=0; dx1=x+src->Width;
dy0=y; sy0=0; dy1=y+src->Height;
if (dx0<0){ sx0-=dx0; dx0=0; }
if (dy0<0){ sy0-=dy0; dy0=0; }
if (dx1>dst->Width ) dx1=dst->Width;
if (dy1>dst->Height) dy1=dst->Height;
if ((dx0>=dx1)||(dy0>=dy1)) return;             // nothing visible

// blend weights: alpha 0..255 is rescaled to 0..256 so that a+_a==256 matches the
// >>8 below (alpha=255 gives exactly src, alpha=0 leaves dst unchanged)
a=alpha; a+=a>>7; _a=256-a;
for (i=0;i<256;i++){ da[i]=_a*i; sa[i]=a*i; }   // precompute BYTE*a and BYTE*_a LUTs

for (dy=dy0,sy=sy0;dy<dy1;dy++,sy++)            // ScanLines
    {
    dp=(BYTE*)dst->ScanLine[dy]+(n*dx0);
    sp=(BYTE*)src->ScanLine[sy]+(n*sx0);
    for (i=n*(dx1-dx0);i>0;i--,dp++,sp++)       // every channel byte of the clipped row
     *dp=BYTE((sa[*sp]+da[*dp])>>8);            // blend function
    }
}

//--------------------------------------------------------------------------

    det1maps.push_back( new Graphics::TBitmap() );
    for (int i = 1; i < 176; i++)
    {
        det1maps.push_back( new Graphics::TBitmap() );
        det1maps[i]->SetSize(t,t);
        det1maps[i]->Canvas->StretchDraw(Rect(0, 0, t, t), Det1_bmp.get()); // scale
        t = t + 24;
    }

//----------------编辑 3 当前版本 1/18

det1maps[ss]->Transparent = true;
Form1->imgTemp->Picture->Assign(layer0_bmap.get()); //why background first?
HRGN MyRgn;
MyRgn = ::CreateRectRgn(0,0,Sw,Sh);
::SelectClipRgn(Form1->imgTemp->Canvas->Handle,MyRgn); //clip

Form1->imgTemp->Canvas->Draw(X3,Y3,det1maps[ss]); // draw det

blend(layer0_bmap.get(),0,0,Form1->imgTemp->Picture->Bitmap,int(obj[index]));

这是我刚拼凑出来的一个简单的 C++/VCL ScanLine Alpha 混合(Alpha Blend)小例子:

//---------------------------------------------------------------------------
// Alpha-blends src onto dst with the top-left corner of src placed at dst pixel (x,y).
//   dst   - destination bitmap, modified in place
//   x,y   - position of src inside dst [pixels], may be negative (src is clipped to dst)
//   src   - source bitmap
//   alpha - opacity of src: 0 leaves dst untouched, 255 copies src
// Side effect: both bitmaps are switched to bmDIB / pf24bit so that ScanLine[]
// returns rows of 3-byte pixels.
void blend(Graphics::TBitmap *dst,int x,int y,Graphics::TBitmap *src,BYTE alpha)
    {
    const int n=3;          // pixel align [Bytes]
    int dx0,dy0,dx1,dy1,    // dst BBOX (after clipping)
        sx0,sy0,            // src pixel that maps to (dx0,dy0)
        dx,dy,sy,i;
    BYTE *dp,*sp;
    WORD a,_a,da[256],sa[256];
    // compute BBOX (handle clipping)
    dx0=x; sx0=0; dx1=x+src->Width;
    dy0=y; sy0=0; dy1=y+src->Height;
    if (dx0<0){ sx0-=dx0; dx0=0; }
    if (dy0<0){ sy0-=dy0; dy0=0; }
    dx=dst->Width; dy=dst->Height;
    if (dx1>dx) dx1=dx;
    if (dy1>dy) dy1=dy;
    // make sure config is compatible with ScanLine[]
    dst->HandleType=bmDIB; dst->PixelFormat=pf24bit;
    src->HandleType=bmDIB; src->PixelFormat=pf24bit;
    if ((dx0>=dx1)||(dy0>=dy1)) return;             // nothing visible
    // blend weights: alpha 0..255 is rescaled to 0..256 so that a+_a==256 matches
    // the >>8 below (alpha=255 gives exactly src, alpha=0 leaves dst unchanged)
    a=alpha; a+=a>>7; _a=256-a;
    for (i=0;i<256;i++){ da[i]=_a*i; sa[i]=a*i; }   // precompute BYTE*a and BYTE*_a LUTs
    for (dy=dy0,sy=sy0;dy<dy1;dy++,sy++)            // ScanLines
        {
        dp=(BYTE*)dst->ScanLine[dy]+(n*dx0);
        sp=(BYTE*)src->ScanLine[sy]+(n*sx0);
        for (i=n*(dx1-dx0);i>0;i--,dp++,sp++)       // every channel byte of the clipped row
         *dp=BYTE((sa[*sp]+da[*dp])>>8);            // blend function
        }
    }
//---------------------------------------------------------------------------

我只是在 pixel/channel 基础上处理图像,并为每个通道 (R,G,B) 计算:

dst_pixel =  ( src_pixel*alpha + dst_pixel*(255-alpha) )/255

其中通道和 alpha 是 8 位无符号整数...为了提高速度,我使用 24 位像素格式(通常我使用 32 位)。

为了避免在混合循环中使用 * 和 /,我预先计算了 2 个 LUT,分别包含 number*alpha 和 number*(255-alpha) 的所有可能取值。除法则通过右移 >>8 完成(即除以 256,是对上式中 /255 的近似)。

为了进一步提高速度,您可以把 dst 图像的所有 ScanLine[] 指针一次性缓存到自己的数组中并重复使用,因为目标图像会被多次使用 ...

当我把两张 1024x768 的图像混合在一起进行测试时,在我的机器上耗时 <=9ms。最慢的操作是 ScanLine[] 访问;测试时图像在混合之前已经被转换为所需的像素格式...

此处为 GIF 预览图(缩小 1/4 并由我的捕捉器抖动以适合 imgur 2MByte 限制):

这是我为此使用的代码(单计时器 VCL 应用程序):

//$$---- Form CPP ----
//---------------------------------------------------------------------------
#include <vcl.h>
#pragma hdrstop
#include "win_main.h"
#include <math.h>
#include <jpeg.hpp>
//---------------------------------------------------------------------------
#pragma package(smart_init)
#pragma resource "*.dfm"
TMain *Main;
Graphics::TBitmap *bmp,*bmp0,*bmp1; // back buffer, image0, image1, ...
//---------------------------------------------------------------------------
// Alpha-blends src onto dst with the top-left corner of src placed at dst pixel (x,y).
//   dst   - destination bitmap, modified in place
//   x,y   - position of src inside dst [pixels], may be negative (src is clipped to dst)
//   src   - source bitmap
//   alpha - opacity of src: 0 leaves dst untouched, 255 copies src
// Side effect: both bitmaps are switched to bmDIB / pf24bit so that ScanLine[]
// returns rows of 3-byte pixels.
void blend(Graphics::TBitmap *dst,int x,int y,Graphics::TBitmap *src,BYTE alpha)
    {
    const int n=3;          // pixel align [Bytes]
    int dx0,dy0,dx1,dy1,    // dst BBOX (after clipping)
        sx0,sy0,            // src pixel that maps to (dx0,dy0)
        dx,dy,sy,i;
    BYTE *dp,*sp;
    WORD a,_a,da[256],sa[256];
    // compute BBOX (handle clipping)
    dx0=x; sx0=0; dx1=x+src->Width;
    dy0=y; sy0=0; dy1=y+src->Height;
    if (dx0<0){ sx0-=dx0; dx0=0; }
    if (dy0<0){ sy0-=dy0; dy0=0; }
    dx=dst->Width; dy=dst->Height;
    if (dx1>dx) dx1=dx;
    if (dy1>dy) dy1=dy;
    // make sure config is compatible with ScanLine[]
    dst->HandleType=bmDIB; dst->PixelFormat=pf24bit;
    src->HandleType=bmDIB; src->PixelFormat=pf24bit;
    if ((dx0>=dx1)||(dy0>=dy1)) return;             // nothing visible
    // blend weights: alpha 0..255 is rescaled to 0..256 so that a+_a==256 matches
    // the >>8 below (alpha=255 gives exactly src, alpha=0 leaves dst unchanged)
    a=alpha; a+=a>>7; _a=256-a;
    for (i=0;i<256;i++){ da[i]=_a*i; sa[i]=a*i; }   // precompute BYTE*a and BYTE*_a LUTs
    for (dy=dy0,sy=sy0;dy<dy1;dy++,sy++)            // ScanLines
        {
        dp=(BYTE*)dst->ScanLine[dy]+(n*dx0);
        sp=(BYTE*)src->ScanLine[sy]+(n*sx0);
        for (i=n*(dx1-dx0);i>0;i--,dp++,sp++)       // every channel byte of the clipped row
         *dp=BYTE((sa[*sp]+da[*dp])>>8);            // blend function
        }
    }
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
void TMain::draw()
    {
    bmp->Canvas->Draw(0,0,bmp0);            // render background bmp0
    static float a=0.0; a+=0.025*M_PI;
    blend(bmp,0,0,bmp1,fabs(255.0*sin(a))); // alfa blend in bmp1
    Main->Canvas->Draw(0,0,bmp);            // show result on screen
    }
//---------------------------------------------------------------------------
// Creates the back buffer and the two images and loads the images from
// "img0.jpg" / "img1.jpg" (paths as given, relative to the working directory).
__fastcall TMain::TMain(TComponent* Owner) : TForm(Owner)
    {
    // create bitmaps
    bmp=new Graphics::TBitmap;
    bmp0=new Graphics::TBitmap;
    bmp1=new Graphics::TBitmap;
    // load images (jpg is only a temporary decoder object)
    TJPEGImage *jpg=new TJPEGImage;
    try
        {
        jpg->LoadFromFile("img0.jpg"); bmp0->Assign(jpg);
        jpg->LoadFromFile("img1.jpg"); bmp1->Assign(jpg);
        }
    catch (...)
        {
        delete jpg;     // do not leak the decoder when a file is missing or unreadable
        throw;
        }
    delete jpg;
    }
//---------------------------------------------------------------------------
// Releases the three bitmaps created in the constructor.
void __fastcall TMain::FormDestroy(TObject *Sender)
    {
    delete bmp1;    // image1
    delete bmp0;    // image0
    delete bmp;     // back buffer
    }
//---------------------------------------------------------------------------
// Keeps the back buffer the same size as the form's client area.
void __fastcall TMain::FormResize(TObject *Sender)
    {
    const int xs=ClientWidth;
    const int ys=ClientHeight;
    bmp->Width =xs;
    bmp->Height=ys;
    }
//---------------------------------------------------------------------------
// Paint event handler: redraws the whole frame.
void __fastcall TMain::FormPaint(TObject *Sender)
    {
    this->draw();
    }
//---------------------------------------------------------------------------
// Timer event handler: draws the next animation frame.
void __fastcall TMain::tim_redrawTimer(TObject *Sender)
    {
    this->draw();
    }
//---------------------------------------------------------------------------

这里是图片(我在 Google 图片上找到的第一张漂亮的 1024x768 图片):

这里是混合结果预览:

有关 ScanLine 的更多信息,请参阅:

  • gfx rendering

如果您需要更快的速度,那么您应该选择 GPU 混合(OpenGL 或 DirectX)。

[Edit2]数组+矩形例子

在你编辑你的问题之后,它现在很明显了:

  1. 你的位图数组根本不是数组

    它更像是某种列表模板,例如 vector<Graphics::TBitmap*> 或类似的...所以您无法像我一样访问 bmp 的线性数组。为了让您的生活更轻松,我使用了具有相似属性的我的模板,这样您就可以看到如何处理这些(抱歉,我不能共享模板代码,但您只需要将 List<T> 更改为 Vector<T> 或任何您想要的使用 ...

    这就是数组指针对你不起作用的原因:你根本没有数组。你的模板也许通过某个成员公开了底层的线性存储(我的模板是通过 map.dat 公开的),你的模板可能有类似的成员;如果它不是线性存储,则可能根本没有。

  2. 您只混合了 2 个图像而不是整个阵列

    因此您可以使用第一个示例并添加 ScanLine 预加载,因为您的图像是静态的...对后台缓冲区图像执行相同操作,因为它仅在调整大小后才会更改。

当我把所有的东西放在一起时,结果是:

//$$---- Form CPP ----
//---------------------------------------------------------------------------
#include <vcl.h>
#pragma hdrstop
#include "win_main.h"
#include <math.h>
#include <jpeg.hpp>
#include "list.h"           // mine list<T> template you got probably vector<> or something similar instead
#include "performance.h"    // this is mine tbeg/tend/tstr time measurement
//---------------------------------------------------------------------------
#pragma package(smart_init)
#pragma resource "*.dfm"
TMain *Main;
//---------------------------------------------------------------------------
// [back buffer]
Graphics::TBitmap *bmp;             // bitmap
BYTE **bmp_pyx=NULL;                // preloaded ScanLines [y][x]
// Builds bmp_pyx: one cached ScanLine pointer per row of the back buffer bmp.
// The table is sized from bmp->Height at the time of the call.
void bmp_init()
    {
    const int rows=bmp->Height;
    bmp_pyx=new BYTE*[rows];
    for (int row=0;row<rows;row++)
        {
        bmp_pyx[row]=(BYTE*)bmp->ScanLine[row];
        }
    }
void bmp_exit()                     // release preloaded ScanLines
    {
    delete[] bmp_pyx;
    }
//---------------------------------------------------------------------------
// [array of images]
const AnsiString filename[]=        // image files loaded by map_init(); the empty string terminates the list
    {
    "img0.jpg",
    "img1.jpg",
    "img2.jpg",
    "img3.jpg",
    "img4.jpg",
    "img5.jpg",
    "img6.jpg",
    "img7.jpg",
    "img8.jpg",
    "img9.jpg",
    ""
    };
List<Graphics::TBitmap*> map;       // your "array" of bitmaps, filled by map_init()
int maps=0;                         // number of images, set by map_init()
BYTE ***map_pyx=NULL;               // preloaded ScanLines [ix][y][x], allocated by map_init()
//---------------------------------------------------------------------------
// Loads every file listed in filename[] (up to the terminating "") into map[]
// as a 24-bit DIB, sets maps to the number of images and builds map_pyx, the
// table of cached ScanLine pointers [image][row].
// Call once at application start; map_exit() releases everything again.
void map_init()                     // allocate and prepare data
    {
    int i,y;
    TJPEGImage *jpg=new TJPEGImage; // temporary decoder object
    // create "array" of bmp (you already got this)
    try
        {
        for (maps=0;filename[maps]!="";maps++)
            {
            map.add(new Graphics::TBitmap);     // this is like your push_back(new Graphics::TBitmap)
            jpg->LoadFromFile(filename[maps]);  // filename[] -> jpg -> bmp -> map[]
            map[maps]->Assign(jpg);             // here you can also rescale or whatever you want to do...
            map[maps]->HandleType=bmDIB;
            map[maps]->PixelFormat=pf24bit;
            }
        }
    catch (...)
        {
        delete jpg;                 // do not leak the decoder when a file is missing or unreadable
        throw;
        }
    delete jpg;
    // create preloaded ScanLines (you need to add this into your app init)
    map_pyx=new BYTE**[maps];                   // **map_pyx[]
    for (i=0;i<maps;i++)
        {
        map_pyx[i]=new BYTE*[map[i]->Height];   // *map_pyx[][]
        for (y=0;y<map[i]->Height;y++)          // map_pyx[][]
         map_pyx[i][y]=(BYTE*)map[i]->ScanLine[y];
        }
    }
//---------------------------------------------------------------------------
void map_exit()                     // release data (you need to add this in app exit)
    {
    int i;
    for (i=0;i<maps;i++)
        {
        delete   map[i];
        delete[] map_pyx[i];
        }
    delete[] map_pyx;
    }
//---------------------------------------------------------------------------
// Alpha-blends the rectangle <x0,x1) x <y0,y1) of the source onto the same
// rectangle of the destination, using preloaded ScanLine pointer tables.
//   dp    - destination rows, dp[y] points at the first byte of row y
//   sp    - source rows, same layout
//   x0,y0 - top-left corner of the rectangle [pixels], inclusive
//   x1,y1 - bottom-right corner of the rectangle [pixels], exclusive
//   alpha - opacity of the source: 0 leaves dst untouched, 255 copies src
// Both images are assumed to use 3 bytes per pixel (n below).
// There is NO range checking: the caller must make sure the rectangle lies
// inside both images.
void blend_rec(BYTE **dp,int x0,int y0,int x1,int y1,BYTE **sp,BYTE alpha)
    {
    const int n=3;          // pixel align [Bytes]
    int x,y,w;
    BYTE *d,*s;
    WORD da[256],sa[256];
    // pixelformat align (from here on x0,x1 are byte offsets inside a row)
    x0*=n; x1*=n;
    // prepare weight*BYTE and (256-weight)*BYTE LUTs; alpha 0..255 is rescaled
    // to 0..256 so both weights sum to 256 and match the >>8 below
    // (alpha=255 gives exactly src, alpha=0 leaves dst unchanged)
    w=alpha; w+=w>>7;
    for (x=0;x<256;x++){ sa[x]=x*w; da[x]=x*(256-w); }
    // blend
    for (y=y0;y<y1;y++)
        {
        d=dp[y]+x0;
        s=sp[y]+x0;
        for (x=x0;x<x1;x++,d++,s++)
         *d=BYTE((sa[*s]+da[*d])>>8);       // blend function
        }
    }
//---------------------------------------------------------------------------
void TMain::draw()
    {
    bmp->Canvas->Draw(0,0,map[0]);              // render background bmp[0]
    static float a=0.0; a+=0.025*M_PI;          // animation ...
    BYTE alpha=128+float(127.0*sin(a));
    tbeg();
    blend_rec(bmp_pyx,200,500,400,600,map_pyx[1],alpha);    // add the blended rectangle (except background which is bmp[0]
    tend(); Caption=tstr();
    Canvas->Draw(0,0,bmp);                      // show on screen
//  bmp->SaveToFile("out.bmp");
    }
//---------------------------------------------------------------------------
// Creates the back buffer, its ScanLine table, and the image array.
__fastcall TMain::TMain(TComponent* Owner) : TForm(Owner)
    {
    bmp=new Graphics::TBitmap();    // back buffer
    bmp_init();                     // ScanLine table of the back buffer
    map_init();                     // images + their ScanLine tables
    }
//---------------------------------------------------------------------------
// Releases the back buffer, its ScanLine table, and the image array.
void __fastcall TMain::FormDestroy(TObject *Sender)
    {
    bmp_exit();     // ScanLine table of the back buffer
    delete bmp;     // back buffer itself
    map_exit();     // images + their ScanLine tables
    }
//---------------------------------------------------------------------------
// Resizes the back buffer to the client area, forces it to a 24-bit DIB and
// rebuilds the cached ScanLine table for the new size.
void __fastcall TMain::FormResize(TObject *Sender)
    {
    // drop the old row table first
    bmp_exit();
    // resize + make sure config is compatible with ScanLine[]
    bmp->Width      =ClientWidth;
    bmp->Height     =ClientHeight;
    bmp->HandleType =bmDIB;
    bmp->PixelFormat=pf24bit;
    // rows may have moved, so collect their pointers again
    bmp_init();
    }
//---------------------------------------------------------------------------
// Paint event handler: redraws the whole frame.
void __fastcall TMain::FormPaint(TObject *Sender)
    {
    this->draw();
    }
//---------------------------------------------------------------------------
// Timer event handler: draws the next animation frame.
void __fastcall TMain::tim_redrawTimer(TObject *Sender)
    {
    this->draw();
    }
//---------------------------------------------------------------------------

在我选择的矩形设置中,混合在不到 0.5ms 的时间内完成。如您所见,它比原始 9ms 更快...因为如果您使用的是剪辑区域,您仍然会混合整个图像,只是不会复制结果。此方法仅混合和复制需要的内容。

注意我删除了范围检查,因此请确保矩形在图像内部...

如果你想像我一样测量时间,我正在使用我的这个代码:

Performance.h:

//---------------------------------------------------------------------------
//--- Performance counter time measurement: 2.01 ----------------------------
//---------------------------------------------------------------------------
#ifndef _performance_h
#define _performance_h
//---------------------------------------------------------------------------
const int   performance_max=64;                 // number of nested tbeg() levels that can be stored
double      performance_Tms=-1.0,               // period of the performance counter [ms]; <=0 means not initialised yet
            performance_tms=0.0,                // last measured time [ms], set by tend() and tper()
            performance_t0[performance_max];    // stored start times [ms], one per nesting level
int         performance_ix=-1;                  // index of the current start time, -1 when no measurement is open
//---------------------------------------------------------------------------
void tbeg(double *t0=NULL)  // mesure start time
    {
    double t;
    LARGE_INTEGER i;
    if (performance_Tms<=0.0)
        {
        for (int j=0;j<performance_max;j++) performance_t0[j]=0.0;
        QueryPerformanceFrequency(&i); performance_Tms=1000.0/double(i.QuadPart);
        }
    QueryPerformanceCounter(&i); t=double(i.QuadPart); t*=performance_Tms;
    if (t0) { t0[0]=t; return; }
    performance_ix++;
    if ((performance_ix>=0)&&(performance_ix<performance_max)) performance_t0[performance_ix]=t;
    }
//---------------------------------------------------------------------------
void tpause(double *t0=NULL)    // stop counting time between tbeg()..tend() calls
    {
    double t;
    LARGE_INTEGER i;
    QueryPerformanceCounter(&i); t=double(i.QuadPart); t*=performance_Tms;
    if (t0) { t0[0]=t-t0[0]; return; }
    if ((performance_ix>=0)&&(performance_ix<performance_max)) performance_t0[performance_ix]=t-performance_t0[performance_ix];
    }
//---------------------------------------------------------------------------
void tresume(double *t0=NULL)   // resume counting time between tbeg()..tend() calls
    {
    double t;
    LARGE_INTEGER i;
    QueryPerformanceCounter(&i); t=double(i.QuadPart); t*=performance_Tms;
    if (t0) { t0[0]=t-t0[0]; return; }
    if ((performance_ix>=0)&&(performance_ix<performance_max)) performance_t0[performance_ix]=t-performance_t0[performance_ix];
    }
//---------------------------------------------------------------------------
double tend(double *t0=NULL)    // return duration [ms] between matching tbeg()..tend() calls
    {
    double t;
    LARGE_INTEGER i;
    QueryPerformanceCounter(&i); t=double(i.QuadPart); t*=performance_Tms;
    if (t0) { t-=t0[0]; performance_tms=t; return t; }
    if ((performance_ix>=0)&&(performance_ix<performance_max)) t-=performance_t0[performance_ix]; else t=0.0;
    performance_ix--;
    performance_tms=t;
    return t;
    }
//---------------------------------------------------------------------------
double tper(double *t0=NULL)    // return duration [ms] between tper() calls
    {
    double t,tt;
    LARGE_INTEGER i;
    if (performance_Tms<=0.0)
        {
        for (int j=0;j<performance_max;j++) performance_t0[j]=0.0;
        QueryPerformanceFrequency(&i); performance_Tms=1000.0/double(i.QuadPart);
        }
    QueryPerformanceCounter(&i); t=double(i.QuadPart); t*=performance_Tms;
    if (t0) { tt=t-t0[0]; t0[0]=t; performance_tms=tt; return tt; }
    performance_ix++;
    if ((performance_ix>=0)&&(performance_ix<performance_max))
        {
        tt=t-performance_t0[performance_ix];
        performance_t0[performance_ix]=t;
        }
    else { t=0.0; tt=0.0; };
    performance_ix--;
    performance_tms=tt;
    return tt;
    }
//---------------------------------------------------------------------------
// Formats the last measured time (performance_tms) as "[   x.xxx ms]".
AnsiString tstr()
    {
    AnsiString txt;
    txt=txt.sprintf("%8.3lf",performance_tms);
    while (txt.Length()<8) txt=" "+txt;     // left-pad to at least 8 characters
    return "["+txt+" ms]";
    }
//---------------------------------------------------------------------------
// Same as tstr(), but prints the last measured time divided by N
// (e.g. the average of N repetitions measured together).
AnsiString tstr(int N)
    {
    AnsiString txt;
    txt=txt.sprintf("%8.3lf",performance_tms/double(N));
    while (txt.Length()<8) txt=" "+txt;     // left-pad to at least 8 characters
    return "["+txt+" ms]";
    }
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------