移动 OpenCL 本地内存库冲突。为什么在内核中使用本地内存比使用全局内存慢?

mobile OpenCL local memory bank conflict. Why using local memory is slower than does global memory in kernel?

我正在使用 OpenCL 在 android 平台上开发人脸检测应用程序。人脸检测算法基于 Viola Jones 算法。我试图制作级联分类步骤内核代码。我将级联阶段中的级联阶段 1 的 classifier data 设置为 local memory(__local),因为分类器数据用于所有工作项。

但是,不使用本地内存(使用全局内存)的内核分析时间比使用本地内存更快。

已编辑:

我上传了完整的代码。


使用本地内存版本

__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
       int cascadeLocalSize = get_local_size(0);

       __local float localS1F1[42];

       int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
       if(localIdx<42)
       {
           int stage1Idx = localIdx + idxNumValStageArray[0]+4;
           localS1F1[localIdx] = classifierMem[stage1Idx];
       }
       barrier(CLK_LOCAL_MEM_FENCE);


       float resizeFactor = 1.0;
       int2 im_dim = get_image_dim(input_image);
       unsigned int srcWidth = im_dim.x*(float)resizeFactor;
       unsigned int srcHeight = im_dim.y*(float)resizeFactor;

       int gx = get_global_id(0);
       int gy = get_global_id(1);

       int skinX=0;
       int skinY=0;
       int coordi=vecSkin[512*gy+gx];
       skinX = coordi%im_dim.x;
       skinY = coordi/im_dim.x;

       if( skinX >= 10 && skinY >= 10 )
       {
             skinX -= 10;
             skinY -= 10;
       }      

       int type = gx%3;

       unsigned int windowWidth = classifierMem[0];
       unsigned int windowHeight = classifierMem[1]; 


       unsigned int stageIndex;
       float stageThres;
       float numFeatures;
       unsigned int featureIndex;
       float featureValue;

       if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
             bool stagePass = true;
             unsigned int index = 0;
             for(unsigned int i=numTotStage; i>0;i--){
                    if(stagePass){
                           if(index == 0){
                                 stageIndex = idxNumValStageArray[0];                                 
                                 stageThres = classifierMem[stageIndex+2];
                                 numFeatures = classifierMem[stageIndex+3];
                                 featureIndex = 0;
                                 featureValue = 0.0;                           
                           }
                           else{
                                 stageIndex = idxNumValStageArray[index];
                                 stageThres = classifierMem[stageIndex+2];
                                 numFeatures = classifierMem[stageIndex+3];
                                 featureIndex = stageIndex+4;
                                 featureValue = 0.0;
                           }
                           float featureThres;
                           float succVal;
                           float failVal;
                           unsigned int numRegions;
                           float regionValue;


                           if(type ==0 && index==0)
                           {
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;

                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;

                                        }// end of if(stagePass) 
                                 }// end of for(unsigned int j=numFeatures; j>0;j--)

                                  index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));

                           }

                           else if(type ==1 && index ==0)
                           {
                                 featureIndex +=14;
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                              if(j==1)
                                                     featureIndex -= 42;

                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;


                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                                        }
                                 }

                                  index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }

                           else if(index == 0)
                           {
                                 featureIndex +=28;
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){

                                              if(j==2)     featureIndex -= 42;

                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;

                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }// end of for(unsigned int k=numRegions; k>0;k--)
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;

                                        }// end of if(stagePass)
                                 }//end of for(unsigned int j=numFeatures; j>0;j--)

                                 index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }

                           //stage 
                           else{
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                               featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=classifierMem[featureIndex++];
                                              failVal=classifierMem[featureIndex++];
                                              numRegions = classifierMem[featureIndex++];
                                              regionValue =0.0;
                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){
                                                     regionP.x=(int)(classifierMem[featureIndex])+skinX;
                                                     regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
                                                     regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4]; 
                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                                        }
                                 }
                                 index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }
                    }
             }      
       }else return;
}

原始版本(没有本地内存)

__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
    float resizeFactor = 1.0;

    int2 im_dim = get_image_dim(input_image);

    unsigned int srcWidth = im_dim.x*(float)resizeFactor;
    unsigned int srcHeight = im_dim.y*(float)resizeFactor;

    int gx = get_global_id(0);
    int gy = get_global_id(1);


    int skinX=0;
    int skinY=0;
    int coordi=vecSkin[512*gy+gx];
    skinX = coordi%im_dim.x;
    skinY = coordi/im_dim.x;

        if( skinX >= 10 && skinY >= 10 )
    {
        skinX -= 10;
        skinY -= 10;
    }   

    unsigned int windowWidth = classifierMem[0];
    unsigned int windowHeight = classifierMem[1];   

    if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
        bool stagePass = true;
        unsigned int index = 0;
        for(unsigned int i=numTotStage; i>0;i--){
            if(stagePass){
                unsigned int stageIndex = idxNumValStageArray[index++];
                float stageThres = classifierMem[stageIndex+2];
                float numFeatures = classifierMem[stageIndex+3];
                unsigned int featureIndex = stageIndex+4;
                float featureValue = 0.0;               

                for(unsigned int j=numFeatures; j>0;j--){
                    if(stagePass){
                        float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
                        float succVal=classifierMem[featureIndex++];
                        float failVal=classifierMem[featureIndex++];
                        unsigned int numRegions = classifierMem[featureIndex++];
                        float regionValue =0.0;

                        for(unsigned int k=numRegions; k>0;k--){                    
                            float4 rectValue;
                            int4 regionP;

                            regionP.x=(int)(classifierMem[featureIndex])+skinX;
                            regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
                            regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
                            regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;

                            rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                            rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                            rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                            rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                            regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];

                            featureIndex+=5;
                        }

                        featureValue += (regionValue < featureThres)?failVal:succVal;                   

                        if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                    }
                }
                if(featureValue < stageThres)   stagePass =false;
                else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
            }
        }   
    }else return;
}

分析时间: 原始版本(没有本地内存):24ms 修改版本(使用本地内存):28ms

已编辑: 实际上 localWorkSize NULL 因为 globalWorkSize 总是随放置 NDRangeKernel 的向量大小而变化。当放置特定的 localWorkSize 时,人脸检测率下降...所以我尝试将 localWorkSize 设置为 NULl,然后人脸检测率很好。所以我想知道原因。

这是主机代码:

    //localWorkSize[0] = 16;
    //localWorkSize[1] = 12; 
    numThreadsX=512;
    globalWorkSize[0] = numThreadsX;
    globalWorkSize[1] =  vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
    errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL); 
    errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL); 
    errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
    errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
    errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
    errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
    errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
    errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);

    errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);

您有五个浮点值(20 字节),每个工作项都在读取这些值。大多数现代 GPU 都有硬件管理的缓存,它们在内存层次结构中与用户管理的本地内存处于相似的级别。在您不使用本地内存的版本中,这 20 个字节将很乐意位于靠近 ALU 的高速缓存中,并提供您可能希望的所有数据重用性能优势。您使用本地内存的版本只是明确地做同样的事情,但也增加了一些额外的开销来手动启动副本和工作项之间的一些额外同步,这必然会减慢速度。

如果谨慎应用,本地内存在某些情况下可以带来性能优势。然而,在许多情况下,硬件缓存会做得更好,并为您留下更简单的代码。

有多种原因:

  • 内存负载的线程分歧
  • 障碍不是免费的
  • 更多说明
  • 在大多数平台上,常量内存与共享内存几乎相同,因此额外的工作没有任何好处。
  • 我们不知道共享内存版本节省了多少读取 - 请告诉我们工作组的大小。它越小,我们从非本地内存中读取的数据就越少。如果这是常量内存并不重要。

对它们进行分析,看看分析器的输出是否显示了最大利用率的任何东西,并向我们指出那是什么。

此外,我不相信您的 l​​ocalIdx 和 stage1Idx 有意义,它们可能会超出数组范围并导致奇怪的行为。至少对于给定的 gx/gy,您看起来正在使用来自 classifierMem 的不同索引。