从单词中提取字符时，为什么同一单词中相同字符的置信度值不同？

Question

我正在从图像中提取单词，并检查单词中每个字符的置信度。即使是同一个单词中的同一个字符，每次得到的置信度也不相同。

下面的例子。

以单词 01.01.2012 为例：字符 0 出现了 3 次，而这 3 次得到的置信度各不相同。

输入是一幅图像，输出是文本文件，如下所示。我只需要数字数据：如果图像中的某个单词包含 0 到 9 之间的数字，我就把该单词、对应的字符及其置信度值写入文件。我为 0 到 9 的每个数字各生成了一个文件；下面展示的是字符 0 对应的文件，仅供参考。

WORD          CHAR      confidence
7/11/2014       0       94.3153
01.01.2012      0       91.9117
01.01.2012      0       95.059
01.01.2012      0       95.1877
31.12.2012      0       92.1003
05.07.2013      0       94.4376
05.07.2013      0       97.3389
05.07.2013      0       92.4576
2012            0       94.0608
2012            0       93.1969
31.12.2012      0       93.8993
31.12.2011      0       94.513
606             0       93.5746
405             0       93.6727
109.821         0       96.2786
331.028         0       96.1837
109.506;        0       93.1421
109.506;        0       93.7133
110.427         0       93.7141
946.130         0       96.3438
200.274         0       95.7532
200.274         0       94.8678
10.553.331      0       96.3162
10.186.341      0       94.15
63.401          0       94.6042
346.350         0       96.2305
343.044         0       95.9801
346.350;        0       93.5741
343.044         0       93.8484
284.506         0       94.6761
0               0       95.9303
420             0       94.0067
0               0       93.2645
7.355.042       0       95.9187
9.108           0       96.3331
10.             0       93.9019
12.042          0       97.3823
294.704         0       93.4084
4.350           0       96.0915
0               0       95.4884
20.559.209      0       95.4821
20.559.209      0       94.1849
19.207.660      0       95.9933
19.207.660      0       95.1577
31.12.2012      0       92.7785
31.12.2011      0       94.6773
14.054          0       95.3734
44.706          0       93.6371
58.760          0       97.2008
4.111.720       0       94.4336
2.873.806       0       95.8218
11.083.608      0       96.1708
11.083.608      0       94.3456
10.721.302      0       93.3877
10.721.302      0       94.978
5.045.424       0       95.424
4.242.083       0       95.424

谁能告诉我这些置信度是如何计算出来的？

#include "./include/header.h"
#include "./include/enum.h"

/// Bounding box of one recognized text element (line, word or character)
/// together with the text recognized inside it.
///
/// Every member has a default so that a freshly declared RECT never carries
/// indeterminate values or a wild ocrResult pointer.
class RECT
{
public:
    int col = 0;                 ///< left edge of the box (column index)
    int row = 0;                 ///< top edge of the box (row index)
    int width = 0;               ///< box width in pixels
    int height = 0;              ///< box height in pixels
    char *ocrResult = nullptr;   ///< NUL-terminated recognized text; the code in this file fills it via calloc()
};


/// Owns the working buffers used to binarize a page image and run OCR on it.
///
/// Call initialize() before processImage() and release() when finished.
/// The buffers are plain calloc() allocations; the class has no destructor,
/// so release() must be called explicitly.
class OCR
{
public:
    int *g_pixelBuffer        = nullptr;  ///< one int per pixel of the page image
    int *g_pixelBufferForWord = nullptr;  ///< second per-pixel buffer (copy of g_pixelBuffer)
    int *g_histogram          = nullptr;  ///< gray-level histogram, 256 bins
    int  g_Id                 = 1;        ///< next row id written by dumpIntoFile()

    /// Allocates zero-filled buffers for an image of `row` x `col` pixels.
    ///
    /// @param row  number of image rows
    /// @param col  number of image columns
    /// @return Success, or MemoryNotAllocated when the dimensions are negative
    ///         or any allocation fails (nothing stays allocated in that case).
    int initialize (const int row, const int col)
    {
        // Drop buffers from an earlier initialize() so a second call does not leak.
        release();

        if(row < 0 || col < 0)      return MemoryNotAllocated;

        // Compute the pixel count in size_t so large images cannot overflow int.
        const size_t pixelCount = static_cast<size_t>(row) * static_cast<size_t>(col);

        // The histogram is indexed by gray level (0..255), not by pixel, so it
        // always needs exactly 256 bins - even for images with fewer pixels.
        const size_t histogramBins = 256;

        g_pixelBuffer        = static_cast<int *>(calloc(pixelCount, sizeof(int)));
        g_pixelBufferForWord = static_cast<int *>(calloc(pixelCount, sizeof(int)));
        g_histogram          = static_cast<int *>(calloc(histogramBins, sizeof(int)));

        if(g_pixelBuffer == nullptr || g_pixelBufferForWord == nullptr || g_histogram == nullptr)
        {
            // Free whichever allocations did succeed before reporting failure.
            release();
            return MemoryNotAllocated;
        }

        g_Id  = 1;

        return Success;
    }

    vector<RECT >  processImage(Mat &image, int size, int ,int );

    void dumpIntoFile(vector<RECT > &rectBuffer, char *outputFile);

    /// Frees all buffers. Safe to call repeatedly or before initialize(),
    /// because the pointers are reset to nullptr afterwards.
    void release()
    {
        free(g_pixelBuffer);
        free(g_pixelBufferForWord);
        free(g_histogram);

        g_pixelBuffer        = nullptr;
        g_pixelBufferForWord = nullptr;
        g_histogram          = nullptr;
    }

};

// One output stream per digit: myfileN writes to "N.txt". The streams are
// opened during static initialization; main() writes a column header to each
// and dumpNumberConfidenceIntoFile() appends one row per recognized digit.
ofstream myfile1("1.txt");
ofstream myfile2("2.txt");
ofstream myfile3("3.txt");
ofstream myfile4("4.txt");
ofstream myfile5("5.txt");
ofstream myfile6("6.txt");
ofstream myfile7("7.txt");
ofstream myfile8("8.txt");
ofstream myfile9("9.txt");
ofstream myfile0("0.txt");


// Forward declarations of the free functions defined further down this file.

// Draws a rectangle outline (value 255) into a row-major pixel buffer; the
// last parameter is the buffer's row stride in pixels.
void displayBoundingBox(int staCol, int staRow, int edCol, int edRow
        , int *PixelBufferForWord, int);

// Appends "word / char / confidence" to the per-digit file when Char starts
// with a digit 0-9; does nothing otherwise.
void dumpNumberConfidenceIntoFile(char *word, float confi, char *Char);

// Crops the line rectangle out of the page buffer, runs word-level OCR on it,
// appends one RECT per word to rectBuffer and descends into each word.
void getWordDataFromLine(const int *PixelBuffer, int *PixelBufferForWord, RECT &rectLine,
        int mainImageCol, vector <RECT> &rectBuffer);

// Debug helper: writes a rectangular region of a pixel buffer as a plain
// text "P1" bitmap file.
void  dumpDataIntoFile (int  *, int collenth, int strow, int stcol,
        int enrow, int encols, char *output);

//void dumpIntoFile(vector<RECT > &rectBuffer, char *outputFile);

// Converts the image to gray levels in ocr->g_pixelBuffer, fills the
// histogram and then calls getBinaryImage() to binarize the buffer.
void GetBinaryImage(Mat &image ,OCR *,const int size);

// Picks a threshold from ocr->g_histogram (Otsu) and rewrites
// ocr->g_pixelBuffer to 0/1 values.
void  getBinaryImage(OCR *,int size);


// Crops one word out of the line sub-image, runs symbol-level OCR on it,
// appends one RECT per character to rectBuffer and records digit confidences.
void getCharDataFromWord(const int *PixelBuffer,int *PixelBufferForChar, int startColWord ,int startRowWord,
        int endColWord, int endRowWord, RECT &rectLine,
        int mainImageCol,vector<RECT > &rectBuffer, RECT &rectWord);

int main(int argc ,char **argv)

{
    int rs = Success;

    Mat image = imread(argv[1]);//read the image

    if(!image.data){
        cout << "can't able to read the image" << endl;
        return 0;
    }

    int rows = image.rows;// get the rows

    int cols = image.cols;// get the col

    int size = rows * cols;// get the size

    OCR ocr;



    /// Allocate or initialize memory
    rs = ocr.initialize (rows, cols);

    /// check proper allocation
    if(rs == MemoryNotAllocated)    return rs;

    myfile1 << "WORD" << '\t' << '\t' <<  "CHAR" << '\t' << '\t' <<  "confidence" << endl;
    myfile2 << "WORD" << '\t' << '\t' <<  "CHAR" << '\t' << '\t' << "confidence" << endl;
    myfile3 << "WORD" << '\t' << '\t' <<  "CHAR" << '\t' << '\t' << "confidence" << endl;
    myfile4 << "WORD" << '\t' << '\t' << "CHAR" << '\t' << '\t' << "confidence" << endl;
    myfile5 << "WORD" << '\t' << '\t' <<  "CHAR" << '\t' << '\t' <<"confidence" << endl;
    myfile6 << "WORD" << '\t' << '\t' <<  "CHAR" << '\t' << '\t' <<"confidence" << endl;
    myfile7 << "WORD" << '\t' << '\t' <<  "CHAR" << '\t' << '\t' <<"confidence" << endl;
    myfile8 << "WORD" << '\t' << '\t' << "CHAR" << '\t' << '\t' << "confidence" << endl;
    myfile9 << "WORD" << '\t' << '\t' << "CHAR" << '\t' << '\t' << "confidence" << endl;
    myfile0 << "WORD" << '\t' << '\t' << "CHAR" << '\t' << '\t' << "confidence" << endl;



    vector <RECT> rectBuffer = ocr.processImage(image, size, rows, cols);

    ocr.dumpIntoFile(rectBuffer, argv[2] );

    //dumpDataIntoFile (ocr.g_pixelBufferForWord, cols, 0, 0, rows - 1, cols - 1, ( char *)"test123456.pbm");


    ocr.release();

}

/// Binarizes `image`, runs Tesseract on the whole page and collects one RECT
/// per text line, followed by the RECTs of the words and characters found in
/// that line (via getWordDataFromLine()).
///
/// @param image  page image handed to GetBinaryImage()
/// @param size   number of pixels (rows * cols)
/// @param rows   image height in pixels
/// @param cols   image width in pixels
/// @return all collected rectangles; each RECT::ocrResult is calloc()'d and
///         must be free()'d by the caller.
///
/// Terminates the process with exit(1) when Tesseract cannot be initialized
/// or a text buffer cannot be allocated.
vector <RECT> OCR ::processImage(Mat &image, int size, int rows, int cols)
{

    GetBinaryImage (image, this, size);// convert the image into the binary

    // Keep a copy of the binarized page in the second per-pixel buffer.
    for(int i = 0; i < size; i++)
    {
        g_pixelBufferForWord[i] = g_pixelBuffer[i];
    }

    //  dumpDataIntoFile (this, w, 0, 0, h - 1, w - 1, (char *)"test123.pbm");

    tesseract::TessBaseAPI tess;

    if (tess.Init("/usr/share/tesseract/tessdata", "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        exit(1);
    }

    // The buffer holds one int per pixel, so it is handed over as
    // sizeof(int) bytes per pixel with a stride of sizeof(int) * cols bytes.
    tess.SetImage(reinterpret_cast<unsigned char*>(g_pixelBuffer), cols, rows, sizeof(int)
            ,sizeof(int) * cols);

    tess.Recognize(0);

    tesseract::ResultIterator *riLine = tess.GetIterator();

    const tesseract::PageIteratorLevel levelLine = tesseract:: RIL_TEXTLINE;

    vector <RECT> rectBuffer;

    if(riLine != 0)
    {
        do {
            char *Line = riLine->GetUTF8Text(levelLine);

            // `continue` in a do-while jumps to the Next() condition below.
            if(Line == NULL)    continue;

            int startCol = 0, startRow = 0, endCol = 0, endRow = 0;

            riLine->BoundingBox(levelLine, &startCol, &startRow, &endCol, &endRow);

            RECT rectLine;

            rectLine.col    = startCol;
            rectLine.row    = startRow;
            rectLine.width  = endCol - startCol + 1;
            rectLine.height = endRow - startRow + 1;

            const int length = strlen(Line) + 1;

            rectLine.ocrResult = (char *)calloc( length, sizeof(char));

            if(rectLine.ocrResult == NULL){
                cout << "rectLine.ocrResult is not allocate"<< endl;
                exit(1);
            }

            strcpy(rectLine.ocrResult, Line);

            rectBuffer.push_back(rectLine);

            getWordDataFromLine(g_pixelBuffer,g_pixelBufferForWord, rectLine,cols, rectBuffer);

            // GetUTF8Text() hands out new[]-allocated memory, so it must be
            // released with delete[] rather than free().
            delete [] Line;

        } while (riLine->Next(levelLine));

    }

    //dumpIntoFile(rectBuffer, argv[2]);

    // The iterator points into data held by the API object, so it has to be
    // destroyed before End() tears that data down.
    delete riLine;
    tess.End();

    return rectBuffer;

}

void getWordDataFromLine(const int *PixelBuffer,  int *PixelBufferForWord, RECT &rectLine,
        int mainImageCol, vector <RECT> &rectBuffer)
{
    int index;

    int *SubImageBuffer = (int *)calloc(rectLine.width  * rectLine.height, sizeof(int));

    if(!SubImageBuffer){

        cout << "SubImageBuffer not allocate" << endl;
    }

    int i = 0;

    for(int r = rectLine.row ; r < rectLine.row + rectLine.height; r++)
    {
        for(int c = rectLine.col ; c < rectLine.col + rectLine.width; c++)
        {
            index = r * mainImageCol + c;

            SubImageBuffer[i++] = PixelBuffer[index];
        }
    }
    //dumpDataIntoFile (SubImageBuffer, w, 0, 0, h - 1, w - 1, (char *)"test123.pbm");

    tesseract::TessBaseAPI tessWord;

    if (tessWord.Init("/usr/share/tesseract/tessdata", "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        exit(1);
    }

    tessWord.SetImage((unsigned char*)SubImageBuffer, rectLine.width,
            rectLine.height, sizeof(int) ,sizeof(int) * rectLine.width);

    tessWord.Recognize(0);

    tesseract::ResultIterator *riWord = tessWord.GetIterator();

    tesseract::PageIteratorLevel levelWord = tesseract:: RIL_WORD;

    RECT rectWord;

    if(riWord!=0)
    {
        do {
            char *Word = riWord->GetUTF8Text(levelWord);

            if(Word != NULL)
            {
                int startCol, startRow, endCol, endRow;
                int staCol = 0, staRow = 0, edCol = 0, edRow = 0;
                riWord->BoundingBox(levelWord, &startCol, &startRow, &endCol, &endRow);

                staCol =  startCol;
                staRow =  startRow;
                edCol  =   endCol;
                edRow  =   endRow;

                staCol += rectLine.col;
                staRow += rectLine.row;
                edRow  += rectLine.row;
                edCol  += rectLine.col;



                rectWord.col    = staCol;
                rectWord.row    = staRow;
                rectWord.width  = edCol - staCol + 1;
                rectWord.height = edRow - staRow + 1;

                int length = strlen(Word) + 1;

                rectWord.ocrResult = (char *)calloc( length, sizeof(char));

                if(rectWord.ocrResult == NULL){
                    cout << "rectWord.ocrResult is not allocate"<< endl;
                    exit(1);
                }

                strcpy(rectWord.ocrResult, Word);

                rectBuffer.push_back(rectWord);

                //displayBoundingBox(staCol, staRow, edCol, edRow ,PixelBufferForWord, mainImageCol);

                getCharDataFromWord(SubImageBuffer, PixelBufferForWord, startCol, startRow ,endCol ,endRow,
                        rectLine,mainImageCol, rectBuffer, rectWord);
                //delete Word;

                free(Word);
            }
        }while (riWord->Next(levelWord));
    }
    delete riWord;
    tessWord.End();
    free(SubImageBuffer);
}


void getCharDataFromWord(const int *PixelBuffer,int *PixelBufferForChar, int startColWord ,int startRowWord,
        int endColWord, int endRowWord, RECT &rectLine,
        int mainImageCol,vector<RECT > &rectBuffer, RECT &rectWord)
{
    int index;

    int width  = endColWord - startColWord + 1;
    int height = endRowWord - startRowWord + 1;

    int *SubImageBufferForChar = (int *)calloc(width * height, sizeof(int));

    if(!SubImageBufferForChar){

        cout << "SubImageBuffer not read" << endl;
    }

    int i = 0;

    for(int r = startRowWord ; r <= endRowWord; r++)
    {
        for(int c = startColWord; c <= endColWord; c++)
        {
            index = r * rectLine.width + c;

            SubImageBufferForChar[i++] = PixelBuffer[index];

        }
    }

    //dumpDataIntoFile (SubImageBufferForChar, width, 0, 0, height - 1, width - 1, (char *)"test123.pbm");

    tesseract::TessBaseAPI tessChar;


    if (tessChar.Init("/usr/share/tesseract/tessdata", "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        exit(1);
    }

    tessChar.SetImage((unsigned char*)SubImageBufferForChar, width,
            height, sizeof(int) ,sizeof(int) * width);

    tessChar.Recognize(0);

    tesseract::ResultIterator *riChar = tessChar.GetIterator();

    tesseract::PageIteratorLevel levelChar = tesseract:: RIL_SYMBOL;

    RECT rectChar;



    if(riChar!=0)
    {
        do {
            char *Char = riChar->GetUTF8Text(levelChar);

            if(Char != NULL)
            {
                float conf = riChar->Confidence(levelChar);

                int startCol, startRow, endCol, endRow;

                riChar->BoundingBox(levelChar, &startCol, &startRow, &endCol, &endRow);

                startCol += rectWord.col;
                startRow += rectWord.row;
                endRow   += rectWord.row;
                endCol   += rectWord.col;

                rectChar.col    = startCol;
                rectChar.row    = startRow;
                rectChar.width  = endCol - startCol + 1;
                rectChar.height = endRow - startRow + 1;

                int length = strlen(Char) + 1;

                rectChar.ocrResult = (char *)calloc( length, sizeof(char));

                if(rectChar.ocrResult == NULL){
                    cout << "rectChar.ocrResult is not allocate"<< endl;
                    exit(1);
                }
                strcpy(rectChar.ocrResult, Char);

                rectBuffer.push_back(rectChar);


                dumpNumberConfidenceIntoFile(rectWord.ocrResult, conf, Char);

                //displayBoundingBox(startCol, startRow, endCol, endRow ,PixelBufferForChar, mainImageCol);

                //delete Char;
                free(Char);


            }
        } while (riChar->Next(levelChar));
    }


    delete riChar;
    tessChar.End();
    free(SubImageBufferForChar);

}

void dumpNumberConfidenceIntoFile(char *word, float confi, char *Char)
{

    if(Char[0] >= '0' && Char[0] <= '9')
    {
        if(Char[0] == '0'){
            myfile0 << word << '\t' << '\t' << Char << '\t'  << '\t' << confi << endl;
        }
        else if(Char[0] == '1'){
            myfile1 << word << '\t' << '\t' << Char << '\t' << '\t' <<confi << endl;
        }
        else if(Char[0] == '2'){
            myfile2 << word << '\t' << '\t' << Char << '\t' << '\t' << confi << endl;
        }
        else if(Char[0] == '3'){
            myfile3 << word << '\t' << '\t' << Char << '\t' << '\t' << confi << endl;
        }
        else if(Char[0] == '4'){
            myfile4 << word << '\t' << '\t' << Char << '\t' << '\t'  <<confi << endl;
        }
        else if(Char[0] == '5'){
            myfile5 << word << '\t' << '\t' << Char << '\t' << '\t' <<confi << endl;
        }
        else if(Char[0] == '6'){
            myfile6 << word << '\t' << '\t' << Char << '\t' <<  '\t' << confi << endl;
        }
        else if(Char[0] == '7'){
            myfile7 << word << '\t' << '\t' << Char << '\t' << '\t'  << confi << endl;
        }
        else if(Char[0] == '8'){
            myfile8 << word << '\t' << '\t' << Char << '\t' <<  '\t' << confi << endl;
        }
        else if(Char[0] == '9'){
            myfile9 << word << '\t' << '\t' << Char << '\t' <<  '\t' << confi << endl;
        }

    }

}



/// Writes every rectangle of `rectBuffer` to `outputFile` as one
/// tab-separated row: running id, column, row, width, height and the
/// recognized text up to (not including) its first newline.
///
/// The id comes from g_Id, which is incremented for every row written.
///
/// @param rectBuffer  rectangles to dump
/// @param outputFile  path of the file to create
void OCR ::dumpIntoFile(vector<RECT > &rectBuffer, char *outputFile)
{
    ofstream report(outputFile);

    report << "ID" << '\t' << "CORD_X" << '\t'  << "CORD_Y" << '\t' <<
            "CORD_W" << '\t' << "CORD_H" << '\t' << "STRING" << endl;

    for(const RECT &entry : rectBuffer)
    {
        report << g_Id++ << '\t' << entry.col  << '\t' << entry.row << '\t' <<
                entry.width << '\t' << entry.height << '\t';

        // Emit the text one character at a time, stopping at the end of the
        // string or at the first line break, whichever comes first.
        for(const char *cursor = entry.ocrResult; *cursor != '\0' && *cursor != '\n'; ++cursor)
        {
            report << *cursor;
        }

        report << endl;
    }

}
/// Binarizes ocr->g_pixelBuffer in place with a threshold chosen by Otsu's
/// method from ocr->g_histogram.
///
/// The threshold is the gray level t (0..255) that maximizes
/// q1 * q2 * (u1 - u2)^2, where q1/q2 are the pixel counts at or below / above
/// t and u1/u2 the mean gray levels of those two classes. Pixels brighter
/// than the threshold become 0, all others become 1.
///
/// Sums are kept in 64-bit integers and means in double so that large images
/// neither overflow `int` nor lose precision, and empty classes are skipped
/// explicitly instead of dividing by zero.
///
/// @param ocr   supplies g_histogram (256 bins) and g_pixelBuffer
/// @param size  number of pixels in g_pixelBuffer
void  getBinaryImage(OCR *ocr, int size)
{
    const int       grayLevels = 256;
    const long long total      = size;

    // Sum of all gray values, weighted by how often each one occurs.
    long long weightedTotal = 0;

    for(int level = 0; level < grayLevels; level++)
        weightedTotal += static_cast<long long>(level) * ocr->g_histogram[level];

    long long countLow     = 0;   // pixels with gray level <= t
    long long weightedLow  = 0;   // sum of their gray values
    double    bestVariance = 0;
    int       threshold    = 0;

    for(int t = 0; t < grayLevels; t++)
    {
        countLow    += ocr->g_histogram[t];
        weightedLow += static_cast<long long>(t) * ocr->g_histogram[t];

        const long long countHigh = total - countLow;

        // No pixel at or below t yet: the split is not defined.
        if(countLow == 0)     continue;

        // Every pixel is already in the low class: no later t can split either.
        if(countHigh <= 0)    break;

        const double meanLow  = static_cast<double>(weightedLow) / countLow;
        const double meanHigh = static_cast<double>(weightedTotal - weightedLow) / countHigh;
        const double gap      = meanLow - meanHigh;

        const double variance = static_cast<double>(countLow) *
                static_cast<double>(countHigh) * gap * gap;

        if(variance > bestVariance)
        {
            threshold    = t;
            bestVariance = variance;
        }
    }

    for(int i = 0; i < size; i++)
    {
        ocr->g_pixelBuffer[i] = (ocr->g_pixelBuffer[i] > threshold) ? 0 : 1;
    }
}
/// Fills ocr->g_pixelBuffer with the gray level of every pixel (the integer
/// mean of its three channel values), counts the gray levels in
/// ocr->g_histogram and then binarizes the buffer via getBinaryImage().
///
/// @param image  source image; its first three channels are used
/// @param ocr    receives the gray buffer and the histogram
/// @param size   number of pixels to process
void GetBinaryImage(Mat &image ,OCR *ocr ,const int size)
{
    // Separate the interleaved image into one plane per channel.
    Mat planes[3];

    split(image, planes);

    const uchar *bluePlane  = planes[0].data;
    const uchar *greenPlane = planes[1].data;
    const uchar *redPlane   = planes[2].data;

    int *grayOut   = ocr->g_pixelBuffer;
    int *histogram = ocr->g_histogram;

    // Single pass: store the gray level and count it for the Otsu threshold.
    for(int px = 0; px < size; ++px)
    {
        const int gray = (redPlane[px] + greenPlane[px] + bluePlane[px]) / 3;

        grayOut[px] = gray;
        ++histogram[gray];
    }

    getBinaryImage (ocr, size);
}

void  dumpDataIntoFile (int *pixelBuffer, int collenth, int strow, int stcol,
        int enrow, int encols, char *output)
{
    int i, j, index;

    int cols = encols - stcol + 1;
    int rows = enrow  - strow + 1;

    ofstream myfile(output);

    myfile <<  "P1" << endl;

    myfile <<  cols  <<  " " << rows << endl;

    for(i = strow; i <= enrow ; i++ ) {
        for(j = stcol; j <= encols; j++ ) {

            index = i * collenth + j;

            if(pixelBuffer[index] != 0) {
                myfile << "1" << " ";
            } else {
                myfile << "0" << " ";
            }
        }
        myfile << endl;
    }
    myfile.close();
}


/// Draws the outline of a rectangle into a row-major pixel buffer by setting
/// the border pixels to 255.
///
/// All four coordinates are inclusive. The loops run up to and including
/// edRow / edCol so that the bottom-right corner pixel is drawn as well.
///
/// @param staCol              left column of the box
/// @param staRow              top row of the box
/// @param edCol               right column of the box
/// @param edRow               bottom row of the box
/// @param PixelBufferForWord  buffer to draw into
/// @param mainImageCol        row stride of the buffer in pixels
void displayBoundingBox(int staCol, int staRow, int edCol, int edRow , int *PixelBufferForWord,int mainImageCol)
{
    // Left and right edges.
    for(int row = staRow; row <= edRow; row++ ) {

        PixelBufferForWord[row * mainImageCol + staCol] = 255;
        PixelBufferForWord[row * mainImageCol + edCol]  = 255;
    }

    // Top and bottom edges.
    for(int col = staCol; col <= edCol; col++ ) {

        PixelBufferForWord[staRow * mainImageCol + col] = 255;
        PixelBufferForWord[edRow  * mainImageCol + col] = 255;
    }
}

Answer 1

问：置信度是如何计算的？

置信度算法计算的是被识别字符与可用字符之间的距离。下面这篇文档的 "linguistic analysis" 一节给出了您要找的答案：https://github.com/tesseract-ocr/docs/blob/master/tesseracticdar2007.pdf

从单词中提取字符时，为什么同一单词中相同字符的置信度值不同？

While extracting characters from a word, why are the confidence values of identical characters within that word different?

c++

ocr

tesseract

image-processing