检测上下颠倒的 pdf 页面

detect pdf pages that are upside down

我们正在使用 php、pypdfocr 和 pdftotext 进行 OCR,并从扫描或传真给我们的文档中提取文本。问题是文档被扫描或传真时上下颠倒,或者某些页面是横向阅读的(因此页面上的文本旋转了 90 度)

我尝试过的事情:

为具有 90 度文本的页面生成的 OCR 文本层还不错,但是颠倒的页面,它对每个单词进行 OCR 并将其翻转到位,这样如果 'This is a test' 出现在文档中但颠倒过来,文本层可能会显示 'test a is This'

如果有办法检测页面是否上下颠倒,我就可以先用 pdftk 旋转页面,然后再对其运行 OCR(或者,如果页面已经做过 OCR,我可以先删除文本层,用 pdftk 旋转后再重新运行一次 OCR)。

此时可以从 linux CLI 执行的任何解决方案都是可行的解决方案。

您可以使用 tesseract (>=3.03 ?) 轻松获取有关页面方向的信息。例如

$ tesseract image.png -  -psm 0

将产生此输出

Orientation: 3
Orientation in degrees: 90
Orientation confidence: 25.40
Script: 1 
Script confidence: 18.40

根据此信息,您可以调整图像的旋转。关于如何在 Python 中执行此操作,可以参考脚本 “Fix image rotation with tesseract”。

我遇到了同样的问题。我的解决方法是创建一个简单的 C++ 应用程序,它将 PNG 文件名作为参数,并自动对该图像进行旋转和纠偏(rotate/deskew)。

我的代码是

#include <iostream>
#include <cmath>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>

using namespace std;

int main(int argc, char **argv)
{

    if (argc != 2) {
        cerr << "usage: " << argv[0] << " <image>\n";
        exit(1);
    }

    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    // Initialize tesseract-ocr with English, without specifying tessdata path
    if (api->Init(NULL, "eng")) {
        cerr << "Could not initialize tesseract.\n";
        exit(2);
    }

    const char* inputfile = argv[1];
    tesseract::Orientation orientation;
    tesseract::WritingDirection direction;
    tesseract::TextlineOrder order;
    float deskew_angle;

    PIX *image = pixRead(inputfile);
    if (image == NULL) {
        cerr << "could not open " << inputfile << endl;
        return -2;
    }

    api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
    api->SetImage(image);
    api->Recognize(0);

    tesseract::PageIterator* it =  api->AnalyseLayout();
    it->Orientation(&orientation, &direction, &order, &deskew_angle);
    cout << "Orientation: " << orientation << 
            "\nWritingDirection: " << direction <<
            "\nTextlineOrder: " << order << 
            "\nDeskew angle: " << deskew_angle << "\n";

    PIX* pixd = NULL;
    switch (orientation) {
        case 0:
            cout << "image in the correct position, nothing to do\n";
            if (fabs(deskew_angle) > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(image, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
        case 1:
            cout << "rotating image by 270 degrees\n";
            pixd = pixRotate90(image, -1);
            if (deskew_angle > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
        case 2:
            cout << "rotating image by 180 degrees\n";
            pixd = pixRotate180(NULL, image);
            if (deskew_angle > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
        case 3:
            cout << "rotating image by 90 degrees\n";
            pixd = pixRotate90(image, 1);
            if (deskew_angle > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
    }

    pixDestroy(&image);

    if (pixd != NULL) {
        pixWrite(inputfile, pixd, IFF_PNG);
        pixDestroy(&pixd);
    }

    return 0;
}

你可以用以下命令编译它:

g++ -o tesseract_fixposition tesseract_fixposition.cpp -llept -ltesseract

依赖项是 libtesseract 和 libleptonica。我使用 Tesseract 版本 3.03 和 3.04 以及 Leptonica 1.72 进行了测试。我处理了几千张图片,没有发现任何错误的识别。

希望对您有所帮助!

如果速度有问题,您不需要使用 tesseract 来修复页面方向。您可以只使用 leptonica 函数。像这样:

/*
 * Compile with:
 *     g++ fixorientation.cpp -o fixorientation -llept
 *
 */

#include <cstdio>
#include <cstring>
#include <string>

#include <leptonica/allheaders.h>

int main(int argc, char *argv[]) {
    const char* filename = NULL;
    const char* outfile = NULL;
    l_int32   orient, format;
    l_int32  alt_rot = -1;
    l_float32 upconf1, leftconf1;
    PIX       *fpixs, *pixs;

    if (argc < 1) {
        fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
        return(1);
    } else {
        for (int i = 1; i < argc; i++) {
            if (i + 1 < argc) {
                if (strcmp(argv[i], "-f") == 0) {
                    filename = argv[i + 1];
                } else if (strcmp(argv[i], "-o") == 0) {
                    outfile = argv[i + 1];
                }
            }
        }
    }

    if (filename) {
        pixs = pixRead(filename);
    } else {
        fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
        return(1);
    }

    if (pixs == NULL) {
        fprintf(stderr, "Unsupported image type.\n");
        return(3);
    }
    format = pixGetInputFormat(pixs);

    fpixs = pixConvertTo1(pixs, 130);
    pixOrientDetect(fpixs, &upconf1, &leftconf1, 0, 0);
    makeOrientDecision(upconf1, leftconf1, 0, 0, &orient, 1);

    if (orient == L_TEXT_ORIENT_UNKNOWN) {
        fprintf(stdout, "Confidence is low; no determination is made. "
                "But maybe there is %1 deg rotation.\n", alt_rot);
    } else if (orient == L_TEXT_ORIENT_UP) {
        fprintf(stdout, "Text is rightside-up\n");
        alt_rot = 0;
    } else if (orient == L_TEXT_ORIENT_LEFT) {
        fprintf(stdout, "Text is rotated 90 deg ccw\n");
        alt_rot = 1;
    } else if (orient == L_TEXT_ORIENT_DOWN) {
        fprintf(stdout, "Text is upside-down\n");
        alt_rot = 2;
    } else {  /* orient == L_TEXT_ORIENT_RIGHT */
        fprintf(stdout, "Text is rotated 90 deg cw\n");
        alt_rot = 3;
    }

    if (alt_rot > -1) {
        fpixs = pixRotateOrth(pixs, alt_rot);
        if (outfile) {
            pixWrite(outfile, fpixs, format);
        } else {
            char savefile[strlen("fixed_") + strlen(filename) + 1];
            strcpy(savefile, "fixed_");
            strcat(savefile, filename);
            fprintf(stdout, "Output save to %s\n", savefile);
            pixWrite(savefile, fpixs, format);

        }
    } else {
        return(2);
    }
    pixDestroy(&fpixs);
    pixDestroy(&pixs);
    return(0);
}