检测上下颠倒的 pdf 页面
detect pdf pages that are upside down
我们正在使用 php、pypdfocr 和 pdftotext 进行 OCR,并从扫描或传真给我们的文档中提取文本。问题是文档被扫描或传真时上下颠倒,或者某些页面是横向阅读的(因此页面上的文本旋转了 90 度)
我尝试过的事情:
- 在 tessdata 目录中执行 cp eng.traineddata osd.traineddata
为具有 90 度文本的页面生成的 OCR 文本层还不错,但是颠倒的页面,它对每个单词进行 OCR 并将其翻转到位,这样如果 'This is a test' 出现在文档中但颠倒过来,文本层可能会显示 'test a is This'
如果有办法检测页面是否颠倒,我可以先使用 pdftk 旋转页面,然后再运行 OCR(或者,如果页面已经做过 OCR,我可以删除文本层,在使用 pdftk 旋转之后重新运行 OCR)
此时可以从 linux CLI 执行的任何解决方案都是可行的解决方案。
您可以使用 tesseract (>=3.03 ?) 轻松获取有关页面方向的信息。例如
$ tesseract image.png - -psm 0
将产生此输出
Orientation: 3
Orientation in degrees: 90
Orientation confidence: 25.40
Script: 1
Script confidence: 18.40
根据此信息,您可以调整图像的旋转。关于如何在 Python 中执行此操作,可以参考脚本 "Fix image rotation with tesseract"。
我遇到了同样的问题。我的解决方法是创建一个简单的 C++ 应用程序,它将 PNG 文件名作为参数,并自动对该图像进行旋转和纠偏(rotate/deskew)。
我的代码是
#include <iostream>
#include <cmath>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
using namespace std;
int main(int argc, char **argv)
{
if (argc != 2) {
cerr << "usage: " << argv[0] << " <image>\n";
exit(1);
}
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
// Initialize tesseract-ocr with English, without specifying tessdata path
if (api->Init(NULL, "eng")) {
cerr << "Could not initialize tesseract.\n";
exit(2);
}
const char* inputfile = argv[1];
tesseract::Orientation orientation;
tesseract::WritingDirection direction;
tesseract::TextlineOrder order;
float deskew_angle;
PIX *image = pixRead(inputfile);
if (image == NULL) {
cerr << "could not open " << inputfile << endl;
return -2;
}
api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
api->SetImage(image);
api->Recognize(0);
tesseract::PageIterator* it = api->AnalyseLayout();
it->Orientation(&orientation, &direction, &order, &deskew_angle);
cout << "Orientation: " << orientation <<
"\nWritingDirection: " << direction <<
"\nTextlineOrder: " << order <<
"\nDeskew angle: " << deskew_angle << "\n";
PIX* pixd = NULL;
switch (orientation) {
case 0:
cout << "image in the correct position, nothing to do\n";
if (fabs(deskew_angle) > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(image, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
case 1:
cout << "rotating image by 270 degrees\n";
pixd = pixRotate90(image, -1);
if (deskew_angle > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
case 2:
cout << "rotating image by 180 degrees\n";
pixd = pixRotate180(NULL, image);
if (deskew_angle > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
case 3:
cout << "rotating image by 90 degrees\n";
pixd = pixRotate90(image, 1);
if (deskew_angle > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
}
pixDestroy(&image);
if (pixd != NULL) {
pixWrite(inputfile, pixd, IFF_PNG);
pixDestroy(&pixd);
}
return 0;
}
你可以用以下命令编译它:
g++ -o tesseract_fixposition tesseract_fixposition.cpp -llept -ltesseract
依赖项是 libtesseract 和 libleptonica。我使用 Tesseract 版本 3.03 和 3.04 以及 Leptonica 1.72 进行了测试。我处理了几千张图片,没有发现任何错误的识别。
希望对您有所帮助!
如果速度有问题,您不需要使用 tesseract 来修复页面方向。您可以只使用 leptonica 函数。像这样:
/*
* Compile with:
* g++ fixorientation.cpp -o fixorientation -llept
*
*/
#include <cstring>
#include <leptonica/allheaders.h>
int main(int argc, char *argv[]) {
const char* filename = NULL;
const char* outfile = NULL;
l_int32 orient, format;
l_int32 alt_rot = -1;
l_float32 upconf1, leftconf1;
PIX *fpixs, *pixs;
if (argc < 1) {
fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
return(1);
} else {
for (int i = 1; i < argc; i++) {
if (i + 1 < argc) {
if (strcmp(argv[i], "-f") == 0) {
filename = argv[i + 1];
} else if (strcmp(argv[i], "-o") == 0) {
outfile = argv[i + 1];
}
}
}
}
if (filename) {
pixs = pixRead(filename);
} else {
fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
return(1);
}
if (pixs == NULL) {
fprintf(stderr, "Unsupported image type.\n");
return(3);
}
format = pixGetInputFormat(pixs);
fpixs = pixConvertTo1(pixs, 130);
pixOrientDetect(fpixs, &upconf1, &leftconf1, 0, 0);
makeOrientDecision(upconf1, leftconf1, 0, 0, &orient, 1);
if (orient == L_TEXT_ORIENT_UNKNOWN) {
fprintf(stdout, "Confidence is low; no determination is made. "
"But maybe there is %1 deg rotation.\n", alt_rot);
} else if (orient == L_TEXT_ORIENT_UP) {
fprintf(stdout, "Text is rightside-up\n");
alt_rot = 0;
} else if (orient == L_TEXT_ORIENT_LEFT) {
fprintf(stdout, "Text is rotated 90 deg ccw\n");
alt_rot = 1;
} else if (orient == L_TEXT_ORIENT_DOWN) {
fprintf(stdout, "Text is upside-down\n");
alt_rot = 2;
} else { /* orient == L_TEXT_ORIENT_RIGHT */
fprintf(stdout, "Text is rotated 90 deg cw\n");
alt_rot = 3;
}
if (alt_rot > -1) {
fpixs = pixRotateOrth(pixs, alt_rot);
if (outfile) {
pixWrite(outfile, fpixs, format);
} else {
char savefile[strlen("fixed_") + strlen(filename) + 1];
strcpy(savefile, "fixed_");
strcat(savefile, filename);
fprintf(stdout, "Output save to %s\n", savefile);
pixWrite(savefile, fpixs, format);
}
} else {
return(2);
}
pixDestroy(&fpixs);
pixDestroy(&pixs);
return(0);
}
我们正在使用 php、pypdfocr 和 pdftotext 进行 OCR,并从扫描或传真给我们的文档中提取文本。问题是文档被扫描或传真时上下颠倒,或者某些页面是横向阅读的(因此页面上的文本旋转了 90 度)
我尝试过的事情:
- 在 tessdata 目录中执行 cp eng.traineddata osd.traineddata
为具有 90 度文本的页面生成的 OCR 文本层还不错,但是颠倒的页面,它对每个单词进行 OCR 并将其翻转到位,这样如果 'This is a test' 出现在文档中但颠倒过来,文本层可能会显示 'test a is This'
如果有办法检测页面是否颠倒,我可以先使用 pdftk 旋转页面,然后再运行 OCR(或者,如果页面已经做过 OCR,我可以删除文本层,在使用 pdftk 旋转之后重新运行 OCR)
此时可以从 linux CLI 执行的任何解决方案都是可行的解决方案。
您可以使用 tesseract (>=3.03 ?) 轻松获取有关页面方向的信息。例如
$ tesseract image.png - -psm 0
将产生此输出
Orientation: 3
Orientation in degrees: 90
Orientation confidence: 25.40
Script: 1
Script confidence: 18.40
根据此信息,您可以调整图像的旋转。关于如何在 Python 中执行此操作,可以参考脚本 "Fix image rotation with tesseract"。
我遇到了同样的问题。我的解决方法是创建一个简单的 C++ 应用程序,它将 PNG 文件名作为参数,并自动对该图像进行旋转和纠偏(rotate/deskew)。
我的代码是
#include <iostream>
#include <cmath>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
using namespace std;
int main(int argc, char **argv)
{
if (argc != 2) {
cerr << "usage: " << argv[0] << " <image>\n";
exit(1);
}
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
// Initialize tesseract-ocr with English, without specifying tessdata path
if (api->Init(NULL, "eng")) {
cerr << "Could not initialize tesseract.\n";
exit(2);
}
const char* inputfile = argv[1];
tesseract::Orientation orientation;
tesseract::WritingDirection direction;
tesseract::TextlineOrder order;
float deskew_angle;
PIX *image = pixRead(inputfile);
if (image == NULL) {
cerr << "could not open " << inputfile << endl;
return -2;
}
api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
api->SetImage(image);
api->Recognize(0);
tesseract::PageIterator* it = api->AnalyseLayout();
it->Orientation(&orientation, &direction, &order, &deskew_angle);
cout << "Orientation: " << orientation <<
"\nWritingDirection: " << direction <<
"\nTextlineOrder: " << order <<
"\nDeskew angle: " << deskew_angle << "\n";
PIX* pixd = NULL;
switch (orientation) {
case 0:
cout << "image in the correct position, nothing to do\n";
if (fabs(deskew_angle) > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(image, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
case 1:
cout << "rotating image by 270 degrees\n";
pixd = pixRotate90(image, -1);
if (deskew_angle > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
case 2:
cout << "rotating image by 180 degrees\n";
pixd = pixRotate180(NULL, image);
if (deskew_angle > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
case 3:
cout << "rotating image by 90 degrees\n";
pixd = pixRotate90(image, 1);
if (deskew_angle > 0.0001f) {
cout << "deskewing...\n";
pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
}
break;
}
pixDestroy(&image);
if (pixd != NULL) {
pixWrite(inputfile, pixd, IFF_PNG);
pixDestroy(&pixd);
}
return 0;
}
你可以用以下命令编译它:
g++ -o tesseract_fixposition tesseract_fixposition.cpp -llept -ltesseract
依赖项是 libtesseract 和 libleptonica。我使用 Tesseract 版本 3.03 和 3.04 以及 Leptonica 1.72 进行了测试。我处理了几千张图片,没有发现任何错误的识别。
希望对您有所帮助!
如果速度有问题,您不需要使用 tesseract 来修复页面方向。您可以只使用 leptonica 函数。像这样:
/*
* Compile with:
* g++ fixorientation.cpp -o fixorientation -llept
*
*/
#include <cstring>
#include <leptonica/allheaders.h>
int main(int argc, char *argv[]) {
const char* filename = NULL;
const char* outfile = NULL;
l_int32 orient, format;
l_int32 alt_rot = -1;
l_float32 upconf1, leftconf1;
PIX *fpixs, *pixs;
if (argc < 1) {
fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
return(1);
} else {
for (int i = 1; i < argc; i++) {
if (i + 1 < argc) {
if (strcmp(argv[i], "-f") == 0) {
filename = argv[i + 1];
} else if (strcmp(argv[i], "-o") == 0) {
outfile = argv[i + 1];
}
}
}
}
if (filename) {
pixs = pixRead(filename);
} else {
fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
return(1);
}
if (pixs == NULL) {
fprintf(stderr, "Unsupported image type.\n");
return(3);
}
format = pixGetInputFormat(pixs);
fpixs = pixConvertTo1(pixs, 130);
pixOrientDetect(fpixs, &upconf1, &leftconf1, 0, 0);
makeOrientDecision(upconf1, leftconf1, 0, 0, &orient, 1);
if (orient == L_TEXT_ORIENT_UNKNOWN) {
fprintf(stdout, "Confidence is low; no determination is made. "
"But maybe there is %1 deg rotation.\n", alt_rot);
} else if (orient == L_TEXT_ORIENT_UP) {
fprintf(stdout, "Text is rightside-up\n");
alt_rot = 0;
} else if (orient == L_TEXT_ORIENT_LEFT) {
fprintf(stdout, "Text is rotated 90 deg ccw\n");
alt_rot = 1;
} else if (orient == L_TEXT_ORIENT_DOWN) {
fprintf(stdout, "Text is upside-down\n");
alt_rot = 2;
} else { /* orient == L_TEXT_ORIENT_RIGHT */
fprintf(stdout, "Text is rotated 90 deg cw\n");
alt_rot = 3;
}
if (alt_rot > -1) {
fpixs = pixRotateOrth(pixs, alt_rot);
if (outfile) {
pixWrite(outfile, fpixs, format);
} else {
char savefile[strlen("fixed_") + strlen(filename) + 1];
strcpy(savefile, "fixed_");
strcat(savefile, filename);
fprintf(stdout, "Output save to %s\n", savefile);
pixWrite(savefile, fpixs, format);
}
} else {
return(2);
}
pixDestroy(&fpixs);
pixDestroy(&pixs);
return(0);
}