将 pdf 页面呈现为图像 PDFBox 时缺少 PDAnnotationTextMarkup Java
PDAnnotationTextMarkup missing when rendering a pdf page to image PDFBox Java
我有使用 PDFBox API 编写的代码,它突出显示了 PDF 中的文字,但是当我将突出显示的 PDF 页面转换为图像时,我突出显示的任何内容都会从图像中消失。
下面的屏幕截图带有突出显示的文本,为了突出显示我使用了 PDFBox 的 PDAnnotationTextMarkup class:
Highlighted PDF Page
下图为pdf页面转为图片后的图片:
Highlighted PDF Page Image after converting
下面是我用来将 PDF 转换为图像的代码:
PDDocument document = PDDocument.load(new File(pdfFilename));
PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCounter = 0;
for (PDPage page : document.getPages())
{
BufferedImage bim = pdfRenderer.renderImageWithDPI(pageCounter, 300, ImageType.RGB);
ImageIOUtil.writeImage(bim, pdfFilename + "-" + (pageCounter++) + ".png", 300);
}
document.close();
请指出这里出了什么问题,为什么 PDFRenderer 无法获取 PDF 页面图像以及突出显示的红色框。
下面是我用来突出显示 PDF 文本的代码:
private void highlightText(String pdfFilePath, String highlightedPdfFilePath) {
try {
// Loading an existing document
File file = new File(highlightedPdfFilePath);
if (!file.exists()) {
file = new File(pdfFilePath);
}
PDDocument document = PDDocument.load(file);
// extended PDFTextStripper class
PDFTextStripper stripper = new PDFTextHighlighter();
// Get number of pages
int number_of_pages = document.getDocumentCatalog().getPages().getCount();
// The method writeText will invoke an override version of
// writeString
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
// Print collected information
System.out.println("tokenStream:::"+tokenStream);
System.out.println("tokenStream size::"+tokenStream.size());
System.out.println("coordinates size::"+coordinates.size());
double page_height;
double page_width;
double width, height, minx, maxx, miny, maxy;
int rotation;
// scan each page and highlitht all the words inside them
for (int page_index = 0; page_index < number_of_pages; page_index++) {
// get current page
PDPage page = document.getPage(page_index);
// Get annotations for the selected page
List<PDAnnotation> annotations = page.getAnnotations();
// Define a color to use for highlighting text
PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);
// Page height and width
page_height = page.getMediaBox().getHeight();
page_width = page.getMediaBox().getWidth();
// Scan collected coordinates
for (int i = 0; i < coordinates.size(); i++) {
if (!differencePgaeNumber.contains(page_index)) {
differencePgaeNumber.add(page_index);
}
// if the current coordinates are not related to the current
// page, ignore them
if ((int) coordinates.get(i)[4] != (page_index + 1))
continue;
else {
// get rotation of the page...portrait..landscape..
rotation = (int) coordinates.get(i)[7];
// page rotated of 90degrees
if (rotation == 90) {
height = coordinates.get(i)[5];
width = coordinates.get(i)[6];
width = (page_height * width) / page_width;
// define coordinates of a rectangle
maxx = coordinates.get(i)[1];
minx = coordinates.get(i)[1] - height;
miny = coordinates.get(i)[0];
maxy = coordinates.get(i)[0] + width;
} else // i should add here the cases -90/-180 degrees
{
height = coordinates.get(i)[5];
minx = coordinates.get(i)[0];
maxx = coordinates.get(i)[2];
miny = page_height - coordinates.get(i)[1];
maxy = page_height - coordinates.get(i)[3] + height;
}
// Add an annotation for each scanned word
PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(
PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
txtMark.setColor(red);
txtMark.setConstantOpacity((float) 0.3); // 30%
// transparent
PDRectangle position = new PDRectangle();
position.setLowerLeftX((float) minx);
position.setLowerLeftY((float) miny);
position.setUpperRightX((float) maxx);
position.setUpperRightY((float) ((float) maxy + height));
txtMark.setRectangle(position);
float[] quads = new float[8];
quads[0] = position.getLowerLeftX(); // x1
quads[1] = position.getUpperRightY() - 2; // y1
quads[2] = position.getUpperRightX(); // x2
quads[3] = quads[1]; // y2
quads[4] = quads[0]; // x3
quads[5] = position.getLowerLeftY() - 2; // y3
quads[6] = quads[2]; // x4
quads[7] = quads[5]; // y5
txtMark.setQuadPoints(quads);
txtMark.setContents(tokenStream.get(i).toString());
annotations.add(txtMark);
}
}
}
// Saving the document in a new file
File highlighted_doc = new File(highlightedPdfFilePath);
document.save(highlighted_doc);
document.close();
} catch (IOException e) {
System.out.println(e);
}
}
您需要通过此调用构建注释的视觉外观:
txtMark.constructAppearances(document);
我有使用 PDFBox API 编写的代码,它突出显示了 PDF 中的文字,但是当我将突出显示的 PDF 页面转换为图像时,我突出显示的任何内容都会从图像中消失。
下面的屏幕截图带有突出显示的文本,为了突出显示我使用了 PDFBox 的 PDAnnotationTextMarkup class:
Highlighted PDF Page
下图为pdf页面转为图片后的图片:
Highlighted PDF Page Image after converting
下面是我用来将 PDF 转换为图像的代码:
PDDocument document = PDDocument.load(new File(pdfFilename));
PDFRenderer pdfRenderer = new PDFRenderer(document);
int pageCounter = 0;
for (PDPage page : document.getPages())
{
BufferedImage bim = pdfRenderer.renderImageWithDPI(pageCounter, 300, ImageType.RGB);
ImageIOUtil.writeImage(bim, pdfFilename + "-" + (pageCounter++) + ".png", 300);
}
document.close();
请指出这里出了什么问题,为什么 PDFRenderer 无法获取 PDF 页面图像以及突出显示的红色框。
下面是我用来突出显示 PDF 文本的代码:
private void highlightText(String pdfFilePath, String highlightedPdfFilePath) {
try {
// Loading an existing document
File file = new File(highlightedPdfFilePath);
if (!file.exists()) {
file = new File(pdfFilePath);
}
PDDocument document = PDDocument.load(file);
// extended PDFTextStripper class
PDFTextStripper stripper = new PDFTextHighlighter();
// Get number of pages
int number_of_pages = document.getDocumentCatalog().getPages().getCount();
// The method writeText will invoke an override version of
// writeString
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
// Print collected information
System.out.println("tokenStream:::"+tokenStream);
System.out.println("tokenStream size::"+tokenStream.size());
System.out.println("coordinates size::"+coordinates.size());
double page_height;
double page_width;
double width, height, minx, maxx, miny, maxy;
int rotation;
// scan each page and highlitht all the words inside them
for (int page_index = 0; page_index < number_of_pages; page_index++) {
// get current page
PDPage page = document.getPage(page_index);
// Get annotations for the selected page
List<PDAnnotation> annotations = page.getAnnotations();
// Define a color to use for highlighting text
PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);
// Page height and width
page_height = page.getMediaBox().getHeight();
page_width = page.getMediaBox().getWidth();
// Scan collected coordinates
for (int i = 0; i < coordinates.size(); i++) {
if (!differencePgaeNumber.contains(page_index)) {
differencePgaeNumber.add(page_index);
}
// if the current coordinates are not related to the current
// page, ignore them
if ((int) coordinates.get(i)[4] != (page_index + 1))
continue;
else {
// get rotation of the page...portrait..landscape..
rotation = (int) coordinates.get(i)[7];
// page rotated of 90degrees
if (rotation == 90) {
height = coordinates.get(i)[5];
width = coordinates.get(i)[6];
width = (page_height * width) / page_width;
// define coordinates of a rectangle
maxx = coordinates.get(i)[1];
minx = coordinates.get(i)[1] - height;
miny = coordinates.get(i)[0];
maxy = coordinates.get(i)[0] + width;
} else // i should add here the cases -90/-180 degrees
{
height = coordinates.get(i)[5];
minx = coordinates.get(i)[0];
maxx = coordinates.get(i)[2];
miny = page_height - coordinates.get(i)[1];
maxy = page_height - coordinates.get(i)[3] + height;
}
// Add an annotation for each scanned word
PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(
PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
txtMark.setColor(red);
txtMark.setConstantOpacity((float) 0.3); // 30%
// transparent
PDRectangle position = new PDRectangle();
position.setLowerLeftX((float) minx);
position.setLowerLeftY((float) miny);
position.setUpperRightX((float) maxx);
position.setUpperRightY((float) ((float) maxy + height));
txtMark.setRectangle(position);
float[] quads = new float[8];
quads[0] = position.getLowerLeftX(); // x1
quads[1] = position.getUpperRightY() - 2; // y1
quads[2] = position.getUpperRightX(); // x2
quads[3] = quads[1]; // y2
quads[4] = quads[0]; // x3
quads[5] = position.getLowerLeftY() - 2; // y3
quads[6] = quads[2]; // x4
quads[7] = quads[5]; // y5
txtMark.setQuadPoints(quads);
txtMark.setContents(tokenStream.get(i).toString());
annotations.add(txtMark);
}
}
}
// Saving the document in a new file
File highlighted_doc = new File(highlightedPdfFilePath);
document.save(highlighted_doc);
document.close();
} catch (IOException e) {
System.out.println(e);
}
}
您需要通过此调用构建注释的视觉外观:
txtMark.constructAppearances(document);