从注释(PdfTextMarkupAnnotation)创建一个矩形,以提取突出显示的内容?
Create a Rectangle from an Annotation (PdfTextMarkupAnnotation) to extract highlighted content?
我想获取我在 PDF 文件中标记(高亮)的文本。我遍历 PdfPage
的 PdfAnnotation
。annotation 有一个方法 getRectangle()
,它返回一个 PdfArray
。我无法从这个 PdfArray 创建出一个 Rectangle
对象(运行时实例),使它位置正确并恰好覆盖注释所标记的文本。
借助注释中的 Rectangle
,我想通过 LocationTextExtractionStrategy 过滤出被标记的内容。
我写了下面的代码来用 iText 获取它:
package biz.hochguertel;
import com.itextpdf.kernel.color.DeviceCmyk;
import com.itextpdf.kernel.events.Event;
import com.itextpdf.kernel.events.IEventHandler;
import com.itextpdf.kernel.events.PdfDocumentEvent;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.*;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * First attempt at extracting the text covered by text-markup annotations with iText 7.
 *
 * <p>Each page of the input PDF is copied into an output document; for every annotation that
 * has contents and is a {@link PdfTextMarkupAnnotation}, a rectangle is built from the
 * annotation's rectangle array, painted onto the copied page and used for text extraction.
 *
 * <p>NOTE(review): this is the version that does not yet produce the expected text; see the
 * review notes inside {@code process()}.
 */
public class AppIText {
// Path of the input PDF, resolved from the classpath when the instance is created.
private String filePath = getClass().getClassLoader().getResource("itext/OCA/549_OCA_Java_SE_7_Programmer_I_Certification.pdf").getFile();
// Path of the output PDF that receives the copied pages and the painted rectangles.
private static String DEST = "demo-output/549_OCA_Java_SE_7_Programmer_I_Certification.pdf";
// Source document (read only).
private PdfDocument pdfDocument;
// Destination document (written to DEST).
private PdfDocument pdfWriteDoc;
/**
 * Prepares the output location and opens both documents.
 *
 * @throws IOException if the reader or writer cannot be opened
 */
public void before() throws IOException {
File file = new File(DEST);
// Results of mkdir() and delete() are not checked here.
file.getParentFile().mkdir();
if (file.exists()) {
file.delete();
}
pdfDocument = new PdfDocument(new PdfReader(filePath));
pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
}
/**
 * Entry point: opens the documents, processes all pages, then closes the documents.
 */
public static void main(String[] args) throws IOException {
AppIText appIText = new AppIText();
appIText.before();
appIText.process();
// Not reached if process() throws, in which case neither document is closed.
appIText.close();
}
/** Closes the source document first, then the destination document. */
private void close() {
pdfDocument.close();
pdfWriteDoc.close();
}
/**
 * Walks all pages, copies each one to the output document and handles its annotations.
 */
private void process() {
// iText page numbers are 1-based.
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
PdfPage page = pdfDocument.getPage(i);
List<PdfPage> newPdfPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
// newPage stays null if no page was copied; it is passed to PdfCanvas below without a check.
PdfPage newPage = null;
if (newPdfPages.size() > 0) {
newPage = newPdfPages.get(0);
}
List<PdfAnnotation> annotations = page.getAnnotations();
for (PdfAnnotation annotation : annotations) {
// Annotations without contents are skipped entirely.
if (annotation.getContents() != null) {
System.out.println(annotation.getContents());
if (annotation instanceof PdfTextMarkupAnnotation) {
// Raw four-number rectangle array of the annotation.
PdfArray rectangleArray = annotation.getRectangle();
double x = ((PdfNumber) rectangleArray.get(0)).getValue();
double y = ((PdfNumber) rectangleArray.get(1)).getValue();
// NOTE(review): elements 2 and 3 are treated as width and height here. If the array holds
// two corner points (lower-left x/y, upper-right x/y), as a PDF /Rect entry normally does,
// the rectangle built below is far too large - TODO confirm against the PDF specification.
double xWidth = ((PdfNumber) rectangleArray.get(2)).getValue();
double yWidth = ((PdfNumber) rectangleArray.get(3)).getValue();
System.out.println(String.format("x=%s,y=%s,w=%s,h=%s", x, y, xWidth, yWidth));
Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);
// Paint the computed rectangle onto the copied page so the region can be inspected visually.
PdfCanvas canvas = new PdfCanvas(newPage);
canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
.rectangle(rectangle)
.fillStroke()
;
// NOTE(review): FontFilter.accept() as declared in this file decides by font name only and
// never consults the rectangle, so the region has no influence on the extracted text.
FontFilter fontFilter = new FontFilter(rectangle);
FilteredEventListener listener = new FilteredEventListener();
LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter);
new PdfCanvasProcessor(listener).processPageContent(page);
// The extracted text is stored but never printed or returned.
String actualText = extractionStrategy.getResultantText();
}
}
}
}
}
}
/**
 * Document event handler that paints one fixed rectangle (x=20, y=10, width=10, height=820)
 * on the page the event refers to.
 */
class RectangleEventHandler implements IEventHandler {

    /**
     * Draws the rectangle into the last content stream of the event's page.
     *
     * @param event the event to handle; it is cast to {@link PdfDocumentEvent}
     */
    @Override
    public void handleEvent(Event event) {
        final PdfDocumentEvent documentEvent = (PdfDocumentEvent) event;
        final PdfDocument document = documentEvent.getDocument();
        final PdfPage targetPage = documentEvent.getPage();

        final PdfCanvas painter =
                new PdfCanvas(targetPage.getLastContentStream(), targetPage.getResources(), document);
        // Same three canvas operations as a fluent chain, issued one per statement.
        painter.setFillColor(new DeviceCmyk(1, 0, 0, 0));
        painter.rectangle(new Rectangle(20, 10, 10, 820));
        painter.fillStroke();
    }
}
/**
 * Event filter that accepts only text-render events whose font name ends with "Bold" or
 * "Oblique".
 *
 * <p>The rectangle is forwarded to the superclass constructor, but {@link #accept} is fully
 * overridden and does not delegate to the superclass, so the region is not evaluated here.
 */
class FontFilter extends TextRegionEventFilter {

    /**
     * @param filterRect rectangle handed to {@link TextRegionEventFilter}
     */
    public FontFilter(Rectangle filterRect) {
        super(filterRect);
    }

    /**
     * @param data the event payload; cast to {@link TextRenderInfo} for text-render events
     * @param type the kind of event being filtered
     * @return {@code true} only for a text-render event with a non-null font whose name ends
     *         with "Bold" or "Oblique"; {@code false} in every other case
     */
    @Override
    public boolean accept(IEventData data, EventType type) {
        if (!type.equals(EventType.RENDER_TEXT)) {
            return false;
        }
        final PdfFont textFont = ((TextRenderInfo) data).getFont();
        if (textFont == null) {
            return false;
        }
        final String name = textFont.getFontProgram().getFontNames().getFontName();
        if (name.endsWith("Bold")) {
            return true;
        }
        return name.endsWith("Oblique");
    }
}
- 如何创建一个与标记区域匹配的矩形以仅从 pdf 中提取标记(突出显示)的文本?
- 或者是否有其他方法从 pdf 中获取带有注释的标记文本?
上面代码中最关键的部分是下面这一行:
Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);
我找到了解决方案。
矩形计算(getRectangle() 返回的数组依次记为 x、y、width、height,其中后两个值实际上是右上角的坐标):
x:annotation.x
y:annotation.y
宽度:annotation.width - annotation.x
高度:annotation.height - annotation.y
我现在得到的是:
可视化调试(如果 LOG_LEVEL >= 100):
提取的内容:
13:50:01.323 [main] INFO b.h.AppIText - Annotation contents: q(7.1).explain(1)
13:50:01.323 [main] INFO b.h.AppIText - rectangleArray: x=90.0338, y=438.245, w=468.33, h=489.749
13:50:01.323 [main] INFO b.h.AppIText - pageSizeWithRotation: x=0.0, y=0.0, w=531.0, h=666.0, top=666.0, bottom=0.0, left=0.0, right=531.0
13:50:01.337 [main] INFO b.h.AppIText - str: Purpose: A finally block can’t be placed before the catch blocks. <cut here because the book is not free, but I get the complete marked text..>
我修正后的代码如下:
package biz.hochguertel;
import com.itextpdf.kernel.color.DeviceCmyk;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.*;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * Extracts the text covered by text-markup (highlight) annotations of a PDF with iText 7.
 *
 * <p>Every page of the input document is copied to an output document. For each annotation
 * that has contents and is a {@link PdfTextMarkupAnnotation}, the annotation rectangle is
 * converted into a {@link Rectangle} and the text inside that region is extracted and logged.
 * When {@code LOG_LEVEL >= VISUAL_DEBUG} the region is also painted onto the copied page.
 *
 * <p>With the help of the following documentations:
 * - http://developers.itextpdf.com/content/best-itext-questions-stackoverview/content-parsing-extraction-and-redaction-text/itext7-how-read-text-specific-position
 */
public class AppIText {
    private static final Logger LOGGER = LoggerFactory.getLogger(AppIText.class);
    /** Debug switch; deliberately not final so it can be raised to {@link #VISUAL_DEBUG}. */
    private static int LOG_LEVEL = 0;
    private final static int VISUAL_DEBUG = 100;
    /** Classpath location of the input PDF. */
    private static final String SOURCE_RESOURCE = "itext/OCA/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    /** Path of the output PDF (page copies, plus painted regions in visual-debug mode). */
    private static final String DEST = "demo-output/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    /** File-system path of the input PDF; resolved in {@link #before()}. */
    private String filePath;
    private PdfDocument pdfDocument;
    private PdfDocument pdfWriteDoc;

    /**
     * Resolves the input PDF, prepares the output location and opens both documents.
     *
     * @throws IOException if the input resource is missing, the output directory cannot be
     *         created, a stale output file cannot be removed, or a document cannot be opened
     */
    public void before() throws IOException {
        // Fail with a clear message instead of a NullPointerException when the resource is absent.
        java.net.URL resource = getClass().getClassLoader().getResource(SOURCE_RESOURCE);
        if (resource == null) {
            throw new IOException("Input PDF not found on the classpath: " + SOURCE_RESOURCE);
        }
        filePath = resource.getFile();

        File file = new File(DEST);
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("Could not create output directory: " + parent);
        }
        if (file.exists() && !file.delete()) {
            throw new IOException("Could not delete stale output file: " + file);
        }

        pdfDocument = new PdfDocument(new PdfReader(filePath));
        boolean writerOpened = false;
        try {
            pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
            writerOpened = true;
        } finally {
            // Do not leak the already opened reader when the writer cannot be created.
            if (!writerOpened) {
                pdfDocument.close();
            }
        }
    }

    /**
     * Entry point: opens the documents and processes them; {@code process()} closes them.
     */
    public static void main(String[] args) throws IOException {
        AppIText appIText = new AppIText();
        appIText.before();
        appIText.process();
    }

    /**
     * Copies every page to the output document and extracts the text of each text-markup
     * annotation that has contents. Both documents are closed afterwards, also on failure.
     */
    private void process() {
        try {
            // iText page numbers are 1-based.
            for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
                PdfPage page = pdfDocument.getPage(i);
                List<PdfPage> copiedPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
                PdfPage newPage = copiedPages.isEmpty() ? null : copiedPages.get(0);
                for (PdfAnnotation annotation : page.getAnnotations()) {
                    if (annotation.getContents() == null) {
                        continue;
                    }
                    System.out.println();
                    LOGGER.info("Annotation contents: {}", annotation.getContents());
                    if (annotation instanceof PdfTextMarkupAnnotation) {
                        extractMarkedText(page, newPage, annotation);
                    }
                }
            }
        } finally {
            closeDocuments();
        }
    }

    /** Logs the geometry of one markup annotation and the text found inside its rectangle. */
    private void extractMarkedText(PdfPage page, PdfPage newPage, PdfAnnotation annotation) {
        PdfArray rectangleArray = annotation.getRectangle();
        if (rectangleArray == null || rectangleArray.size() < 4) {
            LOGGER.warn("Annotation '{}' has no usable rectangle, skipping", annotation.getContents());
            return;
        }
        // The labels are kept as they were; the last two values are the upper-right corner.
        LOGGER.info("rectangleArray: x={}, y={}, w={}, h={}",
                rectangleArray.get(0),
                rectangleArray.get(1),
                rectangleArray.get(2),
                rectangleArray.get(3)
        );
        Rectangle cropBox = page.getCropBox();
        LOGGER.info("pageSizeWithRotation: x={}, y={}, w={}, h={}, top={}, bottom={}, left={}, right={}",
                cropBox.getX(),
                cropBox.getY(),
                cropBox.getWidth(),
                cropBox.getHeight(),
                cropBox.getTop(),
                cropBox.getBottom(),
                cropBox.getLeft(),
                cropBox.getRight()
        );

        Rectangle rectangle = toRectangle(rectangleArray);
        if (rectangle == null) {
            LOGGER.warn("Annotation '{}' has a non-numeric rectangle, skipping", annotation.getContents());
            return;
        }

        // Optional visual check: paint the region on the copied page, if one exists.
        if (LOG_LEVEL >= VISUAL_DEBUG && newPage != null) {
            PdfCanvas canvas = new PdfCanvas(newPage);
            canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
                    .rectangle(rectangle)
                    .fillStroke();
        }

        TextRegionEventFilter regionFilter = new TextRegionEventFilter(rectangle);
        ITextExtractionStrategy strategy = new FilteredTextEventListener(new LocationTextExtractionStrategy(), regionFilter);
        String str = PdfTextExtractor.getTextFromPage(page, strategy) + "\n";
        LOGGER.info("str: {}", str);
    }

    /**
     * Converts a four-number array holding two opposite corners into a {@link Rectangle}
     * (x, y, width, height). Example: [90.0338 438.245 468.33 489.749] yields
     * width 468.33 - 90.0338 and height 489.749 - 438.245.
     *
     * @return the rectangle, or {@code null} if one of the first four entries is not a number
     */
    private static Rectangle toRectangle(PdfArray corners) {
        PdfNumber x1 = corners.getAsNumber(0);
        PdfNumber y1 = corners.getAsNumber(1);
        PdfNumber x2 = corners.getAsNumber(2);
        PdfNumber y2 = corners.getAsNumber(3);
        if (x1 == null || y1 == null || x2 == null || y2 == null) {
            return null;
        }
        // Normalise so that width and height are never negative, whichever corners are stored.
        float left = Math.min(x1.floatValue(), x2.floatValue());
        float bottom = Math.min(y1.floatValue(), y2.floatValue());
        float width = Math.abs(x2.floatValue() - x1.floatValue());
        float height = Math.abs(y2.floatValue() - y1.floatValue());
        return new Rectangle(left, bottom, width, height);
    }

    /** Closes the source document, then the destination document even if the first close fails. */
    private void closeDocuments() {
        try {
            pdfDocument.close();
        } finally {
            pdfWriteDoc.close();
        }
    }
}
我想获取我在 PDF 文件中标记(高亮)的文本。我遍历 PdfPage
的 PdfAnnotation
。annotation 有一个方法 getRectangle()
,它返回一个 PdfArray
。我无法从这个 PdfArray 创建出一个 Rectangle
对象(运行时实例),使它位置正确并恰好覆盖注释所标记的文本。
借助注释中的 Rectangle
,我想通过 LocationTextExtractionStrategy 过滤出被标记的内容。
我写了下面的代码来用 iText 获取它:
package biz.hochguertel;
import com.itextpdf.kernel.color.DeviceCmyk;
import com.itextpdf.kernel.events.Event;
import com.itextpdf.kernel.events.IEventHandler;
import com.itextpdf.kernel.events.PdfDocumentEvent;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.*;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * First attempt at extracting the text covered by text-markup annotations with iText 7.
 *
 * <p>Each page of the input PDF is copied into an output document; for every annotation that
 * has contents and is a {@link PdfTextMarkupAnnotation}, a rectangle is built from the
 * annotation's rectangle array, painted onto the copied page and used for text extraction.
 *
 * <p>NOTE(review): this is the version that does not yet produce the expected text; see the
 * review notes inside {@code process()}.
 */
public class AppIText {
// Path of the input PDF, resolved from the classpath when the instance is created.
private String filePath = getClass().getClassLoader().getResource("itext/OCA/549_OCA_Java_SE_7_Programmer_I_Certification.pdf").getFile();
// Path of the output PDF that receives the copied pages and the painted rectangles.
private static String DEST = "demo-output/549_OCA_Java_SE_7_Programmer_I_Certification.pdf";
// Source document (read only).
private PdfDocument pdfDocument;
// Destination document (written to DEST).
private PdfDocument pdfWriteDoc;
/**
 * Prepares the output location and opens both documents.
 *
 * @throws IOException if the reader or writer cannot be opened
 */
public void before() throws IOException {
File file = new File(DEST);
// Results of mkdir() and delete() are not checked here.
file.getParentFile().mkdir();
if (file.exists()) {
file.delete();
}
pdfDocument = new PdfDocument(new PdfReader(filePath));
pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
}
/**
 * Entry point: opens the documents, processes all pages, then closes the documents.
 */
public static void main(String[] args) throws IOException {
AppIText appIText = new AppIText();
appIText.before();
appIText.process();
// Not reached if process() throws, in which case neither document is closed.
appIText.close();
}
/** Closes the source document first, then the destination document. */
private void close() {
pdfDocument.close();
pdfWriteDoc.close();
}
/**
 * Walks all pages, copies each one to the output document and handles its annotations.
 */
private void process() {
// iText page numbers are 1-based.
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
PdfPage page = pdfDocument.getPage(i);
List<PdfPage> newPdfPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
// newPage stays null if no page was copied; it is passed to PdfCanvas below without a check.
PdfPage newPage = null;
if (newPdfPages.size() > 0) {
newPage = newPdfPages.get(0);
}
List<PdfAnnotation> annotations = page.getAnnotations();
for (PdfAnnotation annotation : annotations) {
// Annotations without contents are skipped entirely.
if (annotation.getContents() != null) {
System.out.println(annotation.getContents());
if (annotation instanceof PdfTextMarkupAnnotation) {
// Raw four-number rectangle array of the annotation.
PdfArray rectangleArray = annotation.getRectangle();
double x = ((PdfNumber) rectangleArray.get(0)).getValue();
double y = ((PdfNumber) rectangleArray.get(1)).getValue();
// NOTE(review): elements 2 and 3 are treated as width and height here. If the array holds
// two corner points (lower-left x/y, upper-right x/y), as a PDF /Rect entry normally does,
// the rectangle built below is far too large - TODO confirm against the PDF specification.
double xWidth = ((PdfNumber) rectangleArray.get(2)).getValue();
double yWidth = ((PdfNumber) rectangleArray.get(3)).getValue();
System.out.println(String.format("x=%s,y=%s,w=%s,h=%s", x, y, xWidth, yWidth));
Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);
// Paint the computed rectangle onto the copied page so the region can be inspected visually.
PdfCanvas canvas = new PdfCanvas(newPage);
canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
.rectangle(rectangle)
.fillStroke()
;
// NOTE(review): FontFilter.accept() as declared in this file decides by font name only and
// never consults the rectangle, so the region has no influence on the extracted text.
FontFilter fontFilter = new FontFilter(rectangle);
FilteredEventListener listener = new FilteredEventListener();
LocationTextExtractionStrategy extractionStrategy = listener.attachEventListener(new LocationTextExtractionStrategy(), fontFilter);
new PdfCanvasProcessor(listener).processPageContent(page);
// The extracted text is stored but never printed or returned.
String actualText = extractionStrategy.getResultantText();
}
}
}
}
}
}
/**
 * Document event handler that paints one fixed rectangle (x=20, y=10, width=10, height=820)
 * on the page the event refers to.
 */
class RectangleEventHandler implements IEventHandler {

    /**
     * Draws the rectangle into the last content stream of the event's page.
     *
     * @param event the event to handle; it is cast to {@link PdfDocumentEvent}
     */
    @Override
    public void handleEvent(Event event) {
        final PdfDocumentEvent documentEvent = (PdfDocumentEvent) event;
        final PdfDocument document = documentEvent.getDocument();
        final PdfPage targetPage = documentEvent.getPage();

        final PdfCanvas painter =
                new PdfCanvas(targetPage.getLastContentStream(), targetPage.getResources(), document);
        // Same three canvas operations as a fluent chain, issued one per statement.
        painter.setFillColor(new DeviceCmyk(1, 0, 0, 0));
        painter.rectangle(new Rectangle(20, 10, 10, 820));
        painter.fillStroke();
    }
}
/**
 * Event filter that accepts only text-render events whose font name ends with "Bold" or
 * "Oblique".
 *
 * <p>The rectangle is forwarded to the superclass constructor, but {@link #accept} is fully
 * overridden and does not delegate to the superclass, so the region is not evaluated here.
 */
class FontFilter extends TextRegionEventFilter {

    /**
     * @param filterRect rectangle handed to {@link TextRegionEventFilter}
     */
    public FontFilter(Rectangle filterRect) {
        super(filterRect);
    }

    /**
     * @param data the event payload; cast to {@link TextRenderInfo} for text-render events
     * @param type the kind of event being filtered
     * @return {@code true} only for a text-render event with a non-null font whose name ends
     *         with "Bold" or "Oblique"; {@code false} in every other case
     */
    @Override
    public boolean accept(IEventData data, EventType type) {
        if (!type.equals(EventType.RENDER_TEXT)) {
            return false;
        }
        final PdfFont textFont = ((TextRenderInfo) data).getFont();
        if (textFont == null) {
            return false;
        }
        final String name = textFont.getFontProgram().getFontNames().getFontName();
        if (name.endsWith("Bold")) {
            return true;
        }
        return name.endsWith("Oblique");
    }
}
- 如何创建一个与标记区域匹配的矩形以仅从 pdf 中提取标记(突出显示)的文本?
- 或者是否有其他方法从 pdf 中获取带有注释的标记文本?
上面代码中最关键的部分是下面这一行:
Rectangle rectangle = new Rectangle((float) x, (float) y, (float) xWidth, (float) yWidth);
我找到了解决方案。
矩形计算(getRectangle() 返回的数组依次记为 x、y、width、height,其中后两个值实际上是右上角的坐标):
x:annotation.x
y:annotation.y
宽度:annotation.width - annotation.x
高度:annotation.height - annotation.y
我现在得到的是:
可视化调试(如果 LOG_LEVEL >= 100):
提取的内容:
13:50:01.323 [main] INFO b.h.AppIText - Annotation contents: q(7.1).explain(1)
13:50:01.323 [main] INFO b.h.AppIText - rectangleArray: x=90.0338, y=438.245, w=468.33, h=489.749
13:50:01.323 [main] INFO b.h.AppIText - pageSizeWithRotation: x=0.0, y=0.0, w=531.0, h=666.0, top=666.0, bottom=0.0, left=0.0, right=531.0
13:50:01.337 [main] INFO b.h.AppIText - str: Purpose: A finally block can’t be placed before the catch blocks. <cut here because the book is not free, but I get the complete marked text..>
我修正后的代码如下:
package biz.hochguertel;
import com.itextpdf.kernel.color.DeviceCmyk;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.*;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfTextMarkupAnnotation;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * Extracts the text covered by text-markup (highlight) annotations of a PDF with iText 7.
 *
 * <p>Every page of the input document is copied to an output document. For each annotation
 * that has contents and is a {@link PdfTextMarkupAnnotation}, the annotation rectangle is
 * converted into a {@link Rectangle} and the text inside that region is extracted and logged.
 * When {@code LOG_LEVEL >= VISUAL_DEBUG} the region is also painted onto the copied page.
 *
 * <p>With the help of the following documentations:
 * - http://developers.itextpdf.com/content/best-itext-questions-stackoverview/content-parsing-extraction-and-redaction-text/itext7-how-read-text-specific-position
 */
public class AppIText {
    private static final Logger LOGGER = LoggerFactory.getLogger(AppIText.class);
    /** Debug switch; deliberately not final so it can be raised to {@link #VISUAL_DEBUG}. */
    private static int LOG_LEVEL = 0;
    private final static int VISUAL_DEBUG = 100;
    /** Classpath location of the input PDF. */
    private static final String SOURCE_RESOURCE = "itext/OCA/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    /** Path of the output PDF (page copies, plus painted regions in visual-debug mode). */
    private static final String DEST = "demo-output/393-394,549-550_OCA_Java_SE_7_Programmer_I_Certification.pdf";
    /** File-system path of the input PDF; resolved in {@link #before()}. */
    private String filePath;
    private PdfDocument pdfDocument;
    private PdfDocument pdfWriteDoc;

    /**
     * Resolves the input PDF, prepares the output location and opens both documents.
     *
     * @throws IOException if the input resource is missing, the output directory cannot be
     *         created, a stale output file cannot be removed, or a document cannot be opened
     */
    public void before() throws IOException {
        // Fail with a clear message instead of a NullPointerException when the resource is absent.
        java.net.URL resource = getClass().getClassLoader().getResource(SOURCE_RESOURCE);
        if (resource == null) {
            throw new IOException("Input PDF not found on the classpath: " + SOURCE_RESOURCE);
        }
        filePath = resource.getFile();

        File file = new File(DEST);
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("Could not create output directory: " + parent);
        }
        if (file.exists() && !file.delete()) {
            throw new IOException("Could not delete stale output file: " + file);
        }

        pdfDocument = new PdfDocument(new PdfReader(filePath));
        boolean writerOpened = false;
        try {
            pdfWriteDoc = new PdfDocument(new PdfWriter(DEST));
            writerOpened = true;
        } finally {
            // Do not leak the already opened reader when the writer cannot be created.
            if (!writerOpened) {
                pdfDocument.close();
            }
        }
    }

    /**
     * Entry point: opens the documents and processes them; {@code process()} closes them.
     */
    public static void main(String[] args) throws IOException {
        AppIText appIText = new AppIText();
        appIText.before();
        appIText.process();
    }

    /**
     * Copies every page to the output document and extracts the text of each text-markup
     * annotation that has contents. Both documents are closed afterwards, also on failure.
     */
    private void process() {
        try {
            // iText page numbers are 1-based.
            for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
                PdfPage page = pdfDocument.getPage(i);
                List<PdfPage> copiedPages = pdfDocument.copyPagesTo(i, i, pdfWriteDoc);
                PdfPage newPage = copiedPages.isEmpty() ? null : copiedPages.get(0);
                for (PdfAnnotation annotation : page.getAnnotations()) {
                    if (annotation.getContents() == null) {
                        continue;
                    }
                    System.out.println();
                    LOGGER.info("Annotation contents: {}", annotation.getContents());
                    if (annotation instanceof PdfTextMarkupAnnotation) {
                        extractMarkedText(page, newPage, annotation);
                    }
                }
            }
        } finally {
            closeDocuments();
        }
    }

    /** Logs the geometry of one markup annotation and the text found inside its rectangle. */
    private void extractMarkedText(PdfPage page, PdfPage newPage, PdfAnnotation annotation) {
        PdfArray rectangleArray = annotation.getRectangle();
        if (rectangleArray == null || rectangleArray.size() < 4) {
            LOGGER.warn("Annotation '{}' has no usable rectangle, skipping", annotation.getContents());
            return;
        }
        // The labels are kept as they were; the last two values are the upper-right corner.
        LOGGER.info("rectangleArray: x={}, y={}, w={}, h={}",
                rectangleArray.get(0),
                rectangleArray.get(1),
                rectangleArray.get(2),
                rectangleArray.get(3)
        );
        Rectangle cropBox = page.getCropBox();
        LOGGER.info("pageSizeWithRotation: x={}, y={}, w={}, h={}, top={}, bottom={}, left={}, right={}",
                cropBox.getX(),
                cropBox.getY(),
                cropBox.getWidth(),
                cropBox.getHeight(),
                cropBox.getTop(),
                cropBox.getBottom(),
                cropBox.getLeft(),
                cropBox.getRight()
        );

        Rectangle rectangle = toRectangle(rectangleArray);
        if (rectangle == null) {
            LOGGER.warn("Annotation '{}' has a non-numeric rectangle, skipping", annotation.getContents());
            return;
        }

        // Optional visual check: paint the region on the copied page, if one exists.
        if (LOG_LEVEL >= VISUAL_DEBUG && newPage != null) {
            PdfCanvas canvas = new PdfCanvas(newPage);
            canvas.setFillColor(new DeviceCmyk(1, 0, 0, 0))
                    .rectangle(rectangle)
                    .fillStroke();
        }

        TextRegionEventFilter regionFilter = new TextRegionEventFilter(rectangle);
        ITextExtractionStrategy strategy = new FilteredTextEventListener(new LocationTextExtractionStrategy(), regionFilter);
        String str = PdfTextExtractor.getTextFromPage(page, strategy) + "\n";
        LOGGER.info("str: {}", str);
    }

    /**
     * Converts a four-number array holding two opposite corners into a {@link Rectangle}
     * (x, y, width, height). Example: [90.0338 438.245 468.33 489.749] yields
     * width 468.33 - 90.0338 and height 489.749 - 438.245.
     *
     * @return the rectangle, or {@code null} if one of the first four entries is not a number
     */
    private static Rectangle toRectangle(PdfArray corners) {
        PdfNumber x1 = corners.getAsNumber(0);
        PdfNumber y1 = corners.getAsNumber(1);
        PdfNumber x2 = corners.getAsNumber(2);
        PdfNumber y2 = corners.getAsNumber(3);
        if (x1 == null || y1 == null || x2 == null || y2 == null) {
            return null;
        }
        // Normalise so that width and height are never negative, whichever corners are stored.
        float left = Math.min(x1.floatValue(), x2.floatValue());
        float bottom = Math.min(y1.floatValue(), y2.floatValue());
        float width = Math.abs(x2.floatValue() - x1.floatValue());
        float height = Math.abs(y2.floatValue() - y1.floatValue());
        return new Rectangle(left, bottom, width, height);
    }

    /** Closes the source document, then the destination document even if the first close fails. */
    private void closeDocuments() {
        try {
            pdfDocument.close();
        } finally {
            pdfWriteDoc.close();
        }
    }
}