PDFBox 在 Android 上无法提取区域文本
PDFbox not extracting regions on android
我想只提取 PDF 文档中被高亮标注的文本。这段代码在 PC 上可以正常运行,但在 Android 上使用时会失败。PDFBox 无法直接在 Android 上运行,所以我在 Android 上使用了 Birdbrain2/PdfBox-Android。
这是有效的 PC 代码
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
/**
 * Extracts the text that lies under the annotation rectangles of a PDF
 * document, using the desktop PDFBox 1.8 API.
 */
public class ExtractHighlights {

    /**
     * Prints the text extracted from {@code sample.pdf} to standard output.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        System.out.println(extractHighlights("sample.pdf"));
    }

    /**
     * Collects the text found under every annotation rectangle of the given
     * PDF file.
     *
     * <p>Any exception is printed and swallowed; whatever text was collected
     * before the failure is still returned.
     *
     * <p>NOTE(review): every annotation on a page is processed, not only
     * those of subtype "Highlight" - confirm that this is intended.
     *
     * @param fileName path of the PDF file to read
     * @return the extracted snippets, each followed by a single space; an
     *         empty string when nothing could be extracted
     */
    public static String extractHighlights(String fileName){
        StringBuilder extractedText = new StringBuilder();
        try {
            PDDocument pddDocument = PDDocument.load(new File(fileName));
            try {
                List allPages = pddDocument.getDocumentCatalog().getAllPages();
                for (int i = 0; i < allPages.size(); i++) {
                    PDPage page = (PDPage) allPages.get(i);
                    List<PDAnnotation> la = page.getAnnotations();
                    if (la.isEmpty()) {
                        continue;
                    }

                    // Rotation and page height are per-page values, so look
                    // them up once instead of once per annotation.
                    boolean flipY = page.findRotation() == 0;
                    float pageHeight = flipY ? page.findMediaBox().getHeight() : 0f;

                    for (PDAnnotation pda : la) {
                        PDRectangle rect = pda.getRectangle();
                        if (rect == null) {
                            // An annotation without a rectangle has no region
                            // to extract; skip it rather than abort the run.
                            continue;
                        }

                        float x = rect.getLowerLeftX();
                        float y = rect.getUpperRightY();
                        if (flipY) {
                            // Annotation rectangles are measured from the
                            // bottom of the page; the region is expressed
                            // from the top, so mirror the upper edge.
                            y = pageHeight - y;
                        }
                        Rectangle2D.Float awtRect = new Rectangle2D.Float(
                                x, y, rect.getWidth(), rect.getHeight());

                        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                        stripper.setSortByPosition(true);
                        stripper.addRegion("0", awtRect);
                        stripper.extractRegions(page);

                        String highlight = stripper.getTextForRegion("0").trim();
                        if (highlight.length() == 0) {
                            continue;
                        }
                        // The last two characters of the trimmed text are
                        // dropped, as before. The end index is clamped so a
                        // one-character highlight no longer throws.
                        // NOTE(review): confirm dropping two characters after
                        // trim() is really wanted.
                        int end = Math.max(0, highlight.length() - 2);
                        extractedText.append(highlight.substring(0, end)).append(' ');
                    }
                }
            } finally {
                // Release the document even when extraction fails part-way.
                pddDocument.close();
            }
            //System.out.println(extractedText);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return extractedText.toString();
    }
}
下面是在 Android 上无法正常工作的代码
/**
 * Reads the PDF file named by {@code strings[0]} and collects the text found
 * under every annotation rectangle, publishing progress once per page.
 *
 * <p>Any exception is printed and swallowed; whatever text was collected
 * before the failure is still returned.
 *
 * @param strings {@code strings[0]} is the path of the PDF file to read
 * @return the extracted snippets, each followed by a single space; an empty
 *         string when nothing could be extracted
 */
@Override
protected String doInBackground(String... strings) {
    StringBuilder extractedText = new StringBuilder();
    try {
        Log.i("ExtractHighlights","Started");
        PDDocument pddDocument = PDDocument.load(new File(strings[0]));
        try {
            PDPageTree allPages = pddDocument.getDocumentCatalog().getPages();
            int totalPages = allPages.getCount();
            int pageNumber = 0;
            for (PDPage page : allPages) {
                publishProgress(pageNumber++, totalPages);
                Log.i("ExtractHighlights", "Reading page");
                List<PDAnnotation> la = page.getAnnotations();
                if (la.size() < 1) {
                    continue;
                }
                for (PDAnnotation pda : la) {
                    Log.i("ExtractHighlights","Annotation found");
                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                    stripper.setSortByPosition(true);
                    Log.i("ExtractHighlights","Getting rectangle");
                    PDRectangle rect = pda.getRectangle();
                    if (rect == null) {
                        // No rectangle means no region to extract; skip the
                        // annotation rather than abort the whole document.
                        continue;
                    }

                    float left = rect.getLowerLeftX();
                    float top = rect.getUpperRightY();
                    if (page.getRotation() == 0) {
                        // Same flip as the desktop version: annotation
                        // rectangles are measured from the bottom of the
                        // page, the region from the top.
                        top = page.getMediaBox().getHeight() - top;
                    }
                    // RectF takes (left, top, right, bottom), not
                    // (x, y, width, height), so the far edges are computed.
                    RectF region = new RectF(left, top,
                            left + rect.getWidth(), top + rect.getHeight());
                    stripper.addRegion("0", region);

                    Log.i("ExtractHighlights","Extracting regions");
                    stripper.extractRegions(page);
                    Log.i("ExtractHighlights","Getting text from region");
                    String highlight = stripper.getTextForRegion("0").trim();
                    Log.i("ExtractHighlights",highlight);
                    if (highlight.length() == 0) {
                        continue;
                    }
                    // The last two characters of the trimmed text are
                    // dropped, as before. The end index is clamped so a
                    // one-character highlight no longer throws.
                    // NOTE(review): confirm dropping two characters after
                    // trim() is really wanted.
                    int end = Math.max(0, highlight.length() - 2);
                    extractedText.append(highlight.substring(0, end)).append(' ');
                }
                Log.i("ExtractHighlights","Page done");
            }
        } finally {
            // Release the document even when extraction fails part-way.
            pddDocument.close();
        }
        Log.i("ExtractHighlights","Document closed");
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return extractedText.toString();
}
在 Android 上运行还需要很长时间,以至于系统认为程序已经崩溃。
我可以尝试将整个 PDF 转换为文本,但我如何知道突出显示了哪些文本?
正如@mkl 所指出的,这是一个已知错误。
所以我改用了另一位开发者的 PDFBox 移植版,问题解决了!
我想只提取 PDF 文档中被高亮标注的文本。这段代码在 PC 上可以正常运行,但在 Android 上使用时会失败。PDFBox 无法直接在 Android 上运行,所以我在 Android 上使用了 Birdbrain2/PdfBox-Android。
这是有效的 PC 代码
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
/**
 * Extracts the text that lies under the annotation rectangles of a PDF
 * document, using the desktop PDFBox 1.8 API.
 */
public class ExtractHighlights {

    /**
     * Prints the text extracted from {@code sample.pdf} to standard output.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        System.out.println(extractHighlights("sample.pdf"));
    }

    /**
     * Collects the text found under every annotation rectangle of the given
     * PDF file.
     *
     * <p>Any exception is printed and swallowed; whatever text was collected
     * before the failure is still returned.
     *
     * <p>NOTE(review): every annotation on a page is processed, not only
     * those of subtype "Highlight" - confirm that this is intended.
     *
     * @param fileName path of the PDF file to read
     * @return the extracted snippets, each followed by a single space; an
     *         empty string when nothing could be extracted
     */
    public static String extractHighlights(String fileName){
        StringBuilder extractedText = new StringBuilder();
        try {
            PDDocument pddDocument = PDDocument.load(new File(fileName));
            try {
                List allPages = pddDocument.getDocumentCatalog().getAllPages();
                for (int i = 0; i < allPages.size(); i++) {
                    PDPage page = (PDPage) allPages.get(i);
                    List<PDAnnotation> la = page.getAnnotations();
                    if (la.isEmpty()) {
                        continue;
                    }

                    // Rotation and page height are per-page values, so look
                    // them up once instead of once per annotation.
                    boolean flipY = page.findRotation() == 0;
                    float pageHeight = flipY ? page.findMediaBox().getHeight() : 0f;

                    for (PDAnnotation pda : la) {
                        PDRectangle rect = pda.getRectangle();
                        if (rect == null) {
                            // An annotation without a rectangle has no region
                            // to extract; skip it rather than abort the run.
                            continue;
                        }

                        float x = rect.getLowerLeftX();
                        float y = rect.getUpperRightY();
                        if (flipY) {
                            // Annotation rectangles are measured from the
                            // bottom of the page; the region is expressed
                            // from the top, so mirror the upper edge.
                            y = pageHeight - y;
                        }
                        Rectangle2D.Float awtRect = new Rectangle2D.Float(
                                x, y, rect.getWidth(), rect.getHeight());

                        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                        stripper.setSortByPosition(true);
                        stripper.addRegion("0", awtRect);
                        stripper.extractRegions(page);

                        String highlight = stripper.getTextForRegion("0").trim();
                        if (highlight.length() == 0) {
                            continue;
                        }
                        // The last two characters of the trimmed text are
                        // dropped, as before. The end index is clamped so a
                        // one-character highlight no longer throws.
                        // NOTE(review): confirm dropping two characters after
                        // trim() is really wanted.
                        int end = Math.max(0, highlight.length() - 2);
                        extractedText.append(highlight.substring(0, end)).append(' ');
                    }
                }
            } finally {
                // Release the document even when extraction fails part-way.
                pddDocument.close();
            }
            //System.out.println(extractedText);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return extractedText.toString();
    }
}
下面是在 Android 上无法正常工作的代码
/**
 * Reads the PDF file named by {@code strings[0]} and collects the text found
 * under every annotation rectangle, publishing progress once per page.
 *
 * <p>Any exception is printed and swallowed; whatever text was collected
 * before the failure is still returned.
 *
 * @param strings {@code strings[0]} is the path of the PDF file to read
 * @return the extracted snippets, each followed by a single space; an empty
 *         string when nothing could be extracted
 */
@Override
protected String doInBackground(String... strings) {
    StringBuilder extractedText = new StringBuilder();
    try {
        Log.i("ExtractHighlights","Started");
        PDDocument pddDocument = PDDocument.load(new File(strings[0]));
        try {
            PDPageTree allPages = pddDocument.getDocumentCatalog().getPages();
            int totalPages = allPages.getCount();
            int pageNumber = 0;
            for (PDPage page : allPages) {
                publishProgress(pageNumber++, totalPages);
                Log.i("ExtractHighlights", "Reading page");
                List<PDAnnotation> la = page.getAnnotations();
                if (la.size() < 1) {
                    continue;
                }
                for (PDAnnotation pda : la) {
                    Log.i("ExtractHighlights","Annotation found");
                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                    stripper.setSortByPosition(true);
                    Log.i("ExtractHighlights","Getting rectangle");
                    PDRectangle rect = pda.getRectangle();
                    if (rect == null) {
                        // No rectangle means no region to extract; skip the
                        // annotation rather than abort the whole document.
                        continue;
                    }

                    float left = rect.getLowerLeftX();
                    float top = rect.getUpperRightY();
                    if (page.getRotation() == 0) {
                        // Same flip as the desktop version: annotation
                        // rectangles are measured from the bottom of the
                        // page, the region from the top.
                        top = page.getMediaBox().getHeight() - top;
                    }
                    // RectF takes (left, top, right, bottom), not
                    // (x, y, width, height), so the far edges are computed.
                    RectF region = new RectF(left, top,
                            left + rect.getWidth(), top + rect.getHeight());
                    stripper.addRegion("0", region);

                    Log.i("ExtractHighlights","Extracting regions");
                    stripper.extractRegions(page);
                    Log.i("ExtractHighlights","Getting text from region");
                    String highlight = stripper.getTextForRegion("0").trim();
                    Log.i("ExtractHighlights",highlight);
                    if (highlight.length() == 0) {
                        continue;
                    }
                    // The last two characters of the trimmed text are
                    // dropped, as before. The end index is clamped so a
                    // one-character highlight no longer throws.
                    // NOTE(review): confirm dropping two characters after
                    // trim() is really wanted.
                    int end = Math.max(0, highlight.length() - 2);
                    extractedText.append(highlight.substring(0, end)).append(' ');
                }
                Log.i("ExtractHighlights","Page done");
            }
        } finally {
            // Release the document even when extraction fails part-way.
            pddDocument.close();
        }
        Log.i("ExtractHighlights","Document closed");
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return extractedText.toString();
}
在 Android 上运行还需要很长时间,以至于系统认为程序已经崩溃。
我可以尝试将整个 PDF 转换为文本,但我如何知道突出显示了哪些文本?
正如 @mkl 所指出的,这是一个已知错误。所以我改用了另一位开发者的 PDFBox 移植版,问题解决了!