pdfbox getcharacterbyarticle() 渲染最后一页的矢量
pdfbox getcharacterbyarticle() rendering the vector for last page
我正在尝试使用以下代码获取文本详细信息,如坐标、宽度和高度(从 here 获取此解决方案),但输出只是来自 [=21= 的文本]最后一页。
代码
public static void main( String[] args ) throws IOException {
PDDocument document = null;
String fileName = "apache.pdf"
PDFParser parser = new PDFParser(new FileInputStream(fileName));
parser.parse();
StringWriter outString = new StringWriter();
CustomPDFTextStripper stripper = new CustomPDFTextStripper();
stripper.writeText(parser.getPDDocument(), outString);
Vector<List<TextPosition>> vectorlistoftps = stripper.getCharactersByArticle();
for (int i = 0; i < vectorlistoftps.size(); i++) {
List<TextPosition> tplist = vectorlistoftps.get(i);
for (int j = 0; j < tplist.size(); j++) {
TextPosition text = tplist.get(j);
System.out.println(" String "
+ "[x: " + text.getXDirAdj() + ", y: "
+ text.getY() + ", height:" + text.getHeightDir()
+ ", space: " + text.getWidthOfSpace() + ", width: "
+ text.getWidthDirAdj() + ", yScale: " + text.getYScale() + "]"
+ text.getCharacter() +" Font "+ text.getFont().getBaseFont() + " PageNUm "+ (i+1));
}
}
}
CustomPDFTextStripper class:
class CustomPDFTextStripper extends PDFTextStripper
{
//Vector<Vector<List<TextPosition>>> data = new Vector<Vector<List<TextPosition>>>();
public CustomPDFTextStripper() throws IOException {
super();
}
public Vector<List<TextPosition>> getCharactersByArticle(){
// data.add(charactersByArticle);
return charactersByArticle;
}
}
我试图将向量添加到列表中,但是在调用 stripper() 时它会遍历所有页面,最后一页的详细信息存储在 charactersByArticle 向量并因此返回相同的。我如何获取所有页面的信息??
临时修复:
更改了将当前页设置为结束页并获取文本信息的主要方法。不过不是个好主意。
for (int page = 0; page < pageCount; page++)
{
stripper.setStartPage(0);
stripper.setEndPage(page + 1);
stripper.writeText(parser.getPDDocument(), outString);
Vector vectorlistoftps = stripper.getCharactersByArticle();
PDPage thisPage = stripper.getCurrentPage();
for (int i = 0; i < vectorlistoftps.size(); i++) {
List<TextPosition> tplist = vectorlistoftps.get(i);
}
}
我正在尝试使用以下代码获取文本详细信息,如坐标、宽度和高度(从 here 获取此解决方案),但输出只是来自 [=21= 的文本]最后一页。
代码
public static void main( String[] args ) throws IOException {
PDDocument document = null;
String fileName = "apache.pdf"
PDFParser parser = new PDFParser(new FileInputStream(fileName));
parser.parse();
StringWriter outString = new StringWriter();
CustomPDFTextStripper stripper = new CustomPDFTextStripper();
stripper.writeText(parser.getPDDocument(), outString);
Vector<List<TextPosition>> vectorlistoftps = stripper.getCharactersByArticle();
for (int i = 0; i < vectorlistoftps.size(); i++) {
List<TextPosition> tplist = vectorlistoftps.get(i);
for (int j = 0; j < tplist.size(); j++) {
TextPosition text = tplist.get(j);
System.out.println(" String "
+ "[x: " + text.getXDirAdj() + ", y: "
+ text.getY() + ", height:" + text.getHeightDir()
+ ", space: " + text.getWidthOfSpace() + ", width: "
+ text.getWidthDirAdj() + ", yScale: " + text.getYScale() + "]"
+ text.getCharacter() +" Font "+ text.getFont().getBaseFont() + " PageNUm "+ (i+1));
}
}
}
CustomPDFTextStripper class:
class CustomPDFTextStripper extends PDFTextStripper
{
//Vector<Vector<List<TextPosition>>> data = new Vector<Vector<List<TextPosition>>>();
public CustomPDFTextStripper() throws IOException {
super();
}
public Vector<List<TextPosition>> getCharactersByArticle(){
// data.add(charactersByArticle);
return charactersByArticle;
}
}
我试图将向量添加到列表中,但是在调用 stripper() 时它会遍历所有页面,最后一页的详细信息存储在 charactersByArticle 向量并因此返回相同的。我如何获取所有页面的信息??
临时修复:
更改了将当前页设置为结束页并获取文本信息的主要方法。不过不是个好主意。
for (int page = 0; page < pageCount; page++)
{
stripper.setStartPage(0);
stripper.setEndPage(page + 1);
stripper.writeText(parser.getPDDocument(), outString);
Vector vectorlistoftps = stripper.getCharactersByArticle();
PDPage thisPage = stripper.getCurrentPage();
for (int i = 0; i < vectorlistoftps.size(); i++) {
List<TextPosition> tplist = vectorlistoftps.get(i);
}
}