无法使用 Java 提取 PDF 中指定页面的文本行
unable to strip page lines in pdf using java
我想提取第 1 页和第 3 页的文本,但为什么它只提取了第 1 页,并且把相同的输出显示了两次?我这里使用的是 PDFBox。即使我已经写了 stripper.setStartPage( 3 ),它仍然无法提取第 3 页。
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
/**
 * PDFTextStripper subclass that records every string passed to
 * {@code writeString} in the static list {@code lines} instead of writing it
 * to the output writer.
 */
public class GetlinesFromPDF extends PDFTextStripper {
// Static, so it is shared by every instance and every writeText() run; this listing only ever appends to it.
static List<String> lines = new ArrayList<String>();
/**
 * Creates the stripper without any extra configuration.
 *
 * @throws IOException propagated from the implicit superclass constructor call
 */
public GetlinesFromPDF() throws IOException {
}
/**
 * Loads the PDF at a hard-coded path, runs the stripper over page 1 and then
 * over page 3, and prints the element at index 2 of {@code lines} after each run.
 *
 * @param args unused
 * @throws IOException if loading the document or extracting text fails
 */
public static void main( String[] args ) throws IOException {
PDDocument document = null;
String fileName = "C://Users//policy.pdf";
try {
document = PDDocument.load( new File(fileName) );
PDFTextStripper stripper = new GetlinesFromPDF();
stripper.setSortByPosition( true );
// First run: restrict extraction to page 1.
stripper.setStartPage( 1 );
stripper.setEndPage( 1);
// The writer's content is never read; the text is captured through writeString() into lines.
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
// Index 2 is the third string collected so far.
String qoute_number = lines.get(2);
System.out.println(qoute_number);
// Second run: restrict extraction to page 3.
stripper.setStartPage( 3 );
stripper.setEndPage( 3);
Writer dummy1 = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy1);
// NOTE: lines was not cleared after the first run, so index 2 still refers to
// the same element that was printed above; the second run's strings were appended after it.
String qoute_number1 = lines.get(2);
System.out.println(qoute_number1);
}
finally {
// Always release the document, even if extraction threw.
if( document != null ) {
document.close();
}
}
}
/**
 * Appends the received string to {@code lines}; nothing is written to the output
 * and {@code textPositions} is ignored.
 *
 * @param str the text to record
 * @param textPositions unused
 * @throws IOException never thrown by this implementation; declared by the overridden method
 */
@Override
protected void writeString(String str, List<TextPosition> textPositions) throws IOException {
lines.add(str);
}}
文本提取本身工作正常。您的问题在于 lines 累积了所有结果——第 1 页和第 3 页提取出的内容都被添加到了同一个列表中。因此,打印相同的索引(get(2))得到的始终是第 1 页的内容。如果在两次提取之间添加一次 lines.clear() 调用,您应该就能看到正确的结果。
// First run: page 1 only.
stripper.setStartPage( 1 );
stripper.setEndPage( 1);
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
String qoute_number = lines.get(2);
System.out.println(qoute_number);
// Empty the list so that strings collected by the next run start again at index 0.
lines.clear();
// Second run: page 3 only.
stripper.setStartPage( 3 );
stripper.setEndPage( 3);
我想提取第 1 页和第 3 页的文本,但为什么它只提取了第 1 页,并且把相同的输出显示了两次?我这里使用的是 PDFBox。即使我已经写了 stripper.setStartPage( 3 ),它仍然无法提取第 3 页。
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
/**
 * PDFTextStripper subclass that records every string passed to
 * {@code writeString} in the static list {@code lines} instead of writing it
 * to the output writer.
 */
public class GetlinesFromPDF extends PDFTextStripper {
// Static, so it is shared by every instance and every writeText() run; this listing only ever appends to it.
static List<String> lines = new ArrayList<String>();
/**
 * Creates the stripper without any extra configuration.
 *
 * @throws IOException propagated from the implicit superclass constructor call
 */
public GetlinesFromPDF() throws IOException {
}
/**
 * Loads the PDF at a hard-coded path, runs the stripper over page 1 and then
 * over page 3, and prints the element at index 2 of {@code lines} after each run.
 *
 * @param args unused
 * @throws IOException if loading the document or extracting text fails
 */
public static void main( String[] args ) throws IOException {
PDDocument document = null;
String fileName = "C://Users//policy.pdf";
try {
document = PDDocument.load( new File(fileName) );
PDFTextStripper stripper = new GetlinesFromPDF();
stripper.setSortByPosition( true );
// First run: restrict extraction to page 1.
stripper.setStartPage( 1 );
stripper.setEndPage( 1);
// The writer's content is never read; the text is captured through writeString() into lines.
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
// Index 2 is the third string collected so far.
String qoute_number = lines.get(2);
System.out.println(qoute_number);
// Second run: restrict extraction to page 3.
stripper.setStartPage( 3 );
stripper.setEndPage( 3);
Writer dummy1 = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy1);
// NOTE: lines was not cleared after the first run, so index 2 still refers to
// the same element that was printed above; the second run's strings were appended after it.
String qoute_number1 = lines.get(2);
System.out.println(qoute_number1);
}
finally {
// Always release the document, even if extraction threw.
if( document != null ) {
document.close();
}
}
}
/**
 * Appends the received string to {@code lines}; nothing is written to the output
 * and {@code textPositions} is ignored.
 *
 * @param str the text to record
 * @param textPositions unused
 * @throws IOException never thrown by this implementation; declared by the overridden method
 */
@Override
protected void writeString(String str, List<TextPosition> textPositions) throws IOException {
lines.add(str);
}}
文本提取本身工作正常。您的问题在于 lines 累积了所有结果——第 1 页和第 3 页提取出的内容都被添加到了同一个列表中。因此,打印相同的索引(get(2))得到的始终是第 1 页的内容。如果在两次提取之间添加一次 lines.clear() 调用,您应该就能看到正确的结果。
// First run: page 1 only.
stripper.setStartPage( 1 );
stripper.setEndPage( 1);
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
String qoute_number = lines.get(2);
System.out.println(qoute_number);
// Empty the list so that strings collected by the next run start again at index 0.
lines.clear();
// Second run: page 3 only.
stripper.setStartPage( 3 );
stripper.setEndPage( 3);