使用 Apache PDFBox 在 PDF 中查找 javascript 代码
Finding javascript code in PDF using Apache PDFBox
我的目标是提取和处理 PDF 文档可能包含的任何 JavaScript 代码。通过在编辑器中打开 PDF,我可以看到这样的对象:
402 0 obj
<</S/JavaScript/JS(\n\r\n /* Set day 25 */\r\n FormRouter_SetCurrentDate\("25"\);\r)>>
endobj
我正在尝试使用 Apache PDFBox 来完成此操作,但到目前为止还没有成功。
这一行返回一个空列表:
jsObj = doc.getObjectsByType(COSName.JAVA_SCRIPT);
谁能给我一些指导?
此工具基于 PDFBox 中的 PrintFields 示例,它会显示表单字段中的 JavaScript。我去年为一位在 AcroForm 字段之间的关系上遇到问题的人写了它(某些字段会根据其他字段的值启用或禁用)。PDF 中还有其他地方也可能包含 JavaScript。
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package pdfboxpageimageextraction;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
import org.apache.pdfbox.pdmodel.interactive.form.PDTerminalField;
/**
 * Prints the JavaScript actions attached to the AcroForm fields of a PDF document.
 *
 * Only two places are inspected: the additional actions of each terminal field
 * (K, C, F and V) and the action of each of its widget annotations. JavaScript
 * stored elsewhere in the document is not reported by this class.
 *
 * @author Ben Litchfield
 *
 */
public class PrintJavaScriptFields
{
    /**
     * Walks every field of the document's AcroForm and prints the JavaScript found on it.
     *
     * @param pdfDocument The PDF to get the fields from.
     *
     * @throws IOException If there is an error getting the fields.
     */
    public void printFields(PDDocument pdfDocument) throws IOException
    {
        PDDocumentCatalog docCatalog = pdfDocument.getDocumentCatalog();
        PDAcroForm acroForm = docCatalog.getAcroForm();
        if (acroForm == null)
        {
            // A document without an interactive form has no fields to inspect;
            // report it instead of failing with a NullPointerException.
            System.out.println("Document has no AcroForm; no form field JavaScript to print.");
            return;
        }
        for (PDField field : acroForm.getFields())
        {
            processField(field);
        }
    }

    /**
     * Prints the JavaScript of one field, or recurses into the children of a
     * non-terminal field.
     *
     * @param field The field to inspect.
     *
     * @throws IOException If there is an error reading the field.
     */
    private void processField(PDField field) throws IOException
    {
        if (field instanceof PDTerminalField)
        {
            PDTerminalField termField = (PDTerminalField) field;
            PDFormFieldAdditionalActions fieldActions = field.getActions();
            if (fieldActions != null)
            {
                System.out.println(field.getFullyQualifiedName() + ": " + fieldActions.getClass().getSimpleName() + " js field actionS:\n" + fieldActions.getCOSObject());
                printPossibleJS(fieldActions.getK());
                printPossibleJS(fieldActions.getC());
                printPossibleJS(fieldActions.getF());
                printPossibleJS(fieldActions.getV());
            }
            for (PDAnnotationWidget widget : termField.getWidgets())
            {
                PDAction action = widget.getAction();
                if (action instanceof PDActionJavaScript)
                {
                    System.out.println(field.getFullyQualifiedName() + ": " + action.getClass().getSimpleName() + " js widget action:\n" + action.getCOSObject());
                    printPossibleJS(action);
                }
            }
        }
        else if (field instanceof PDNonTerminalField)
        {
            for (PDField child : ((PDNonTerminalField) field).getChildren())
            {
                processField(child);
            }
        }
    }

    /**
     * Prints the script of the given action followed by an empty line, if the
     * action is a JavaScript action that actually carries a script.
     *
     * @param kAction The action to inspect; may be null or a non-JavaScript action,
     * in which case nothing is printed.
     */
    private void printPossibleJS(PDAction kAction)
    {
        if (!(kAction instanceof PDActionJavaScript))
        {
            return;
        }
        String jsString = ((PDActionJavaScript) kAction).getAction();
        if (jsString == null)
        {
            // The action dictionary has no script to show.
            return;
        }
        if (!jsString.contains("\n"))
        {
            // avoid display problems with netbeans: scripts that use only CR as
            // line break are converted to LF, and doubled line breaks are collapsed
            jsString = jsString.replaceAll("\r", "\n").replaceAll("\n\n", "\n");
        }
        System.out.println(jsString);
        System.out.println();
    }

    /**
     * Loads a PDF file and prints the JavaScript attached to its form fields.
     *
     * @param args command line arguments; the optional first argument is the path
     * of the PDF to read. Without it the placeholder path is used.
     *
     * @throws IOException If there is an error loading or reading the document.
     */
    public static void main(String[] args) throws IOException
    {
        File input = args.length > 0 ? new File(args[0]) : new File("XXXX", "YYYYY.pdf");
        PDDocument pdf = null;
        try
        {
            pdf = PDDocument.load(input);
            PrintJavaScriptFields exporter = new PrintJavaScriptFields();
            exporter.printFields(pdf);
        }
        finally
        {
            if (pdf != null)
            {
                pdf.close();
            }
        }
    }
}
作为奖励,下面是显示所有 COSString 对象的代码:
/**
 * Collects every COSString reachable from the objects of a PDF document and
 * prints them.
 */
public class ShowAllCOSStrings
{
    /** Strings collected so far by {@link #crawl(COSBase)}. */
    static Set<COSString> strings = new HashSet<COSString>();

    /**
     * Dictionaries and arrays that were already traversed, compared by identity.
     * Dictionary entries are followed through indirect references, so objects
     * that point at each other would otherwise be crawled forever.
     */
    private static final Set<COSBase> visited =
            java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<COSBase, Boolean>());

    /**
     * Recursively collects the strings contained in the given object.
     *
     * @param base The object to inspect; null is ignored.
     */
    static void crawl(COSBase base)
    {
        if (base instanceof COSString)
        {
            strings.add((COSString) base);
            return;
        }
        if (base instanceof COSDictionary)
        {
            if (!visited.add(base))
            {
                // already traversed; stops endless recursion on cyclic references
                return;
            }
            COSDictionary dict = (COSDictionary) base;
            for (COSName key : dict.keySet())
            {
                crawl(dict.getDictionaryObject(key));
            }
            return;
        }
        if (base instanceof COSArray)
        {
            if (!visited.add(base))
            {
                return;
            }
            COSArray ar = (COSArray) base;
            for (COSBase item : ar)
            {
                crawl(item);
            }
            return;
        }
        if (base instanceof COSNull ||
                base instanceof COSObject ||
                base instanceof COSName ||
                base instanceof COSNumber ||
                base instanceof COSBoolean ||
                base == null)
        {
            // nothing to collect from these; indirect objects are not followed
            // here because main() crawls every object of the document itself
            return;
        }
        System.out.println("huh? " + base);
    }

    /**
     * Loads a PDF file, crawls all of its objects and prints the strings found.
     *
     * @param args command line arguments; the optional first argument is the path
     * of the PDF to read. Without it the placeholder path is used.
     *
     * @throws IOException If there is an error loading or reading the document.
     */
    public static void main(String[] args) throws IOException
    {
        File input = args.length > 0 ? new File(args[0]) : new File("XXX", "YYY.pdf");
        PDDocument doc = PDDocument.load(input);
        try
        {
            for (COSObject obj : doc.getDocument().getObjects())
            {
                crawl(obj.getObject());
            }
            System.out.println(strings.size() + " strings:");
            for (COSString s : strings)
            {
                String str = s.getString();
                if (!str.contains("\n"))
                {
                    // avoid display problems with netbeans
                    str = str.replaceAll("\r", "\n").replaceAll("\n\n", "\n");
                }
                System.out.println(str);
            }
        }
        finally
        {
            // close the document even when crawling or printing fails
            doc.close();
        }
    }
}
但是 Javascript 也可以在流中。请参阅 PDF 规范 "Additional entries specific to a rendition action",JS 条目:
A text string or stream containing a JavaScript script that shall be
executed when the action is triggered.
您也可以更改上面的代码以捕获 COSStream 对象; COSStream 是从 COSDictionary 扩展而来的。
我的目标是提取和处理 PDF 文档可能包含的任何 JavaScript 代码。通过在编辑器中打开 PDF,我可以看到这样的对象:
402 0 obj
<</S/JavaScript/JS(\n\r\n /* Set day 25 */\r\n FormRouter_SetCurrentDate\("25"\);\r)>>
endobj
我正在尝试使用 Apache PDFBox 来完成此操作,但到目前为止还没有成功。
这一行返回一个空列表:
jsObj = doc.getObjectsByType(COSName.JAVA_SCRIPT);
谁能给我一些指导?
此工具基于 PDFBox 中的 PrintFields 示例,它会显示表单字段中的 JavaScript。我去年为一位在 AcroForm 字段之间的关系上遇到问题的人写了它(某些字段会根据其他字段的值启用或禁用)。PDF 中还有其他地方也可能包含 JavaScript。
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package pdfboxpageimageextraction;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
import org.apache.pdfbox.pdmodel.interactive.form.PDTerminalField;
/**
 * Prints the JavaScript actions attached to the AcroForm fields of a PDF document.
 *
 * Only two places are inspected: the additional actions of each terminal field
 * (K, C, F and V) and the action of each of its widget annotations. JavaScript
 * stored elsewhere in the document is not reported by this class.
 *
 * @author Ben Litchfield
 *
 */
public class PrintJavaScriptFields
{
    /**
     * Walks every field of the document's AcroForm and prints the JavaScript found on it.
     *
     * @param pdfDocument The PDF to get the fields from.
     *
     * @throws IOException If there is an error getting the fields.
     */
    public void printFields(PDDocument pdfDocument) throws IOException
    {
        PDDocumentCatalog docCatalog = pdfDocument.getDocumentCatalog();
        PDAcroForm acroForm = docCatalog.getAcroForm();
        if (acroForm == null)
        {
            // A document without an interactive form has no fields to inspect;
            // report it instead of failing with a NullPointerException.
            System.out.println("Document has no AcroForm; no form field JavaScript to print.");
            return;
        }
        for (PDField field : acroForm.getFields())
        {
            processField(field);
        }
    }

    /**
     * Prints the JavaScript of one field, or recurses into the children of a
     * non-terminal field.
     *
     * @param field The field to inspect.
     *
     * @throws IOException If there is an error reading the field.
     */
    private void processField(PDField field) throws IOException
    {
        if (field instanceof PDTerminalField)
        {
            PDTerminalField termField = (PDTerminalField) field;
            PDFormFieldAdditionalActions fieldActions = field.getActions();
            if (fieldActions != null)
            {
                System.out.println(field.getFullyQualifiedName() + ": " + fieldActions.getClass().getSimpleName() + " js field actionS:\n" + fieldActions.getCOSObject());
                printPossibleJS(fieldActions.getK());
                printPossibleJS(fieldActions.getC());
                printPossibleJS(fieldActions.getF());
                printPossibleJS(fieldActions.getV());
            }
            for (PDAnnotationWidget widget : termField.getWidgets())
            {
                PDAction action = widget.getAction();
                if (action instanceof PDActionJavaScript)
                {
                    System.out.println(field.getFullyQualifiedName() + ": " + action.getClass().getSimpleName() + " js widget action:\n" + action.getCOSObject());
                    printPossibleJS(action);
                }
            }
        }
        else if (field instanceof PDNonTerminalField)
        {
            for (PDField child : ((PDNonTerminalField) field).getChildren())
            {
                processField(child);
            }
        }
    }

    /**
     * Prints the script of the given action followed by an empty line, if the
     * action is a JavaScript action that actually carries a script.
     *
     * @param kAction The action to inspect; may be null or a non-JavaScript action,
     * in which case nothing is printed.
     */
    private void printPossibleJS(PDAction kAction)
    {
        if (!(kAction instanceof PDActionJavaScript))
        {
            return;
        }
        String jsString = ((PDActionJavaScript) kAction).getAction();
        if (jsString == null)
        {
            // The action dictionary has no script to show.
            return;
        }
        if (!jsString.contains("\n"))
        {
            // avoid display problems with netbeans: scripts that use only CR as
            // line break are converted to LF, and doubled line breaks are collapsed
            jsString = jsString.replaceAll("\r", "\n").replaceAll("\n\n", "\n");
        }
        System.out.println(jsString);
        System.out.println();
    }

    /**
     * Loads a PDF file and prints the JavaScript attached to its form fields.
     *
     * @param args command line arguments; the optional first argument is the path
     * of the PDF to read. Without it the placeholder path is used.
     *
     * @throws IOException If there is an error loading or reading the document.
     */
    public static void main(String[] args) throws IOException
    {
        File input = args.length > 0 ? new File(args[0]) : new File("XXXX", "YYYYY.pdf");
        PDDocument pdf = null;
        try
        {
            pdf = PDDocument.load(input);
            PrintJavaScriptFields exporter = new PrintJavaScriptFields();
            exporter.printFields(pdf);
        }
        finally
        {
            if (pdf != null)
            {
                pdf.close();
            }
        }
    }
}
作为奖励,下面是显示所有 COSString 对象的代码:
/**
 * Collects every COSString reachable from the objects of a PDF document and
 * prints them.
 */
public class ShowAllCOSStrings
{
    /** Strings collected so far by {@link #crawl(COSBase)}. */
    static Set<COSString> strings = new HashSet<COSString>();

    /**
     * Dictionaries and arrays that were already traversed, compared by identity.
     * Dictionary entries are followed through indirect references, so objects
     * that point at each other would otherwise be crawled forever.
     */
    private static final Set<COSBase> visited =
            java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<COSBase, Boolean>());

    /**
     * Recursively collects the strings contained in the given object.
     *
     * @param base The object to inspect; null is ignored.
     */
    static void crawl(COSBase base)
    {
        if (base instanceof COSString)
        {
            strings.add((COSString) base);
            return;
        }
        if (base instanceof COSDictionary)
        {
            if (!visited.add(base))
            {
                // already traversed; stops endless recursion on cyclic references
                return;
            }
            COSDictionary dict = (COSDictionary) base;
            for (COSName key : dict.keySet())
            {
                crawl(dict.getDictionaryObject(key));
            }
            return;
        }
        if (base instanceof COSArray)
        {
            if (!visited.add(base))
            {
                return;
            }
            COSArray ar = (COSArray) base;
            for (COSBase item : ar)
            {
                crawl(item);
            }
            return;
        }
        if (base instanceof COSNull ||
                base instanceof COSObject ||
                base instanceof COSName ||
                base instanceof COSNumber ||
                base instanceof COSBoolean ||
                base == null)
        {
            // nothing to collect from these; indirect objects are not followed
            // here because main() crawls every object of the document itself
            return;
        }
        System.out.println("huh? " + base);
    }

    /**
     * Loads a PDF file, crawls all of its objects and prints the strings found.
     *
     * @param args command line arguments; the optional first argument is the path
     * of the PDF to read. Without it the placeholder path is used.
     *
     * @throws IOException If there is an error loading or reading the document.
     */
    public static void main(String[] args) throws IOException
    {
        File input = args.length > 0 ? new File(args[0]) : new File("XXX", "YYY.pdf");
        PDDocument doc = PDDocument.load(input);
        try
        {
            for (COSObject obj : doc.getDocument().getObjects())
            {
                crawl(obj.getObject());
            }
            System.out.println(strings.size() + " strings:");
            for (COSString s : strings)
            {
                String str = s.getString();
                if (!str.contains("\n"))
                {
                    // avoid display problems with netbeans
                    str = str.replaceAll("\r", "\n").replaceAll("\n\n", "\n");
                }
                System.out.println(str);
            }
        }
        finally
        {
            // close the document even when crawling or printing fails
            doc.close();
        }
    }
}
但是 Javascript 也可以在流中。请参阅 PDF 规范 "Additional entries specific to a rendition action",JS 条目:
A text string or stream containing a JavaScript script that shall be executed when the action is triggered.
您也可以更改上面的代码以捕获 COSStream 对象; COSStream 是从 COSDictionary 扩展而来的。