如何使用 iText 从 PDF 中提取旋转图像
How to extract rotated images from PDF with iText
我需要从 PDF 中提取图像。
我知道有些图片旋转了 90 度(我用在线工具检查过)。
我正在使用此代码:
PdfRenderListener:
public class PdfRenderListener : IExtRenderListener
{
// other methods ...
public void RenderImage(ImageRenderInfo renderInfo)
{
try
{
var mtx = renderInfo.GetImageCTM();
var image = renderInfo.GetImage();
var fillColor = renderInfo.GetCurrentFillColor();
var color = Color.FromArgb(fillColor?.RGB ?? Color.Empty.ToArgb());
var fileType = image.GetFileType();
var extension = "." + fileType;
var bytes = image.GetImageAsBytes();
var height = mtx[Matrix.I22];
var width = mtx[Matrix.I11];
// rotated image
if (height == 0 && width == 0)
{
var h = Math.Abs(mtx[Matrix.I12]);
var w = Math.Abs(mtx[Matrix.I21]);
}
// save image
}
catch (Exception e)
{
Console.WriteLine(e);
}
}
}
当我使用此代码保存图像时,旋转的图像在保存时会失真。
我已阅读此 post iText 7 ImageRenderInfo Matrix contains negative height on Even number Pages 和 mkl 答案。
在当前转换矩阵 (mtx) 中,我有这些值:
0
841.9
0
-595.1
0
0
595.1
0
1
我知道图像旋转了 90 度。如何转换图像以获得正常图像?
你可以试试这个。我正在为 java 使用 Itext 7。这里还是需要自己写监听器:
public class MyImageRenderListener implements IEventListener {
protected String path;
protected String extension;
public MyImageRenderListener (String path) {
this.path = path;
}
public void eventOccurred(IEventData data, EventType type) {
switch (type) {
case RENDER_IMAGE:
try {
String filename;
FileOutputStream os;
ImageRenderInfo renderInfo = (ImageRenderInfo) data;
PdfImageXObject image = renderInfo.getImage();
if (image == null) {
return;
}
byte[] imageByte = image.getImageBytes(true);
extension = image.identifyImageFileExtension();
filename = String.format(path, image.getPdfObject().getIndirectReference().getObjNumber(), extension);
os = new FileOutputStream(filename);
os.write(imageByte);
os.flush();
os.close();
} catch (com.itextpdf.io.exceptions.IOException | IOException e) {
System.out.println(e.getMessage());
}
break;
default:
break;
}
}
public Set<EventType> getSupportedEvents() {
return null;
}
}
我查了一个随机旋转角度的pdf,90度,得到的图片没有失真
public void manipulatePdf() throws IOException, SQLException, ParserConfigurationException, SAXException {
PdfDocument pdfDoc = new PdfDocument(new PdfReader("path to pdf"), new PdfWriter(new ByteArrayOutputStream()));
MyImageRenderListener listener = new MyImageRenderListener("path to resulting image");
PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
for (int i = 1; i <= pdfDoc.getNumberOfPages(); i++) {
parser.processPageContent(pdfDoc.getPage(i));
}
pdfDoc.close();
}
如@mkl 所述,真正的原因不在于图像的旋转,而在于所应用的滤镜。
我用 iText RUPS 分析了 pdf 文件,发现图像是用 CCITTFaxDecode 过滤器编码的:
RUPS screen
接下来,我寻找解码此过滤器的方法并发现了这些问题
- Extracting image from PDF with /CCITTFaxDecode filter.
- How to use Bit Miracle LibTiff.Net to write the image to a MemoryStream
我使用了 BitMiracle.LibTiff.NET 库
我写了这个方法:
private byte[] DecodeInternal(byte[] rawBytes, int width, int height, int k, int bitsPerComponent)
{
var compression = GetCompression(k);
using var ms = new MemoryStream();
var tms = new TiffStream();
using var tiff = Tiff.ClientOpen("in-memory", "w", ms, tms);
tiff.SetField(TiffTag.IMAGEWIDTH, width);
tiff.SetField(TiffTag.IMAGELENGTH, height);
tiff.SetField(TiffTag.COMPRESSION, compression);
tiff.SetField(TiffTag.BITSPERSAMPLE, bitsPerComponent);
tiff.SetField(TiffTag.SAMPLESPERPIXEL, 1);
var writeResult = tiff.WriteRawStrip(0, rawBytes, rawBytes.Length);
if (writeResult == -1)
{
Console.WriteLine("Decoding error");
}
tiff.CheckpointDirectory();
var decodedBytes = ms.ToArray();
tiff.Close();
return decodedBytes;
}
private Compression GetCompression(int k)
{
return k switch
{
< 0 => Compression.CCITTFAX4,
0 => Compression.CCITTFAX3,
_ => throw new NotImplementedException("K > 0"),
};
}
解码和旋转图像后,我能够保存正常图像。谢谢大家的帮助。
我需要从 PDF 中提取图像。 我知道有些图片旋转了 90 度(我用在线工具检查过)。
我正在使用此代码:
PdfRenderListener:
public class PdfRenderListener : IExtRenderListener
{
// other methods ...
public void RenderImage(ImageRenderInfo renderInfo)
{
try
{
var mtx = renderInfo.GetImageCTM();
var image = renderInfo.GetImage();
var fillColor = renderInfo.GetCurrentFillColor();
var color = Color.FromArgb(fillColor?.RGB ?? Color.Empty.ToArgb());
var fileType = image.GetFileType();
var extension = "." + fileType;
var bytes = image.GetImageAsBytes();
var height = mtx[Matrix.I22];
var width = mtx[Matrix.I11];
// rotated image
if (height == 0 && width == 0)
{
var h = Math.Abs(mtx[Matrix.I12]);
var w = Math.Abs(mtx[Matrix.I21]);
}
// save image
}
catch (Exception e)
{
Console.WriteLine(e);
}
}
}
当我使用此代码保存图像时,旋转的图像在保存时会失真。
我已阅读此 post iText 7 ImageRenderInfo Matrix contains negative height on Even number Pages 和 mkl 答案。
在当前转换矩阵 (mtx) 中,我有这些值:
0 | 841.9 | 0 |
-595.1 | 0 | 0 |
595.1 | 0 | 1 |
我知道图像旋转了 90 度。如何转换图像以获得正常图像?
你可以试试这个。我正在为 java 使用 Itext 7。这里还是需要自己写监听器:
public class MyImageRenderListener implements IEventListener {
protected String path;
protected String extension;
public MyImageRenderListener (String path) {
this.path = path;
}
public void eventOccurred(IEventData data, EventType type) {
switch (type) {
case RENDER_IMAGE:
try {
String filename;
FileOutputStream os;
ImageRenderInfo renderInfo = (ImageRenderInfo) data;
PdfImageXObject image = renderInfo.getImage();
if (image == null) {
return;
}
byte[] imageByte = image.getImageBytes(true);
extension = image.identifyImageFileExtension();
filename = String.format(path, image.getPdfObject().getIndirectReference().getObjNumber(), extension);
os = new FileOutputStream(filename);
os.write(imageByte);
os.flush();
os.close();
} catch (com.itextpdf.io.exceptions.IOException | IOException e) {
System.out.println(e.getMessage());
}
break;
default:
break;
}
}
public Set<EventType> getSupportedEvents() {
return null;
}
}
我查了一个随机旋转角度的pdf,90度,得到的图片没有失真
public void manipulatePdf() throws IOException, SQLException, ParserConfigurationException, SAXException {
PdfDocument pdfDoc = new PdfDocument(new PdfReader("path to pdf"), new PdfWriter(new ByteArrayOutputStream()));
MyImageRenderListener listener = new MyImageRenderListener("path to resulting image");
PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
for (int i = 1; i <= pdfDoc.getNumberOfPages(); i++) {
parser.processPageContent(pdfDoc.getPage(i));
}
pdfDoc.close();
}
如@mkl 所述,真正的原因不在于图像的旋转,而在于所应用的滤镜。
我用 iText RUPS 分析了 pdf 文件,发现图像是用 CCITTFaxDecode 过滤器编码的: RUPS screen
接下来,我寻找解码此过滤器的方法并发现了这些问题
- Extracting image from PDF with /CCITTFaxDecode filter.
- How to use Bit Miracle LibTiff.Net to write the image to a MemoryStream
我使用了 BitMiracle.LibTiff.NET 库
我写了这个方法:
private byte[] DecodeInternal(byte[] rawBytes, int width, int height, int k, int bitsPerComponent)
{
var compression = GetCompression(k);
using var ms = new MemoryStream();
var tms = new TiffStream();
using var tiff = Tiff.ClientOpen("in-memory", "w", ms, tms);
tiff.SetField(TiffTag.IMAGEWIDTH, width);
tiff.SetField(TiffTag.IMAGELENGTH, height);
tiff.SetField(TiffTag.COMPRESSION, compression);
tiff.SetField(TiffTag.BITSPERSAMPLE, bitsPerComponent);
tiff.SetField(TiffTag.SAMPLESPERPIXEL, 1);
var writeResult = tiff.WriteRawStrip(0, rawBytes, rawBytes.Length);
if (writeResult == -1)
{
Console.WriteLine("Decoding error");
}
tiff.CheckpointDirectory();
var decodedBytes = ms.ToArray();
tiff.Close();
return decodedBytes;
}
private Compression GetCompression(int k)
{
return k switch
{
< 0 => Compression.CCITTFAX4,
0 => Compression.CCITTFAX3,
_ => throw new NotImplementedException("K > 0"),
};
}
解码和旋转图像后,我能够保存正常图像。谢谢大家的帮助。