Memory leak when splitting a 10k page PDF (iTextSharp PDF API)
I have a PDF of 10,000+ pages that I'm trying to split into smaller PDFs based on delimiter pages. My current implementation works fine until you throw the full 10k pages at it at once. After roughly the 50th PDF created (each around 100 pages), it starts to slow down significantly, and my memory usage jumps to about 2 GB before I get an OutOfMemoryException. I have very little experience with memory management, but I've done a lot of research. I'm only asking here because this is time-sensitive, so I apologize if I haven't done enough research on my own.

First, I read in the original PDF:
var pdfDictionary = PDFHelper.ParsePDFByPage(_workItem.FileName);

//Code behind
public static Dictionary<int, string> ParsePDFByPage(string filePath)
{
    var retVal = new Dictionary<int, string>();
    PdfReader reader = new PdfReader(filePath);
    for (int page = 1; page <= reader.NumberOfPages; page++)
    {
        retVal.Add(page, PdfTextExtractor.GetTextFromPage(reader, page, new StructuredTextExtractionStrategy()));
    }
    reader.Close();
    reader.Dispose();
    return retVal;
}
After reading it in, I find which pages are delimiters and create an instance of HMPdf (defined below) for each page range that needs to be split out of the original:
var pdfsToCreate = pdfDictionary.Where(x => x.Value.Contains("DELIMITER"));
var pdfList = new List<HMPdf>();
foreach (var item in pdfsToCreate) //pdfsToCreate = Dictionary<int,string>
{
    //Parsing logic (most removed, just know that this part works fine)
    //After parsing, create new instance of HMPdf and add it to the list
    var pdf = new HMPdf(startPage, endPage, fileName);
    pdfList.Add(pdf);
}
After parsing, I create the PDFs:
foreach (var hmpdf in pdfList)
{
    //I've tried forcing the GC to collect after every 10 pdfs created
    string error = string.Empty;
    if (!hmpdf.TryCreate(sourcePath, destinationPath, out error))
    {
        throw new Exception("Error creating new PDF - " + error);
    }
}
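For reference, the forced collection mentioned in the comment above would look something like this (a sketch; the createdCount counter and the threshold of 10 are illustrative). It reclaimed essentially nothing:

int createdCount = 0;
foreach (var hmpdf in pdfList)
{
    string error = string.Empty;
    if (!hmpdf.TryCreate(sourcePath, destinationPath, out error))
    {
        throw new Exception("Error creating new PDF - " + error);
    }
    if (++createdCount % 10 == 0)
    {
        GC.Collect();                   // blocking full collection
        GC.WaitForPendingFinalizers();  // let finalizers release their state
        GC.Collect();                   // reclaim objects freed by those finalizers
    }
}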
HMPdf code behind:
public class HMPdf
{
    private string _path;
    private string _fileName;
    private PdfCopy _pdfCopy = null;
    private PdfReader _reader = null;
    private Document _sourceDocument = null;
    private PdfImportedPage _importedPage = null;
    private int _pageFrom;
    private int _pageTo;
    private FileStream _fileStream;

    public HMPdf(int pageFrom, int pageTo, string fileName)
    {
        _pageFrom = pageFrom;
        _pageTo = pageTo;
        _fileName = fileName;
    }

    public bool TryCreate(string sourcePath, string destinationPath, out string errorMessage)
    {
        errorMessage = string.Empty;
        try
        {
            _reader = new PdfReader(sourcePath);
            _sourceDocument = new Document(_reader.GetPageSizeWithRotation(_pageFrom));
            _fileStream = new System.IO.FileStream(
                Path.Combine(destinationPath, _fileName.ToLower().Contains(".pdf") ? _fileName : _fileName + ".pdf"),
                System.IO.FileMode.Create);
            _pdfCopy = new PdfCopy(_sourceDocument, _fileStream);
            _sourceDocument.Open();
            for (int i = _pageFrom; i <= _pageTo; i++)
            {
                _importedPage = _pdfCopy.GetImportedPage(_reader, i);
                _pdfCopy.AddPage(_importedPage);
                _importedPage = null;
            }
            return true;
        }
        catch (Exception ex)
        {
            errorMessage = ex.Message;
            return false;
        }
        finally
        {
            if (_reader != null)
            {
                _reader.Close();
                _reader.Dispose();
                _reader = null;
            }
            if (_sourceDocument != null)
            {
                _sourceDocument.Close();
                _sourceDocument.Dispose();
                _sourceDocument = null;
            }
            if (_pdfCopy != null)
            {
                _pdfCopy.Close();
                _pdfCopy.Dispose();
                _pdfCopy = null;
            }
            if (_fileStream != null)
            {
                _fileStream.Close();
                _fileStream.Dispose();
                _fileStream = null;
            }
        }
    }
}
As you can tell, I'm closing/disposing of all the open file streams, readers, etc. (right?). I've tried forcing the garbage collector to run after every 10 PDFs created, but it doesn't clean anything up. I ran Telerik JustTrace, and with what little I know about memory management, a couple of things stood out. First, between several snapshots there were 0 disposed objects, and in the last snapshot the pdfList object was taking up almost 1 GB of memory.

Am I missing something obvious?

Sorry for the lengthy write-up.
Perhaps you are proving The Dangers of the Large Object Heap...
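(An aside, hedged: if a fragmented Large Object Heap really is the culprit, .NET 4.5.1 and later let you ask the runtime to compact the LOH during the next full collection; this knob does not exist on older runtimes, so it may or may not apply to your setup.)

using System;
using System.Runtime;

static void CompactLargeObjectHeap()
{
    // One-shot setting: the next blocking Gen2 collection also compacts
    // the LOH, after which the mode reverts to Default on its own.
    GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
    GC.Collect();
}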
Try to improve your logic in ways that reduce memory usage.
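For instance, ParsePDFByPage keeps the extracted text of all 10,000+ pages alive in one dictionary. If that text is only used to find the delimiter pages, recording just the page numbers lets each page's text become garbage immediately. A sketch of that idea (FindDelimiterPages is a hypothetical helper, and SimpleTextExtractionStrategy stands in for your StructuredTextExtractionStrategy, which isn't shown):

using System.Collections.Generic;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

public static List<int> FindDelimiterPages(string filePath, string delimiter)
{
    var delimiterPages = new List<int>();
    using (var reader = new PdfReader(filePath))
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            // Only the page number survives this iteration; the page text
            // is eligible for collection as soon as the check is done.
            var text = PdfTextExtractor.GetTextFromPage(reader, page, new SimpleTextExtractionStrategy());
            if (text.Contains(delimiter))
            {
                delimiterPages.Add(page);
            }
        }
    }
    return delimiterPages;
}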
And narrow the scope of your variables as much as possible; that is, don't create class-level fields where local variables will do.

Try something like the following to narrow the scope of your variables:
public bool TryCreate(string sourcePath, string destinationPath, out string errorMessage)
{
    errorMessage = string.Empty;
    try
    {
        using (var reader = new PdfReader(sourcePath))
        using (var sourceDocument = new Document(reader.GetPageSizeWithRotation(_pageFrom)))
        using (var fileStream = new System.IO.FileStream(
            Path.Combine(destinationPath, _fileName.ToLower().Contains(".pdf") ? _fileName : _fileName + ".pdf"),
            System.IO.FileMode.Create))
        using (var pdfCopy = new PdfCopy(sourceDocument, fileStream))
        {
            sourceDocument.Open();
            for (int i = _pageFrom; i <= _pageTo; i++)
            {
                pdfCopy.AddPage(pdfCopy.GetImportedPage(reader, i));
            }
        }
        return true;
    }
    catch (Exception ex)
    {
        errorMessage = ex.Message;
        return false;
    }
}
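One more thing worth trying, sketched under assumptions (the CreateSplit helper and its parameter names are illustrative, not part of your code): TryCreate re-opens and re-parses the entire 10,000-page source for every output file. iTextSharp can open a PdfReader in partial mode via RandomAccessFileOrArray, so pages are pulled from disk on demand, and PdfCopy.FreeReader can flush imported data out of the writer as you go:

using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;

public static void CreateSplit(PdfReader reader, string destinationPath,
                               string fileName, int pageFrom, int pageTo)
{
    var outName = fileName.ToLower().Contains(".pdf") ? fileName : fileName + ".pdf";
    using (var fileStream = new FileStream(Path.Combine(destinationPath, outName), FileMode.Create))
    using (var sourceDocument = new Document(reader.GetPageSizeWithRotation(pageFrom)))
    using (var pdfCopy = new PdfCopy(sourceDocument, fileStream))
    {
        sourceDocument.Open();
        for (int i = pageFrom; i <= pageTo; i++)
        {
            pdfCopy.AddPage(pdfCopy.GetImportedPage(reader, i));
        }
        // Write the pages imported from this reader into the output file
        // so the writer stops holding them in memory.
        pdfCopy.FreeReader(reader);
    }
}

// Usage: one partially-loaded reader shared by every split.
// var reader = new PdfReader(new RandomAccessFileOrArray(sourcePath), null);
// foreach (var hmpdf in pdfList) { /* call CreateSplit with its page range */ }
// reader.Close();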