c# 如何从 'iTextSharp.text.pdf.PdfArray' 转换为 'iTextSharp.text.pdf.PRIndirectReference'
c# How to cast from 'iTextSharp.text.pdf.PdfArray' to 'iTextSharp.text.pdf.PRIndirectReference'
我一直在使用这段代码,直到今天它运行良好:
for (int page = 1; page <= reader.NumberOfPages; page++)
{
var cpage = reader.GetPageN(page);
var content = cpage.Get(PdfName.CONTENTS);
var ir = (PRIndirectReference)content;
var value = reader.GetPdfObject(ir.Number);
if (value.IsStream())
{
PRStream stream = (PRStream)value;
var streamBytes = PdfReader.GetStreamBytes(stream);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
string strs = tokenizer.StringValue;
if (!(br = excludeList.Any(st => strs.Contains(st))))
{
//strfor += tokenizer.StringValue;
if (!string.IsNullOrWhiteSpace(strs) &&
!stringsList.Any(i => i == strs && excludeHeaders.Contains(strs)))
stringsList.Add(strs);
}
}
}
}
finally
{
tokenizer.Close();
}
}
}
但是今天我得到了一些 pdf 文件的异常:Unable to cast object of type 'iTextSharp.text.pdf.PdfArray' to type 'iTextSharp.text.pdf.PRIndirectReference
在调试时我知道错误在这一行:var ir = (PRIndirectReference)content;
。那是因为我提取的 pdf 内容,我以 ArrayList
的形式获取它,如下图所示:
如果有人能帮助我,我将不胜感激。提前致谢。
EDIT :
pdf 内容是段落、表格、页眉和页脚,少数情况下是图片。但是我不在乎图像,因为我绕过了它们。
正如您从代码中看到的那样,我正在尝试将单词添加到字符串列表中,因此我希望输出为纯文本;具体的话。
这真的很简单!不知道为什么我看不出来
PdfReader reader = new PdfReader(name);
List<string> stringsList = new List<string>();
for (int page = 1; page <= reader.NumberOfPages; page++)
{
//directly get the contents into a byte stream
var streamByte = reader.GetPageContent(page);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamByte));
var sb = new StringBuilder(); //use a string builder instead
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
var currentText = tokenizer.StringValue;
currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
sb.Append(tokenizer.StringValue);
}
}
}
finally
{
//add appended strings into a string list
if(sb != null)
stringsList.Add(sb.ToString());
tokenizer.Close();
}
}
我一直在使用这段代码,直到今天它运行良好:
for (int page = 1; page <= reader.NumberOfPages; page++)
{
var cpage = reader.GetPageN(page);
var content = cpage.Get(PdfName.CONTENTS);
var ir = (PRIndirectReference)content;
var value = reader.GetPdfObject(ir.Number);
if (value.IsStream())
{
PRStream stream = (PRStream)value;
var streamBytes = PdfReader.GetStreamBytes(stream);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
string strs = tokenizer.StringValue;
if (!(br = excludeList.Any(st => strs.Contains(st))))
{
//strfor += tokenizer.StringValue;
if (!string.IsNullOrWhiteSpace(strs) &&
!stringsList.Any(i => i == strs && excludeHeaders.Contains(strs)))
stringsList.Add(strs);
}
}
}
}
finally
{
tokenizer.Close();
}
}
}
但是今天我得到了一些 pdf 文件的异常:Unable to cast object of type 'iTextSharp.text.pdf.PdfArray' to type 'iTextSharp.text.pdf.PRIndirectReference
在调试时我知道错误在这一行:var ir = (PRIndirectReference)content;
。那是因为我提取的 pdf 内容,我以 ArrayList
的形式获取它,如下图所示:
如果有人能帮助我,我将不胜感激。提前致谢。
EDIT :
pdf 内容是段落、表格、页眉和页脚,少数情况下是图片。但是我不在乎图像,因为我绕过了它们。
正如您从代码中看到的那样,我正在尝试将单词添加到字符串列表中,因此我希望输出为纯文本;具体的话。
这真的很简单!不知道为什么我看不出来
PdfReader reader = new PdfReader(name);
List<string> stringsList = new List<string>();
for (int page = 1; page <= reader.NumberOfPages; page++)
{
//directly get the contents into a byte stream
var streamByte = reader.GetPageContent(page);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamByte));
var sb = new StringBuilder(); //use a string builder instead
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
var currentText = tokenizer.StringValue;
currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
sb.Append(tokenizer.StringValue);
}
}
}
finally
{
//add appended strings into a string list
if(sb != null)
stringsList.Add(sb.ToString());
tokenizer.Close();
}
}