c# 如何从 'iTextSharp.text.pdf.PdfArray' 转换为 'iTextSharp.text.pdf.PRIndirectReference'

c# How to cast from 'iTextSharp.text.pdf.PdfArray' to 'iTextSharp.text.pdf.PRIndirectReference'

我一直在使用这段代码,直到今天它运行良好:

// Walks every page of the document, resolves the page's /Contents entry to a
// stream and collects the string tokens found in that stream into stringsList.
// reader, br, excludeList, excludeHeaders and stringsList are declared outside
// this snippet.
for (int page = 1; page <= reader.NumberOfPages; page++)
{
    var cpage = reader.GetPageN(page);
    var content = cpage.Get(PdfName.CONTENTS);

    // NOTE(review): this cast assumes /Contents is always a single indirect
    // reference. For pages where Get(PdfName.CONTENTS) returns a PdfArray the
    // cast throws InvalidCastException (the failure reported for this code).
    var ir = (PRIndirectReference)content;

    // Resolve the reference to the actual PDF object by its object number.
    var value = reader.GetPdfObject(ir.Number);

    if (value.IsStream())
    {
        PRStream stream = (PRStream)value;

        var streamBytes = PdfReader.GetStreamBytes(stream);

        var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));

        try
        {
            while (tokenizer.NextToken())
            {
                // Only string tokens are of interest; every other token type is skipped.
                if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                {
                    string strs = tokenizer.StringValue;

                    // br records whether the string contains any entry of excludeList;
                    // such strings are dropped.
                    if (!(br = excludeList.Any(st => strs.Contains(st))))
                    {
                        //strfor += tokenizer.StringValue;

                        // Skip blank strings. A string is also skipped when it is already
                        // in stringsList AND listed in excludeHeaders, so header strings
                        // are stored at most once while other strings may repeat.
                        if (!string.IsNullOrWhiteSpace(strs) &&
                            !stringsList.Any(i => i == strs && excludeHeaders.Contains(strs)))
                            stringsList.Add(strs);
                    }
                }
            }
        }
        finally
        {
            // Always release the tokenizer, even if tokenizing fails.
            tokenizer.Close();
        }
    }
}

但是今天在处理某些 pdf 文件时出现了异常:Unable to cast object of type 'iTextSharp.text.pdf.PdfArray' to type 'iTextSharp.text.pdf.PRIndirectReference'

调试时我发现错误出在这一行:var ir = (PRIndirectReference)content;。原因是对于这些 pdf,取出的页面内容不是单个间接引用,而是一个数组(PdfArray,内部为 ArrayList),如下图所示:

如果有人能帮助我,我将不胜感激。提前致谢。

EDIT :

pdf 内容是段落、表格、页眉和页脚,少数情况下是图片。但是我不在乎图像,因为我绕过了它们。

正如您从代码中看到的那样,我正在尝试将单词添加到字符串列表中,因此我希望输出为纯文本,即具体的单词。

其实很简单!不知道为什么我之前没看出来。页面的 /Contents 既可能是单个流,也可能是由多个流组成的数组;reader.GetPageContent(page) 会直接返回该页内容的字节,两种情况都能处理,因此不需要手动转换:

// Collects the string tokens of every page into stringsList, one entry per page.
PdfReader reader = new PdfReader(name);
List<string> stringsList = new List<string>();

try
{
    for (int page = 1; page <= reader.NumberOfPages; page++)
    {
        // GetPageContent returns the page's content bytes directly, whether
        // /Contents is a single stream or an array of streams, so no manual
        // cast of the /Contents entry is needed.
        var streamByte = reader.GetPageContent(page);
        var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamByte));
        var sb = new StringBuilder();

        try
        {
            while (tokenizer.NextToken())
            {
                // Only string tokens carry text; operators, numbers and names are skipped.
                if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                {
                    // StringValue is appended as-is; no encoding round-trip is applied.
                    sb.Append(tokenizer.StringValue);
                }
            }
        }
        finally
        {
            // Kept in finally so the text read before a tokenizer failure is
            // still recorded for this page.
            stringsList.Add(sb.ToString());

            tokenizer.Close();
        }
    }
}
finally
{
    // Release the underlying file/source held by the reader.
    reader.Close();
}