iTextSharp 提取每个字符并获取矩形
iTextSharp extract each character and getRectangle
我想逐个字符地解析整个 PDF,并能够获取该 PDF 文档上该字符的 ASCII 值、字体和矩形,稍后我可以将其用于另存为位图。我尝试使用 PdfTextExtractor.GetTextFromPage 但它以字符串形式给出了 PDF 中的整个文本。
与 iTextSharp 捆绑在一起的文本提取策略(特别是不带策略参数的 PdfTextExtractor.GetTextFromPage 重载
默认使用的 LocationTextExtractionStrategy)
只允许直接访问收集到的纯文本,而无法访问文本的位置信息。
Chris Haas 的 MyLocationTextExtractionStrategy
@Chris Haas 在他此前的一个回答中介绍了 LocationTextExtractionStrategy
的以下扩展:
/// <summary>
/// Extension of <see cref="LocationTextExtractionStrategy"/> that, in addition to whatever
/// the base strategy does, records every rendered text chunk together with a rectangle
/// built from the chunk's descent and ascent lines.
/// </summary>
public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy {
    /// <summary>Chunks recorded so far, in the order RenderText received them.</summary>
    public List<RectAndText> myPoints = new List<RectAndText>();

    /// <summary>
    /// Called for each chunk of text in the PDF (per the original author's note).
    /// Forwards the chunk to the base strategy, then stores its text and rectangle.
    /// </summary>
    /// <param name="renderInfo">Render information describing the current text chunk.</param>
    public override void RenderText(TextRenderInfo renderInfo) {
        // Forward to the base implementation first so its own processing is unaffected
        base.RenderText(renderInfo);

        // One corner is the start of the descent line, the opposite one the end of the ascent line
        Vector lowerLeft = renderInfo.GetDescentLine().GetStartPoint();
        Vector upperRight = renderInfo.GetAscentLine().GetEndPoint();

        float llx = lowerLeft[Vector.I1];
        float lly = lowerLeft[Vector.I2];
        float urx = upperRight[Vector.I1];
        float ury = upperRight[Vector.I2];

        // Remember the chunk together with the rectangle spanned by those two corners
        var box = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
        myPoints.Add(new RectAndText(box, renderInfo.GetText()));
    }
}
它用到了下面这个辅助类:
/// <summary>
/// Simple holder that pairs a piece of text with a rectangle.
/// </summary>
public class RectAndText {
    /// <summary>The rectangle passed to the constructor.</summary>
    public iTextSharp.text.Rectangle Rect;

    /// <summary>The text passed to the constructor.</summary>
    public string Text;

    /// <summary>Creates a holder for the given rectangle and text.</summary>
    /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    public RectAndText(iTextSharp.text.Rectangle rect, string text) {
        Rect = rect;
        Text = text;
    }
}
此策略将文本块及其外接矩形保存在 public 成员 List<RectAndText> myPoints
中,您可以像这样访问它:
// Create the strategy that will record each text chunk and its rectangle
var t = new MyLocationTextExtractionStrategy();

// Parse page 1 of the document. The returned plain text is not needed here:
// the chunks are read from t.myPoints afterwards, so the result is not stored.
using (var r = new PdfReader(testFile)) {
    PdfTextExtractor.GetTextFromPage(r, 1, t);
}

// Print every recorded chunk with the Left/Bottom coordinates of its rectangle
foreach (var p in t.myPoints) {
    // Console.WriteLine formats directly; wrapping the call in string.Format is redundant
    Console.WriteLine("Found text {0} at {1}x{2}", p.Text, p.Rect.Left, p.Rect.Bottom);
}
对于您的任务(逐个字符解析整个 PDF,并获取每个字符的 ASCII 值、字体和矩形),这里只有两个细节不符合要求:
- 这样返回的文本块可能包含多个字符
- 未提供字体信息。
因此,我们必须稍微调整一下:
一个新的CharLocationTextExtractionStrategy
在 MyLocationTextExtractionStrategy
class 的功能基础上,CharLocationTextExtractionStrategy
还会按字形拆分输入,并额外提供字体名称:
/// <summary>
/// Extension of <see cref="LocationTextExtractionStrategy"/> that records one entry per
/// character render info of every text chunk: its rectangle, its text, and the
/// PostScript name of its font.
/// </summary>
public class CharLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
    /// <summary>Entries recorded so far, in the order they were produced.</summary>
    public List<RectAndTextAndFont> myPoints = new List<RectAndTextAndFont>();

    /// <summary>
    /// Called for each chunk of text in the PDF (per the original author's note).
    /// Forwards the whole chunk to the base strategy, then splits it into its
    /// character render infos and records each of them separately.
    /// </summary>
    /// <param name="wholeRenderInfo">Render information describing the complete text chunk.</param>
    public override void RenderText(TextRenderInfo wholeRenderInfo)
    {
        // The base strategy still receives the unsplit chunk
        base.RenderText(wholeRenderInfo);

        foreach (TextRenderInfo glyphInfo in wholeRenderInfo.GetCharacterRenderInfos())
        {
            iTextSharp.text.Rectangle box = BoxFor(glyphInfo);
            string glyphText = glyphInfo.GetText();
            string fontName = glyphInfo.GetFont().PostscriptFontName;

            myPoints.Add(new RectAndTextAndFont(box, glyphText, fontName));
        }
    }

    // Builds the rectangle spanned by the start of the descent line and the end of the ascent line.
    private static iTextSharp.text.Rectangle BoxFor(TextRenderInfo info)
    {
        Vector lowerLeft = info.GetDescentLine().GetStartPoint();
        Vector upperRight = info.GetAscentLine().GetEndPoint();

        return new iTextSharp.text.Rectangle(
            lowerLeft[Vector.I1],
            lowerLeft[Vector.I2],
            upperRight[Vector.I1],
            upperRight[Vector.I2]);
    }
}
/// <summary>
/// Simple holder that groups a rectangle, a piece of text, and a font name.
/// </summary>
public class RectAndTextAndFont
{
    /// <summary>The rectangle passed to the constructor.</summary>
    public iTextSharp.text.Rectangle Rect;

    /// <summary>The text passed to the constructor.</summary>
    public string Text;

    /// <summary>The font name passed to the constructor.</summary>
    public string Font;

    /// <summary>Creates a holder for the given rectangle, text, and font name.</summary>
    /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    /// <param name="font">Font name to store in <see cref="Font"/>.</param>
    public RectAndTextAndFont(iTextSharp.text.Rectangle rect, string text, string font)
    {
        Rect = rect;
        Text = text;
        Font = font;
    }
}
像这样使用这个策略:
// Strategy instance that will hold the per-character entries after parsing
var strategy = new CharLocationTextExtractionStrategy();

// Run the extractor over page 1; the entries are read from strategy.myPoints below
using (PdfReader reader = new PdfReader(testFile))
{
    PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
}

// Print each entry: its text, its font name, and the Left/Bottom coordinates of its rectangle
foreach (RectAndTextAndFont entry in strategy.myPoints)
{
    string line = string.Format("<{0}> in {3} at {1}x{2}", entry.Text, entry.Rect.Left, entry.Rect.Bottom, entry.Font);
    Console.WriteLine(line);
}
这样您就可以逐个字符地获取信息,其中也包括字体。
我想逐个字符地解析整个 PDF,并能够获取该 PDF 文档上该字符的 ASCII 值、字体和矩形,稍后我可以将其用于另存为位图。我尝试使用 PdfTextExtractor.GetTextFromPage 但它以字符串形式给出了 PDF 中的整个文本。
与 iTextSharp 捆绑在一起的文本提取策略(特别是不带策略参数的 PdfTextExtractor.GetTextFromPage 重载
默认使用的 LocationTextExtractionStrategy)
只允许直接访问收集到的纯文本,而无法访问文本的位置信息。
Chris Haas 的 MyLocationTextExtractionStrategy
@Chris Haas 在他此前的一个回答中介绍了 LocationTextExtractionStrategy 的以下扩展:
/// <summary>
/// Extension of <see cref="LocationTextExtractionStrategy"/> that, in addition to whatever
/// the base strategy does, records every rendered text chunk together with a rectangle
/// built from the chunk's descent and ascent lines.
/// </summary>
public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy {
    /// <summary>Chunks recorded so far, in the order RenderText received them.</summary>
    public List<RectAndText> myPoints = new List<RectAndText>();

    /// <summary>
    /// Called for each chunk of text in the PDF (per the original author's note).
    /// Forwards the chunk to the base strategy, then stores its text and rectangle.
    /// </summary>
    /// <param name="renderInfo">Render information describing the current text chunk.</param>
    public override void RenderText(TextRenderInfo renderInfo) {
        // Forward to the base implementation first so its own processing is unaffected
        base.RenderText(renderInfo);

        // One corner is the start of the descent line, the opposite one the end of the ascent line
        Vector lowerLeft = renderInfo.GetDescentLine().GetStartPoint();
        Vector upperRight = renderInfo.GetAscentLine().GetEndPoint();

        float llx = lowerLeft[Vector.I1];
        float lly = lowerLeft[Vector.I2];
        float urx = upperRight[Vector.I1];
        float ury = upperRight[Vector.I2];

        // Remember the chunk together with the rectangle spanned by those two corners
        var box = new iTextSharp.text.Rectangle(llx, lly, urx, ury);
        myPoints.Add(new RectAndText(box, renderInfo.GetText()));
    }
}
它用到了下面这个辅助类:
/// <summary>
/// Simple holder that pairs a piece of text with a rectangle.
/// </summary>
public class RectAndText {
    /// <summary>The rectangle passed to the constructor.</summary>
    public iTextSharp.text.Rectangle Rect;

    /// <summary>The text passed to the constructor.</summary>
    public string Text;

    /// <summary>Creates a holder for the given rectangle and text.</summary>
    /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    public RectAndText(iTextSharp.text.Rectangle rect, string text) {
        Rect = rect;
        Text = text;
    }
}
此策略将文本块及其外接矩形保存在 public 成员 List<RectAndText> myPoints
中,您可以像这样访问它:
// Create the strategy that will record each text chunk and its rectangle
var t = new MyLocationTextExtractionStrategy();

// Parse page 1 of the document. The returned plain text is not needed here:
// the chunks are read from t.myPoints afterwards, so the result is not stored.
using (var r = new PdfReader(testFile)) {
    PdfTextExtractor.GetTextFromPage(r, 1, t);
}

// Print every recorded chunk with the Left/Bottom coordinates of its rectangle
foreach (var p in t.myPoints) {
    // Console.WriteLine formats directly; wrapping the call in string.Format is redundant
    Console.WriteLine("Found text {0} at {1}x{2}", p.Text, p.Rect.Left, p.Rect.Bottom);
}
对于您的任务(逐个字符解析整个 PDF,并获取每个字符的 ASCII 值、字体和矩形),这里只有两个细节不符合要求:
- 这样返回的文本块可能包含多个字符
- 未提供字体信息。
因此,我们必须稍微调整一下:
一个新的CharLocationTextExtractionStrategy
在 MyLocationTextExtractionStrategy
class 的功能基础上,CharLocationTextExtractionStrategy
还会按字形拆分输入,并额外提供字体名称:
/// <summary>
/// Extension of <see cref="LocationTextExtractionStrategy"/> that records one entry per
/// character render info of every text chunk: its rectangle, its text, and the
/// PostScript name of its font.
/// </summary>
public class CharLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
    /// <summary>Entries recorded so far, in the order they were produced.</summary>
    public List<RectAndTextAndFont> myPoints = new List<RectAndTextAndFont>();

    /// <summary>
    /// Called for each chunk of text in the PDF (per the original author's note).
    /// Forwards the whole chunk to the base strategy, then splits it into its
    /// character render infos and records each of them separately.
    /// </summary>
    /// <param name="wholeRenderInfo">Render information describing the complete text chunk.</param>
    public override void RenderText(TextRenderInfo wholeRenderInfo)
    {
        // The base strategy still receives the unsplit chunk
        base.RenderText(wholeRenderInfo);

        foreach (TextRenderInfo glyphInfo in wholeRenderInfo.GetCharacterRenderInfos())
        {
            iTextSharp.text.Rectangle box = BoxFor(glyphInfo);
            string glyphText = glyphInfo.GetText();
            string fontName = glyphInfo.GetFont().PostscriptFontName;

            myPoints.Add(new RectAndTextAndFont(box, glyphText, fontName));
        }
    }

    // Builds the rectangle spanned by the start of the descent line and the end of the ascent line.
    private static iTextSharp.text.Rectangle BoxFor(TextRenderInfo info)
    {
        Vector lowerLeft = info.GetDescentLine().GetStartPoint();
        Vector upperRight = info.GetAscentLine().GetEndPoint();

        return new iTextSharp.text.Rectangle(
            lowerLeft[Vector.I1],
            lowerLeft[Vector.I2],
            upperRight[Vector.I1],
            upperRight[Vector.I2]);
    }
}
/// <summary>
/// Simple holder that groups a rectangle, a piece of text, and a font name.
/// </summary>
public class RectAndTextAndFont
{
    /// <summary>The rectangle passed to the constructor.</summary>
    public iTextSharp.text.Rectangle Rect;

    /// <summary>The text passed to the constructor.</summary>
    public string Text;

    /// <summary>The font name passed to the constructor.</summary>
    public string Font;

    /// <summary>Creates a holder for the given rectangle, text, and font name.</summary>
    /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    /// <param name="font">Font name to store in <see cref="Font"/>.</param>
    public RectAndTextAndFont(iTextSharp.text.Rectangle rect, string text, string font)
    {
        Rect = rect;
        Text = text;
        Font = font;
    }
}
像这样使用这个策略:
// Strategy instance that will hold the per-character entries after parsing
var strategy = new CharLocationTextExtractionStrategy();

// Run the extractor over page 1; the entries are read from strategy.myPoints below
using (PdfReader reader = new PdfReader(testFile))
{
    PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
}

// Print each entry: its text, its font name, and the Left/Bottom coordinates of its rectangle
foreach (RectAndTextAndFont entry in strategy.myPoints)
{
    string line = string.Format("<{0}> in {3} at {1}x{2}", entry.Text, entry.Rect.Left, entry.Rect.Bottom, entry.Font);
    Console.WriteLine(line);
}
这样您就可以逐个字符地获取信息,其中也包括字体。