无法在 C# 中重写 PDFTextStripper.writeString(String text, List<TextPosition> textPositions) 方法?
Not able to override PDFTextStripper.writeString(String text, List<TextPosition> textPositions) method using c#?
我正在使用 PdfBox 的 .NET 版本从 PDF 中提取文本及其位置。为此,我在搜索时找到了以下 Java 代码:
// Stripper that appends a "[startX - endX / y]" marker after each chunk of
// text it writes.
PDFTextStripper stripper = new PDFTextStripper()
{
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
    {
        // Let the base class write the text itself first.
        super.writeString(text, textPositions);

        // The first and last glyph positions delimit the chunk horizontally.
        TextPosition head = textPositions.get(0);
        TextPosition tail = textPositions.get(textPositions.size() - 1);

        float startX = head.getXDirAdj();
        float endX = tail.getXDirAdj() + tail.getWidthDirAdj();
        float y = head.getYDirAdj();

        // The single-argument writeString overload emits the marker.
        writeString(String.format("[%s - %s / %s]", startX, endX, y));
    }
};
stripper.setSortByPosition(true);
return stripper.getText(document);
我通过以下方式将其转换为 .net:
/// <summary>
/// C# port of the Java stripper: records a description of every character it
/// receives and appends a "[startX - endX / y]" marker after each written string.
/// </summary>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>Accumulated per-character position descriptions.</summary>
    public string textWithPostion = "";

    /// <summary>
    /// Appends the position, font size, scale, height, space width, width and
    /// character of <paramref name="text"/> to <see cref="textWithPostion"/>.
    /// </summary>
    /// <param name="text">Character and its position data.</param>
    protected override void processTextPosition(TextPosition text)
    {
        textWithPostion += "String[" + text.getXDirAdj() + "," +
            text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" +
            text.getXScale() + " height=" + text.getHeightDir() + " space=" +
            text.getWidthOfSpace() + " width=" +
            text.getWidthDirAdj() + "]" + text.getCharacter();
    }

    /// <summary>
    /// Writes <paramref name="text"/> through the base class, then writes a
    /// marker holding the X of the first position, the right edge of the last
    /// position and the Y of the first position.
    /// </summary>
    /// <param name="text">Text chunk being written.</param>
    /// <param name="textPositions">Positions of the characters that make up the chunk.</param>
    protected override void writeString(java.lang.String text, java.util.List textPositions)
    {
        base.writeString(text, textPositions);
        TextPosition firstProsition = (TextPosition)textPositions.get(0);
        TextPosition lastPosition = (TextPosition)textPositions.get(textPositions.size() - 1);
        // .NET composite formatting uses {0}-style placeholders; the Java-style
        // "%s" specifiers would be emitted literally and the values ignored.
        writeString(String.Format("[{0} - {1} / {2}]", firstProsition.getXDirAdj(), lastPosition.getXDirAdj() + lastPosition.getWidthDirAdj(), firstProsition.getYDirAdj()));
    }
}
但是,上面的代码出现编译错误:
Error 1 No overload for method 'writeString' takes 2 arguments
Error 2 'PDFTextLocationStripper.writeString(java.lang.String, java.util.List)': no suitable method found to override
那么,如何重写 writeString 方法以便我可以提取文本和位置?
由于无法重写 writeString 方法,我改用 processTextPosition 从 PDF 中提取单词及其位置。代码如下:
/// <summary>
/// Text stripper that assembles the characters it receives into
/// <see cref="PdfWord"/> objects. Words are indexed first by the character's
/// Y coordinate and then by the word's right edge (<see cref="PdfWord.Right"/>).
/// </summary>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>
    /// Largest distance between an existing word's right edge and a new
    /// character's X position at which the character is still appended to
    /// that word (same units as <c>getXDirAdj()</c>).
    /// </summary>
    private const float MaxJoinDistance = 1f;

    public string textWithPostion = "";

    /// <summary>
    /// Words collected so far: the outer key is the Y coordinate, the inner
    /// key is the word's right edge.
    /// </summary>
    public Dictionary<float, Dictionary<float, PdfWord>> pdfWordsByXByY;

    /// <summary>Creates a stripper with an empty word index.</summary>
    public PDFTextLocationStripper() : base()
    {
        textWithPostion = "";
        pdfWordsByXByY = new Dictionary<float, Dictionary<float, PdfWord>>();
    }

    /// <summary>
    /// Adds one character to the word index. Whitespace characters are
    /// skipped. A character whose X position matches (exactly, or within
    /// <see cref="MaxJoinDistance"/>) the right edge of a word on the same Y
    /// is appended to that word; otherwise it starts a new word.
    /// </summary>
    /// <param name="text">Character and its position data.</param>
    /// <remarks>
    /// NOTE(review): the base implementation is not called here; presumably
    /// the stripper's own text output is not needed - confirm.
    /// </remarks>
    protected override void processTextPosition(TextPosition text)
    {
        try
        {
            float textX = text.getXDirAdj();
            float textY = text.getYDirAdj();
            if (String.IsNullOrWhiteSpace(text.getCharacter()))
            {
                return;
            }

            Dictionary<float, PdfWord> wordsByX;
            if (!pdfWordsByXByY.TryGetValue(textY, out wordsByX))
            {
                // First character seen at this Y: start a new line with one word.
                PdfWord firstWord = StartWord(text, textX, textY);
                wordsByX = new Dictionary<float, PdfWord>();
                wordsByX.Add(firstWord.Right, firstWord);
                pdfWordsByXByY.Add(textY, wordsByX);
                return;
            }

            float nearestKey;
            if (wordsByX.ContainsKey(textX))
            {
                ExtendWord(wordsByX, textX, text, textX);
            }
            else if (TryFindNearestKey(wordsByX, textX, out nearestKey))
            {
                ExtendWord(wordsByX, nearestKey, text, textX);
            }
            else
            {
                PdfWord word = StartWord(text, textX, textY);
                // An already occupied right edge keeps its existing word.
                if (!wordsByX.ContainsKey(word.Right))
                {
                    wordsByX.Add(word.Right, word);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failure on one character must not abort the
            // extraction, but it is reported instead of being hidden.
            System.Diagnostics.Trace.TraceError("PDFTextLocationStripper.processTextPosition failed: " + ex);
        }
    }

    /// <summary>Creates a one-character word at the given position.</summary>
    private static PdfWord StartWord(TextPosition text, float textX, float textY)
    {
        PdfWord word = new PdfWord();
        word.Text = text.getCharacter();
        word.EndX = word.StartX = textX;
        word.Y = textY;
        word.EndCharWidth = word.StartCharWidth = text.getWidthDirAdj();
        word.Height = text.getHeightDir();
        return word;
    }

    /// <summary>
    /// Appends the character to the word stored under <paramref name="key"/>
    /// and re-files the word under its new right edge.
    /// </summary>
    private static void ExtendWord(Dictionary<float, PdfWord> wordsByX, float key, TextPosition text, float textX)
    {
        PdfWord word = wordsByX[key];
        // Remove under the key the word was found at, so no stale entry remains.
        wordsByX.Remove(key);
        word.EndCharWidth = text.getWidthDirAdj();
        word.Height = text.getHeightDir();
        word.EndX = textX;
        word.Text += text.getCharacter();
        // If another word already owns the new right edge, that word is kept.
        if (!wordsByX.ContainsKey(word.Right))
        {
            wordsByX.Add(word.Right, word);
        }
    }

    /// <summary>
    /// Finds the key closest to <paramref name="textX"/>. Returns true only
    /// when that key lies within <see cref="MaxJoinDistance"/>. On ties the
    /// first key in enumeration order wins.
    /// </summary>
    private static bool TryFindNearestKey(Dictionary<float, PdfWord> wordsByX, float textX, out float nearestKey)
    {
        nearestKey = 0f;
        float minDiff = float.MaxValue;
        foreach (float key in wordsByX.Keys)
        {
            float diff = Math.Abs(key - textX);
            if (diff < minDiff)
            {
                minDiff = diff;
                nearestKey = key;
            }
        }
        // With no keys minDiff stays at float.MaxValue, so no sentinel value
        // is needed and negative coordinates are handled like any other.
        return minDiff <= MaxJoinDistance;
    }
}
下面是 PdfWord 类:
/// <summary>
/// A word assembled character by character, together with the position
/// data the stripper in this file records for it.
/// </summary>
class PdfWord
{
    /// <summary>X position of the first character of the word.</summary>
    public float StartX { get; set; }

    /// <summary>X position of the most recently appended character.</summary>
    public float EndX { get; set; }

    /// <summary>Y position shared by the characters of the word.</summary>
    public float Y { get; set; }

    /// <summary>Width of the first character.</summary>
    public float StartCharWidth { get; set; }

    /// <summary>Width of the most recently appended character.</summary>
    public float EndCharWidth { get; set; }

    /// <summary>Height taken from the most recently appended character.</summary>
    public float Height { get; set; }

    /// <summary>Characters collected so far.</summary>
    public string Text { get; set; }

    /// <summary>Right edge of the word: <see cref="EndX"/> plus <see cref="EndCharWidth"/>.</summary>
    public float Right
    {
        get
        {
            float rightEdge = EndX + EndCharWidth;
            return rightEdge;
        }
    }
}
我正在使用 PdfBox 的 .NET 版本从 PDF 中提取文本及其位置。为此,我在搜索时找到了以下 Java 代码:
// Stripper that appends a "[startX - endX / y]" marker after each chunk of
// text it writes.
PDFTextStripper stripper = new PDFTextStripper()
{
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException
    {
        // Let the base class write the text itself first.
        super.writeString(text, textPositions);

        // The first and last glyph positions delimit the chunk horizontally.
        TextPosition head = textPositions.get(0);
        TextPosition tail = textPositions.get(textPositions.size() - 1);

        float startX = head.getXDirAdj();
        float endX = tail.getXDirAdj() + tail.getWidthDirAdj();
        float y = head.getYDirAdj();

        // The single-argument writeString overload emits the marker.
        writeString(String.format("[%s - %s / %s]", startX, endX, y));
    }
};
stripper.setSortByPosition(true);
return stripper.getText(document);
我通过以下方式将其转换为 .net:
/// <summary>
/// C# port of the Java stripper: records a description of every character it
/// receives and appends a "[startX - endX / y]" marker after each written string.
/// </summary>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>Accumulated per-character position descriptions.</summary>
    public string textWithPostion = "";

    /// <summary>
    /// Appends the position, font size, scale, height, space width, width and
    /// character of <paramref name="text"/> to <see cref="textWithPostion"/>.
    /// </summary>
    /// <param name="text">Character and its position data.</param>
    protected override void processTextPosition(TextPosition text)
    {
        textWithPostion += "String[" + text.getXDirAdj() + "," +
            text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" +
            text.getXScale() + " height=" + text.getHeightDir() + " space=" +
            text.getWidthOfSpace() + " width=" +
            text.getWidthDirAdj() + "]" + text.getCharacter();
    }

    /// <summary>
    /// Writes <paramref name="text"/> through the base class, then writes a
    /// marker holding the X of the first position, the right edge of the last
    /// position and the Y of the first position.
    /// </summary>
    /// <param name="text">Text chunk being written.</param>
    /// <param name="textPositions">Positions of the characters that make up the chunk.</param>
    protected override void writeString(java.lang.String text, java.util.List textPositions)
    {
        base.writeString(text, textPositions);
        TextPosition firstProsition = (TextPosition)textPositions.get(0);
        TextPosition lastPosition = (TextPosition)textPositions.get(textPositions.size() - 1);
        // .NET composite formatting uses {0}-style placeholders; the Java-style
        // "%s" specifiers would be emitted literally and the values ignored.
        writeString(String.Format("[{0} - {1} / {2}]", firstProsition.getXDirAdj(), lastPosition.getXDirAdj() + lastPosition.getWidthDirAdj(), firstProsition.getYDirAdj()));
    }
}
但是,上面的代码出现编译错误:
Error 1 No overload for method 'writeString' takes 2 arguments
Error 2 'PDFTextLocationStripper.writeString(java.lang.String, java.util.List)': no suitable method found to override
那么,如何重写 writeString 方法以便我可以提取文本和位置?
由于无法重写 writeString 方法,我改用 processTextPosition 从 PDF 中提取单词及其位置。代码如下:
/// <summary>
/// Text stripper that assembles the characters it receives into
/// <see cref="PdfWord"/> objects. Words are indexed first by the character's
/// Y coordinate and then by the word's right edge (<see cref="PdfWord.Right"/>).
/// </summary>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>
    /// Largest distance between an existing word's right edge and a new
    /// character's X position at which the character is still appended to
    /// that word (same units as <c>getXDirAdj()</c>).
    /// </summary>
    private const float MaxJoinDistance = 1f;

    public string textWithPostion = "";

    /// <summary>
    /// Words collected so far: the outer key is the Y coordinate, the inner
    /// key is the word's right edge.
    /// </summary>
    public Dictionary<float, Dictionary<float, PdfWord>> pdfWordsByXByY;

    /// <summary>Creates a stripper with an empty word index.</summary>
    public PDFTextLocationStripper() : base()
    {
        textWithPostion = "";
        pdfWordsByXByY = new Dictionary<float, Dictionary<float, PdfWord>>();
    }

    /// <summary>
    /// Adds one character to the word index. Whitespace characters are
    /// skipped. A character whose X position matches (exactly, or within
    /// <see cref="MaxJoinDistance"/>) the right edge of a word on the same Y
    /// is appended to that word; otherwise it starts a new word.
    /// </summary>
    /// <param name="text">Character and its position data.</param>
    /// <remarks>
    /// NOTE(review): the base implementation is not called here; presumably
    /// the stripper's own text output is not needed - confirm.
    /// </remarks>
    protected override void processTextPosition(TextPosition text)
    {
        try
        {
            float textX = text.getXDirAdj();
            float textY = text.getYDirAdj();
            if (String.IsNullOrWhiteSpace(text.getCharacter()))
            {
                return;
            }

            Dictionary<float, PdfWord> wordsByX;
            if (!pdfWordsByXByY.TryGetValue(textY, out wordsByX))
            {
                // First character seen at this Y: start a new line with one word.
                PdfWord firstWord = StartWord(text, textX, textY);
                wordsByX = new Dictionary<float, PdfWord>();
                wordsByX.Add(firstWord.Right, firstWord);
                pdfWordsByXByY.Add(textY, wordsByX);
                return;
            }

            float nearestKey;
            if (wordsByX.ContainsKey(textX))
            {
                ExtendWord(wordsByX, textX, text, textX);
            }
            else if (TryFindNearestKey(wordsByX, textX, out nearestKey))
            {
                ExtendWord(wordsByX, nearestKey, text, textX);
            }
            else
            {
                PdfWord word = StartWord(text, textX, textY);
                // An already occupied right edge keeps its existing word.
                if (!wordsByX.ContainsKey(word.Right))
                {
                    wordsByX.Add(word.Right, word);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failure on one character must not abort the
            // extraction, but it is reported instead of being hidden.
            System.Diagnostics.Trace.TraceError("PDFTextLocationStripper.processTextPosition failed: " + ex);
        }
    }

    /// <summary>Creates a one-character word at the given position.</summary>
    private static PdfWord StartWord(TextPosition text, float textX, float textY)
    {
        PdfWord word = new PdfWord();
        word.Text = text.getCharacter();
        word.EndX = word.StartX = textX;
        word.Y = textY;
        word.EndCharWidth = word.StartCharWidth = text.getWidthDirAdj();
        word.Height = text.getHeightDir();
        return word;
    }

    /// <summary>
    /// Appends the character to the word stored under <paramref name="key"/>
    /// and re-files the word under its new right edge.
    /// </summary>
    private static void ExtendWord(Dictionary<float, PdfWord> wordsByX, float key, TextPosition text, float textX)
    {
        PdfWord word = wordsByX[key];
        // Remove under the key the word was found at, so no stale entry remains.
        wordsByX.Remove(key);
        word.EndCharWidth = text.getWidthDirAdj();
        word.Height = text.getHeightDir();
        word.EndX = textX;
        word.Text += text.getCharacter();
        // If another word already owns the new right edge, that word is kept.
        if (!wordsByX.ContainsKey(word.Right))
        {
            wordsByX.Add(word.Right, word);
        }
    }

    /// <summary>
    /// Finds the key closest to <paramref name="textX"/>. Returns true only
    /// when that key lies within <see cref="MaxJoinDistance"/>. On ties the
    /// first key in enumeration order wins.
    /// </summary>
    private static bool TryFindNearestKey(Dictionary<float, PdfWord> wordsByX, float textX, out float nearestKey)
    {
        nearestKey = 0f;
        float minDiff = float.MaxValue;
        foreach (float key in wordsByX.Keys)
        {
            float diff = Math.Abs(key - textX);
            if (diff < minDiff)
            {
                minDiff = diff;
                nearestKey = key;
            }
        }
        // With no keys minDiff stays at float.MaxValue, so no sentinel value
        // is needed and negative coordinates are handled like any other.
        return minDiff <= MaxJoinDistance;
    }
}
下面是 PdfWord 类:
/// <summary>
/// A word assembled character by character, together with the position
/// data the stripper in this file records for it.
/// </summary>
class PdfWord
{
    /// <summary>X position of the first character of the word.</summary>
    public float StartX { get; set; }

    /// <summary>X position of the most recently appended character.</summary>
    public float EndX { get; set; }

    /// <summary>Y position shared by the characters of the word.</summary>
    public float Y { get; set; }

    /// <summary>Width of the first character.</summary>
    public float StartCharWidth { get; set; }

    /// <summary>Width of the most recently appended character.</summary>
    public float EndCharWidth { get; set; }

    /// <summary>Height taken from the most recently appended character.</summary>
    public float Height { get; set; }

    /// <summary>Characters collected so far.</summary>
    public string Text { get; set; }

    /// <summary>Right edge of the word: <see cref="EndX"/> plus <see cref="EndCharWidth"/>.</summary>
    public float Right
    {
        get
        {
            float rightEdge = EndX + EndCharWidth;
            return rightEdge;
        }
    }
}