如何在 C# 中从 Word 文档(.doc 或 .docx)中获取两个标题之间的所有文本
How to get all text between two Headings from a word Document .doc or .docx in c#
如何获取两个标题之间的所有文本,或某个特定标题下的文本?例如:
"Heading ABC"
"Heading XYZ"
这是 XYZ 标题下的内容
测试..
"Sub heading or heading 2 of XYZ"
XYZ 标题继续
"Heading 123"
标题 123 下的内容
我想获取 XYZ 标题下的所有内容(包括子标题),直到下一个标题 "Heading 123" 出现为止。我该如何找到该特定标题,然后在 C# 中获取该标题下的所有内容?文件可以是 .doc 或 .docx。
您可以使用 NPOI 库来读取 Word 文档。下面是一些可以帮助您入门的示例代码:
/// <summary>
/// Reads a Word document through NPOI and returns its text, one trimmed paragraph per line.
/// </summary>
/// <param name="fileName">Path of the Word file to read.</param>
/// <returns>
/// Every paragraph reported by the extractor, trimmed of surrounding whitespace and
/// followed by a line break.
/// </returns>
/// <remarks>
/// NOTE(review): HWPFDocument presumably handles only the binary .doc format;
/// confirm before passing .docx files.
/// </remarks>
public string ReadAllTextFromWordDocFile(string fileName)
{
    // A Word file is binary data, so open it as a raw read-only stream
    // instead of wrapping it in a text reader just to reach BaseStream.
    using (FileStream stream = File.OpenRead(fileName))
    {
        var document = new HWPFDocument(stream);
        var wordExtractor = new WordExtractor(document);
        var docText = new StringBuilder();
        foreach (string text in wordExtractor.ParagraphText)
        {
            docText.AppendLine(text.Trim());
        }

        // No explicit Close(): the using block disposes the stream on every exit path.
        return docText.ToString();
    }
}
可以自己多尝试一下。
你也可以看看 DocX 库(可参考其基本示例)。每个段落的 MagicText 属性可能有助于您识别标题。
/// <summary>
/// Opens a Word document through Office interop, collects the section that starts at the
/// heading paragraph matching <paramref name="headingText"/>, and shows it in richTextBox1.
/// </summary>
/// <param name="fileLocation">Path of the document to open (opened read-only).</param>
/// <param name="headingText">Heading text to look for; compared case-insensitively.</param>
/// <param name="headingStyle">
/// Local style name that marks a section boundary. Collection starts at a paragraph with
/// this style whose text matches, and stops at the next paragraph with this style.
/// Paragraphs using any other style (for example sub-headings) are kept.
/// </param>
/// <remarks>
/// The matching heading paragraph itself is part of the output. When nothing matches,
/// richTextBox1 shows "No such data found!".
/// </remarks>
private void DocReader(string fileLocation,string headingText, string headingStyle)
{
    Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.Application();
    Microsoft.Office.Interop.Word.Document docs = null;
    try
    {
        object miss = System.Reflection.Missing.Value;
        object path = fileLocation;
        object readOnly = true;
        docs = word.Documents.Open(ref path, ref miss, ref readOnly, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss);

        System.Text.StringBuilder collected = new System.Text.StringBuilder();
        bool insideSection = false;
        int paraCount = docs.Paragraphs.Count;

        // Word collections are 1-based, so the last paragraph lives at index paraCount.
        for (int i = 1; i <= paraCount; i++)
        {
            // Fetch the paragraph once; every indexer access is a separate COM call.
            Microsoft.Office.Interop.Word.Paragraph paragraph = docs.Paragraphs[i];
            string paragraphText = paragraph.Range.Text ?? string.Empty;

            Microsoft.Office.Interop.Word.Style style = paragraph.get_Style() as Microsoft.Office.Interop.Word.Style;
            if (style != null && style.NameLocal.Equals(headingStyle))
            {
                // Any paragraph with the boundary style either starts the wanted section or ends it.
                // The static Equals tolerates a null headingText and avoids ToUpper() allocations.
                insideSection = string.Equals(
                    paragraphText.TrimEnd('\r'),
                    headingText,
                    System.StringComparison.CurrentCultureIgnoreCase);
            }

            if (insideSection)
            {
                collected.Append(" \r\n ").Append(paragraphText);
            }
        }

        richTextBox1.Text = collected.Length == 0 ? "No such data found!" : collected.ToString();
    }
    finally
    {
        // Always shut Word down, otherwise a failure leaves an orphaned WINWORD.EXE process.
        if (docs != null)
        {
            docs.Close();
        }
        word.Quit();
    }
}
如何获取两个标题之间的所有文本,或某个特定标题下的文本?例如:
"Heading ABC"
"Heading XYZ"
这是 XYZ 标题下的内容
测试..
"Sub heading or heading 2 of XYZ"
XYZ 标题继续
"Heading 123"
标题 123 下的内容
我想获取 XYZ 标题下的所有内容(包括子标题),直到下一个标题 "Heading 123" 出现为止。我该如何找到该特定标题,然后在 C# 中获取该标题下的所有内容?文件可以是 .doc 或 .docx。
您可以使用 NPOI 库来读取 Word 文档。下面是一些可以帮助您入门的示例代码:
/// <summary>
/// Reads a Word document through NPOI and returns its text, one trimmed paragraph per line.
/// </summary>
/// <param name="fileName">Path of the Word file to read.</param>
/// <returns>
/// Every paragraph reported by the extractor, trimmed of surrounding whitespace and
/// followed by a line break.
/// </returns>
/// <remarks>
/// NOTE(review): HWPFDocument presumably handles only the binary .doc format;
/// confirm before passing .docx files.
/// </remarks>
public string ReadAllTextFromWordDocFile(string fileName)
{
    // A Word file is binary data, so open it as a raw read-only stream
    // instead of wrapping it in a text reader just to reach BaseStream.
    using (FileStream stream = File.OpenRead(fileName))
    {
        var document = new HWPFDocument(stream);
        var wordExtractor = new WordExtractor(document);
        var docText = new StringBuilder();
        foreach (string text in wordExtractor.ParagraphText)
        {
            docText.AppendLine(text.Trim());
        }

        // No explicit Close(): the using block disposes the stream on every exit path.
        return docText.ToString();
    }
}
可以自己多尝试一下。
你也可以看看 DocX 库(可参考其基本示例)。每个段落的 MagicText 属性可能有助于您识别标题。
/// <summary>
/// Opens a Word document through Office interop, collects the section that starts at the
/// heading paragraph matching <paramref name="headingText"/>, and shows it in richTextBox1.
/// </summary>
/// <param name="fileLocation">Path of the document to open (opened read-only).</param>
/// <param name="headingText">Heading text to look for; compared case-insensitively.</param>
/// <param name="headingStyle">
/// Local style name that marks a section boundary. Collection starts at a paragraph with
/// this style whose text matches, and stops at the next paragraph with this style.
/// Paragraphs using any other style (for example sub-headings) are kept.
/// </param>
/// <remarks>
/// The matching heading paragraph itself is part of the output. When nothing matches,
/// richTextBox1 shows "No such data found!".
/// </remarks>
private void DocReader(string fileLocation,string headingText, string headingStyle)
{
    Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.Application();
    Microsoft.Office.Interop.Word.Document docs = null;
    try
    {
        object miss = System.Reflection.Missing.Value;
        object path = fileLocation;
        object readOnly = true;
        docs = word.Documents.Open(ref path, ref miss, ref readOnly, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss);

        System.Text.StringBuilder collected = new System.Text.StringBuilder();
        bool insideSection = false;
        int paraCount = docs.Paragraphs.Count;

        // Word collections are 1-based, so the last paragraph lives at index paraCount.
        for (int i = 1; i <= paraCount; i++)
        {
            // Fetch the paragraph once; every indexer access is a separate COM call.
            Microsoft.Office.Interop.Word.Paragraph paragraph = docs.Paragraphs[i];
            string paragraphText = paragraph.Range.Text ?? string.Empty;

            Microsoft.Office.Interop.Word.Style style = paragraph.get_Style() as Microsoft.Office.Interop.Word.Style;
            if (style != null && style.NameLocal.Equals(headingStyle))
            {
                // Any paragraph with the boundary style either starts the wanted section or ends it.
                // The static Equals tolerates a null headingText and avoids ToUpper() allocations.
                insideSection = string.Equals(
                    paragraphText.TrimEnd('\r'),
                    headingText,
                    System.StringComparison.CurrentCultureIgnoreCase);
            }

            if (insideSection)
            {
                collected.Append(" \r\n ").Append(paragraphText);
            }
        }

        richTextBox1.Text = collected.Length == 0 ? "No such data found!" : collected.ToString();
    }
    finally
    {
        // Always shut Word down, otherwise a failure leaves an orphaned WINWORD.EXE process.
        if (docs != null)
        {
            docs.Close();
        }
        word.Quit();
    }
}