遍历 HTML Agility Pack 中的多个 HTML 表
Loop through multiple HTML tables in HTML Agility Pack
我按照下面 link 中的示例进行操作,并且能够将 HTML table 成功解析为数据 table.
http://blog.ditran.net/parsing-html-table-to-c-usable-datalist/
但是我无法解析多个 table。遍历 TR 时,每个 table 的第一个 TR 总是列名,其余的 TR 是数据。所以我使用这个逻辑,将 table 数据存储在字典中,并传给我的 ToDataTable 函数。
有人可以帮助我遍历多个 table 并实现相同的逻辑吗?非常感谢。
var tRowList = doc.DocumentNode.SelectNodes("//tr");
foreach (HtmlNode tRow in tRowList)
{
if (previousRowSpanList.Count > 0)
{
theDict = previousRowSpanList[0];
previousRowSpanList.Remove(theDict); //remove it off the list
isWorkingWithRowSpan = true;
}
else
{
theDict = new List<KeyValuePair<string, string>>();
isWorkingWithRowSpan = false;
}
var tCellList = tRow.SelectNodes("td|th");
tCelCount = tCellList.Count;
if (tCelCount > 0 &&
!(tCelCount == 1 && string.IsNullOrEmpty(tCellList[0].InnerText.Trim()))
)
{
//colOrder = 1;
IsNullEntireRow = true;
for (int colIndex = 0; colIndex < tCelCount; colIndex++)
{
cell = tCellList[colIndex];
ColInnerText = cell.InnerText.Replace(" ", " ").Trim();
if (!string.IsNullOrEmpty(ColInnerText))
IsNullEntireRow = false;
//
/// <summary>
/// Builds a <see cref="DataTable"/> from parsed rows: the first row supplies the column
/// names and every following row becomes a data row.
/// </summary>
/// <param name="list">
/// Parsed rows; each row is a list of pairs whose <c>Value</c> holds the cell text.
/// </param>
/// <returns>
/// The populated table, or an empty table when <paramref name="list"/> is null or has no rows.
/// </returns>
static DataTable ToDataTable(List<List<KeyValuePair<string, string>>> list)
{
    DataTable result = new DataTable();
    if (list == null || list.Count == 0)
    {
        return result;
    }

    // Only the Value of each pair is read; the Key is not used here.
    result.Columns.AddRange(
        list[0].Select(r => new DataColumn(r.Value)).ToArray()
    );

    // Everything after the header row is data; enumerate lazily instead of copying the list.
    foreach (var row in list.Skip(1))
    {
        result.Rows.Add(row.Select(c => c.Value).Cast<object>().ToArray());
    }
    return result;
}
样本HTML:
<table>
<tbody>
<tr><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Node</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Logtime</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Hardware</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Cluster</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">RAID</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">hcstart RESULT</td></tr>
<tr><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:40</td><td class="center">APG43L</td><td class="center">active</td><td class="center">passive</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td></tr>
<tr><td class="center">MSC9</td><td class="center">2016-08-26 16:40</td><td class="center">APG40C/4</td><td class="center">passive</td><td class="center">active</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td><td class="center">OK</td><td class="center">-</td></tr>
</tbody>
</table>
<table>
<tbody>
<tr><td style="background-color:#A9F5A9;" class="center">Node Type</td><td style="background-color:#A9F5A9;" class="center">Node</td><td style="background-color:#A9F5A9;" class="center">Log Time</td><td style="background-color:#A9F5A9;" class="center">New Mon. Alarms</td><td style="background-color:#A9F5A9;" class="center">Mon. Alarms Total</td><td style="background-color:#A9F5A9;" class="center">Other Alarms</td><td style="background-color:#A9F5A9;" class="center">MML</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">445</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">27</td><td class="center">609</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC1</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">45</td><td class="center">665</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">30</td><td class="center">849</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">CYMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">38</td><td class="center">283</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">201</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC2</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">31</td><td class="center">310</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">25</td><td class="center">130</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">16</td><td class="center">12</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR2</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">24</td><td class="center">10</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC10</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">79</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC9</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">131</td><td class="center">OK</td></tr>
</tbody>
</table>
既然你想解析多个 HTML table,你应该返回一个 DataSet,其中每个 HTML table 对应一个 DataTable。如果存在 table headers,下面的代码会将列名称添加到相应的 DataTable。HTML table 的 id 将用作 DataTable 的名称,您可以使用它直接从 DataSet 中访问对应的 table。
将 HTML tables 转换为 DataSet 的方法:
/// <summary>
/// Parses every table element found in an HTML string into its own <see cref="DataTable"/>
/// and returns them together in a <see cref="DataSet"/>.
/// </summary>
/// <param name="html">HTML markup to scan; may contain zero or more tables.</param>
/// <returns>
/// A DataSet with one DataTable per table element, each created with the element's id as
/// its name. The DataSet is empty when the input is null/blank or contains no tables.
/// </returns>
public static DataSet HtmlTablesToDataset(string html)
{
    var resultDataset = new DataSet();
    if (string.IsNullOrWhiteSpace(html))
    {
        return resultDataset;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // SelectNodes yields null rather than an empty collection when nothing matches.
    var tables = doc.DocumentNode.SelectNodes("//table");
    if (tables == null)
    {
        return resultDataset;
    }

    foreach (HtmlNode table in tables)
    {
        var resultTable = new DataTable(table.Id);

        // Rows can sit directly under the table or inside thead/tbody/tfoot; a bare "tr"
        // query only sees direct children. Listing the section paths explicitly avoids
        // descending into nested tables.
        var rows = table.SelectNodes("tr|thead/tr|tbody/tr|tfoot/tr");
        if (rows != null)
        {
            foreach (HtmlNode row in rows)
            {
                var headerCells = row.SelectNodes("th");
                if (headerCells != null)
                {
                    foreach (HtmlNode cell in headerCells)
                    {
                        resultTable.Columns.Add(cell.InnerText);
                    }
                }

                var dataCells = row.SelectNodes("td");
                if (dataCells != null)
                {
                    // A table without th headers (or a row wider than the header) would
                    // otherwise index past the last column; add auto-named columns first.
                    while (resultTable.Columns.Count < dataCells.Count)
                    {
                        resultTable.Columns.Add();
                    }

                    var dataRow = resultTable.NewRow();
                    for (int i = 0; i < dataCells.Count; i++)
                    {
                        dataRow[i] = dataCells[i].InnerText;
                    }
                    resultTable.Rows.Add(dataRow);
                }
            }
        }

        resultDataset.Tables.Add(resultTable);
    }
    return resultDataset;
}
测试代码:
// Print every table of the DataSet: its name, one line of column names,
// then one line per data row. Values are separated by single spaces and
// each printed line has its surrounding whitespace trimmed.
var resultDS = HtmlTablesToDataset(html);
foreach (DataTable dt in resultDS.Tables)
{
    Console.WriteLine("Table: " + dt.TableName);

    string header = string.Empty;
    for (int c = 0; c < dt.Columns.Count; c++)
    {
        header = header + dt.Columns[c].ToString() + " ";
    }
    Console.WriteLine(header.Trim());

    for (int r = 0; r < dt.Rows.Count; r++)
    {
        string text = string.Empty;
        for (int c = 0; c < dt.Columns.Count; c++)
        {
            text = text + dt.Rows[r][c].ToString() + " ";
        }
        Console.WriteLine(text.Trim());
    }
}
样本HTML:
// Sample HTML document: two tables (ids 't1' and 't2'), each with one th header row
// (Col1, Col2) followed by two td data rows.
string html =
@"
<html>
<head>
<title>Test</title>
</head>
<body>
<table id='t1'>
<tr>
<th>Col1</th>
<th>Col2</th>
</tr>
<tr>
<td>1</td>
<td>2</td>
</tr>
<tr>
<td>3</td>
<td>4</td>
</tr>
</table>
<table id='t2'>
<tr>
<th>Col1</th>
<th>Col2</th>
</tr>
<tr>
<td>5</td>
<td>6</td>
</tr>
<tr>
<td>7</td>
<td>8</td>
</tr>
</table>
</body>
</html>
";
我会保留第一个答案以供参考,但下面是一种将原始 HTML 拆分为字符串数组的方法,每个字符串元素包含一个 table 的 HTML:
/// <summary>
/// Splits an HTML string into the outer HTML of each table element it contains.
/// </summary>
/// <param name="htmlString">HTML markup to scan.</param>
/// <returns>
/// One string per matched table element; an empty array when the input is null/blank
/// or no table is found.
/// </returns>
public static string[] ParseHtmlSplitTables(string htmlString)
{
    // Nothing to parse: hand back an empty array.
    if (String.IsNullOrWhiteSpace(htmlString))
    {
        return new string[] { };
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(htmlString);

    var matches = document.DocumentNode.SelectNodes("//table");
    if (matches == null)
    {
        return new string[] { };
    }

    // Keep only the markup of each matched table, in document order.
    return matches.Select(node => node.OuterHtml).ToArray();
}
根据结果,您可以继续解析每个 table:
// Split the document into one HTML fragment per table, then run the
// single-table parser and the DataTable conversion on each fragment.
string[] htmlTables = ParseHtmlSplitTables(htmlString);
for (int t = 0; t < htmlTables.Length; t++)
{
    string html = htmlTables[t];
    List<List<KeyValuePair<string, string>>> parseResult = ParseHtmlToDataTable(html);
    DataTable dataTable = ToDataTable(parseResult);
}
我按照下面 link 中的示例进行操作,并且能够将 HTML table 成功解析为数据 table.
http://blog.ditran.net/parsing-html-table-to-c-usable-datalist/
但是我无法解析多个 table。遍历 TR 时,每个 table 的第一个 TR 总是列名,其余的 TR 是数据。所以我使用这个逻辑,将 table 数据存储在字典中,并传给我的 ToDataTable 函数。
有人可以帮助我遍历多个 table 并实现相同的逻辑吗?非常感谢。
var tRowList = doc.DocumentNode.SelectNodes("//tr");
foreach (HtmlNode tRow in tRowList)
{
if (previousRowSpanList.Count > 0)
{
theDict = previousRowSpanList[0];
previousRowSpanList.Remove(theDict); //remove it off the list
isWorkingWithRowSpan = true;
}
else
{
theDict = new List<KeyValuePair<string, string>>();
isWorkingWithRowSpan = false;
}
var tCellList = tRow.SelectNodes("td|th");
tCelCount = tCellList.Count;
if (tCelCount > 0 &&
!(tCelCount == 1 && string.IsNullOrEmpty(tCellList[0].InnerText.Trim()))
)
{
//colOrder = 1;
IsNullEntireRow = true;
for (int colIndex = 0; colIndex < tCelCount; colIndex++)
{
cell = tCellList[colIndex];
ColInnerText = cell.InnerText.Replace(" ", " ").Trim();
if (!string.IsNullOrEmpty(ColInnerText))
IsNullEntireRow = false;
//
/// <summary>
/// Builds a <see cref="DataTable"/> from parsed rows: the first row supplies the column
/// names and every following row becomes a data row.
/// </summary>
/// <param name="list">
/// Parsed rows; each row is a list of pairs whose <c>Value</c> holds the cell text.
/// </param>
/// <returns>
/// The populated table, or an empty table when <paramref name="list"/> is null or has no rows.
/// </returns>
static DataTable ToDataTable(List<List<KeyValuePair<string, string>>> list)
{
    DataTable result = new DataTable();
    if (list == null || list.Count == 0)
    {
        return result;
    }

    // Only the Value of each pair is read; the Key is not used here.
    result.Columns.AddRange(
        list[0].Select(r => new DataColumn(r.Value)).ToArray()
    );

    // Everything after the header row is data; enumerate lazily instead of copying the list.
    foreach (var row in list.Skip(1))
    {
        result.Rows.Add(row.Select(c => c.Value).Cast<object>().ToArray());
    }
    return result;
}
样本HTML:
<table>
<tbody>
<tr><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Node</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Logtime</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Hardware</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Prcstate B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">Cluster</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">RAID</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">AD replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication A</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">File replication B</td><td style="background-color:#A9F5A9;font-weight:bold;" class="center">hcstart RESULT</td></tr>
<tr><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:40</td><td class="center">APG43L</td><td class="center">active</td><td class="center">passive</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td class="center">-</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td></tr>
<tr><td class="center">MSC9</td><td class="center">2016-08-26 16:40</td><td class="center">APG40C/4</td><td class="center">passive</td><td class="center">active</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td class="center">OK</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">Not OK</td><td class="center">OK</td><td class="center">-</td></tr>
</tbody>
</table>
<table>
<tbody>
<tr><td style="background-color:#A9F5A9;" class="center">Node Type</td><td style="background-color:#A9F5A9;" class="center">Node</td><td style="background-color:#A9F5A9;" class="center">Log Time</td><td style="background-color:#A9F5A9;" class="center">New Mon. Alarms</td><td style="background-color:#A9F5A9;" class="center">Mon. Alarms Total</td><td style="background-color:#A9F5A9;" class="center">Other Alarms</td><td style="background-color:#A9F5A9;" class="center">MML</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">445</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">BMBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">27</td><td class="center">609</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC1</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">45</td><td class="center">665</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">CYBSC2C</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">30</td><td class="center">849</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">CYMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">38</td><td class="center">283</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">201</td><td class="center">OK</td></tr>
<tr><td class="center">BSC</td><td class="center">DTBSC2</td><td class="center">2016-08-26 16:45</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">1</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">31</td><td class="center">310</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-BC</td><td class="center">DTMSCB1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">25</td><td class="center">130</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR1</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">16</td><td class="center">12</td><td class="center">OK</td></tr>
<tr><td class="center">HLR</td><td class="center">HLR2</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">24</td><td class="center">10</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC10</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">48</td><td class="center">79</td><td class="center">OK</td></tr>
<tr><td class="center">MSC-S</td><td class="center">MSC9</td><td class="center">2016-08-26 16:45</td><td class="center">0</td><td style="background-color:#FF0000;color:#FFFFFF;font-weight:bold;" class="center">46</td><td class="center">131</td><td class="center">OK</td></tr>
</tbody>
</table>
既然你想解析多个 HTML table,你应该返回一个 DataSet,其中每个 HTML table 对应一个 DataTable。如果存在 table headers,下面的代码会将列名称添加到相应的 DataTable。HTML table 的 id 将用作 DataTable 的名称,您可以使用它直接从 DataSet 中访问对应的 table。
将 HTML tables 转换为 DataSet 的方法:
/// <summary>
/// Parses every table element found in an HTML string into its own <see cref="DataTable"/>
/// and returns them together in a <see cref="DataSet"/>.
/// </summary>
/// <param name="html">HTML markup to scan; may contain zero or more tables.</param>
/// <returns>
/// A DataSet with one DataTable per table element, each created with the element's id as
/// its name. The DataSet is empty when the input is null/blank or contains no tables.
/// </returns>
public static DataSet HtmlTablesToDataset(string html)
{
    var resultDataset = new DataSet();
    if (string.IsNullOrWhiteSpace(html))
    {
        return resultDataset;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // SelectNodes yields null rather than an empty collection when nothing matches.
    var tables = doc.DocumentNode.SelectNodes("//table");
    if (tables == null)
    {
        return resultDataset;
    }

    foreach (HtmlNode table in tables)
    {
        var resultTable = new DataTable(table.Id);

        // Rows can sit directly under the table or inside thead/tbody/tfoot; a bare "tr"
        // query only sees direct children. Listing the section paths explicitly avoids
        // descending into nested tables.
        var rows = table.SelectNodes("tr|thead/tr|tbody/tr|tfoot/tr");
        if (rows != null)
        {
            foreach (HtmlNode row in rows)
            {
                var headerCells = row.SelectNodes("th");
                if (headerCells != null)
                {
                    foreach (HtmlNode cell in headerCells)
                    {
                        resultTable.Columns.Add(cell.InnerText);
                    }
                }

                var dataCells = row.SelectNodes("td");
                if (dataCells != null)
                {
                    // A table without th headers (or a row wider than the header) would
                    // otherwise index past the last column; add auto-named columns first.
                    while (resultTable.Columns.Count < dataCells.Count)
                    {
                        resultTable.Columns.Add();
                    }

                    var dataRow = resultTable.NewRow();
                    for (int i = 0; i < dataCells.Count; i++)
                    {
                        dataRow[i] = dataCells[i].InnerText;
                    }
                    resultTable.Rows.Add(dataRow);
                }
            }
        }

        resultDataset.Tables.Add(resultTable);
    }
    return resultDataset;
}
测试代码:
// Print every table of the DataSet: its name, one line of column names,
// then one line per data row. Values are separated by single spaces and
// each printed line has its surrounding whitespace trimmed.
var resultDS = HtmlTablesToDataset(html);
foreach (DataTable dt in resultDS.Tables)
{
    Console.WriteLine("Table: " + dt.TableName);

    string header = string.Empty;
    for (int c = 0; c < dt.Columns.Count; c++)
    {
        header = header + dt.Columns[c].ToString() + " ";
    }
    Console.WriteLine(header.Trim());

    for (int r = 0; r < dt.Rows.Count; r++)
    {
        string text = string.Empty;
        for (int c = 0; c < dt.Columns.Count; c++)
        {
            text = text + dt.Rows[r][c].ToString() + " ";
        }
        Console.WriteLine(text.Trim());
    }
}
样本HTML:
// Sample HTML document: two tables (ids 't1' and 't2'), each with one th header row
// (Col1, Col2) followed by two td data rows.
string html =
@"
<html>
<head>
<title>Test</title>
</head>
<body>
<table id='t1'>
<tr>
<th>Col1</th>
<th>Col2</th>
</tr>
<tr>
<td>1</td>
<td>2</td>
</tr>
<tr>
<td>3</td>
<td>4</td>
</tr>
</table>
<table id='t2'>
<tr>
<th>Col1</th>
<th>Col2</th>
</tr>
<tr>
<td>5</td>
<td>6</td>
</tr>
<tr>
<td>7</td>
<td>8</td>
</tr>
</table>
</body>
</html>
";
我会保留第一个答案以供参考,但下面是一种将原始 HTML 拆分为字符串数组的方法,每个字符串元素包含一个 table 的 HTML:
/// <summary>
/// Splits an HTML string into the outer HTML of each table element it contains.
/// </summary>
/// <param name="htmlString">HTML markup to scan.</param>
/// <returns>
/// One string per matched table element; an empty array when the input is null/blank
/// or no table is found.
/// </returns>
public static string[] ParseHtmlSplitTables(string htmlString)
{
    // Nothing to parse: hand back an empty array.
    if (String.IsNullOrWhiteSpace(htmlString))
    {
        return new string[] { };
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(htmlString);

    var matches = document.DocumentNode.SelectNodes("//table");
    if (matches == null)
    {
        return new string[] { };
    }

    // Keep only the markup of each matched table, in document order.
    return matches.Select(node => node.OuterHtml).ToArray();
}
根据结果,您可以继续解析每个 table:
// Split the document into one HTML fragment per table, then run the
// single-table parser and the DataTable conversion on each fragment.
string[] htmlTables = ParseHtmlSplitTables(htmlString);
for (int t = 0; t < htmlTables.Length; t++)
{
    string html = htmlTables[t];
    List<List<KeyValuePair<string, string>>> parseResult = ParseHtmlToDataTable(html);
    DataTable dataTable = ToDataTable(parseResult);
}