如何使用 HtmlAgilityPack 对特定节点之间的所有内部文本进行上下文感知解析
How to use HtmlAgilityPack for context aware parsing of all of the inner text between specific nodes
我从外部 API 传入了以下 5 个 Html 片段,我无法控制现有格式。
我需要将 HTML 拆分为若干个具有以下结构的 Response 类对象:
/// <summary>
/// One chunk of stem text extracted from an HTML fragment, together with flags
/// describing where that text sits relative to the surrounding input elements.
/// </summary>
// NOTE(review): the type name looks like a misspelling of "Response" (the runnable
// sample further down in this file declares `Response`); it is left unchanged here
// so the declared name stays stable.
public class Resposne{
    /// <summary>Set to true only if there was an input detected after the last response was read.</summary>
    public bool isLeading {get;set;}

    /// <summary>Identifier of this response.</summary>
    public int ResponseId {get;set;}

    /// <summary>The stem text carried by this response.</summary>
    public string LeadingStem {get;set;}

    /// <summary>Set to true only if it's the final text node, and has no input after it.</summary>
    public bool isLagging{get;set;}
}
预期输出列在每个 HTML 块下方(请原谅其中的伪代码)。
HTML 1:
<tr style="height: 179px;">
<td style="width: 421px; height: 179px;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../../Content/uploads/319e1bf6-9b79-4009-8108-c1d2ed77ffbb/85c232e7-33e8-49bb-acbe-374f5ad49361.jpg" alt="" width="400" height="209" /></td>
<td style="width: 692px; height: 179px; vertical-align: top;">
<div style="color: #626262;">
<span style="color: #626262;">The United States </span>
<input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="Constitution" data-id="0" />
<span style="color: #626262;"> has 27 amendments, of which the first </span>
<input id="response[1]_box" class="response" style="border: 2px double #3cb44b;" contenteditable="false" disabled="disabled" name="response[1]_box" type="text" value="10" data-id="1" />
<input id="response[1]_box" style="border: 2px double #3cb44b;" contenteditable="false" disabled="disabled" name="response[1]_box" type="text" value="BAD INPUT BAD INPUT" data-id="1" />
<span style="color: #626262;"> amendments are called the Bill of Rights. The purpose of the Bill of Rights is to provide specific freedoms to citizens and limit the power of the </span>
<input id="response[2]_box" class="response" style="border: 2px double #e6194b;" contenteditable="false" disabled="disabled" name="response[2]_box" type="text" value="government" data-id="2" />
<span style="color: #626262;">.</span></div>
<br style="color: #626262;" /><span style="color: #626262;">The United States Bill of Rights plays a central role in American law and government, and remains a fundamental symbol of the freedoms and culture of the nation. One of the original fourteen copies of the U.S. Bill of Rights is on public display at the National </span><input id="response[3]_box" class="response" style="border: 2px double #ffe119;" contenteditable="false" disabled="disabled" name="response[3]_box" type="text" value="Archives" data-id="3" /><span style="color: #626262;"> in Washington, D.C.</span></td>
</tr>
回复 1:
{false, 1, "The United States", false}
{true, 2, "has 27 amendments, of which the first", false}
{true, 3, "amendments are called the Bill of Rights. The purpose of the Bill of Rights is to provide specific freedoms to citizens and limit the power of the", false}
{true, 4, "The United States Bill of Rights plays a central role in American law and government, and remains a fundamental symbol of the freedoms and culture of the nation. One of the original fourteen copies of the U.S. Bill of Rights is on public display at the National", false}
{true, 5, "in Washington, D.C.", true}
HTML 2:
South <input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" /> is the largest province in the Southern Hemisphere.
回复 2:
{false, 1, "South", false}
{true, 2, "is the largest province in the Southern Hemisphere", true}
HTML 3:
<input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" /> Smith
回复 3:
{true, 1, "Smith", true}
HTML 4:
<span>John <input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" />
planted apples.</span>
回复 4:
{false, 1, "John", false}
{true, 2, "Planted Apples", true}
HTML 5:
Smith <input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" />
回复 5:
{false, 1, "Smith", false}
最初,我查找的是所有 span 节点,这对最初的 HTML blob 来说是可行的;但随着新需求的到来,这种做法很快就行不通了,因为它对 HTML blob 的结构做了太多假设。
让我费尽心思的是如何重新构建 HtmlAgilityPack 搜索。部分原因是我们需要知道节点在 HTML blob 中的位置,并且如果输入节点之间有多个文本节点,我们还需要能够组合文本节点。
我认为有意义的是搜索每个输入节点。
获取输入节点之前的所有内部文本(如果有)并将其作为响应。
然后转到下一个输入节点,获取它与前一个输入节点之间的所有内部文本,如此循环往复。
然后对最后一个输入后的最后一个节点的内部文本进行最终搜索,并创建一个响应(如果存在)。
/// <summary>
/// Loads a sample HTML fragment, builds one <c>Response</c> per &lt;span&gt; node and
/// echoes the values of the matching answer inputs to the console.
/// </summary>
void Main()
{
    HtmlDocument doc = new HtmlDocument();
    var htmlToLoad = "Short <input id=\"response[0]_box\" class=\"response\" style=\"border: 2px double #4363d8;\" contenteditable=\"false\" disabled=\"disabled\" name=\"response[0]_box\" type=\"text\" value=\"FIB\" data-id=\"0\" /> item";
    doc.LoadHtml (htmlToLoad);
    var tableNode = doc.DocumentNode.SelectNodes ("//span");
    var answerNodes = doc.DocumentNode.SelectNodes ("//input[@class='response']");
    if (tableNode == null)
    {
        // The fragment has no <span> at all: wrap everything in one so the loop
        // below always has at least one node to read text from.
        htmlToLoad = string.Format ("<span> {0} </span>", htmlToLoad);
        doc.LoadHtml (htmlToLoad);
        tableNode = doc.DocumentNode.SelectNodes ("//span");
    }

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so treat a missing answer list as "zero answers" instead of dereferencing it.
    var answerCount = answerNodes == null ? 0 : answerNodes.Count;

    var responses = new List<Response>();
    for (var i = 0; i < tableNode.Count; i++)
    {
        var response = new Response
        {
            isLeading = false,
            ResponseId = i,
            LeadingStem = tableNode [i].InnerText,
            isLagging = false
        };
        responses.Add (response);

        if (i == tableNode.Count - 2)
        {
            // Second-to-last span: consume the final span here as the lagging response.
            ++i;

            // i - 2 is negative when there are only two spans; skip the echo in that
            // case rather than throwing on an out-of-range index.
            var answerIndex = i - 2;
            if (answerIndex >= 0 && answerIndex < answerCount)
            {
                Console.WriteLine (answerNodes [answerIndex].GetAttributeValue ("value", ""));
            }
            Console.WriteLine (tableNode [i].InnerText);

            var laggingResponse = new Response
            {
                isLeading = true,
                ResponseId = i,
                LeadingStem = tableNode [i].InnerText,
                isLagging = true
            };
            // Previously this object was built and then dropped; keep it with the rest.
            responses.Add (laggingResponse);
        }
        else if (i < answerCount - 1)
        {
            Console.WriteLine (answerNodes [i].GetAttributeValue ("value", ""));
        }
    }
}
/// <summary>
/// One piece of stem text produced while splitting an HTML fragment, with flags
/// recording its position relative to the input elements.
/// </summary>
public class Response
{
/// <summary>Set to true only if there was an input detected after the last response was read.</summary>
public bool isLeading { get; set; }
/// <summary>Identifier assigned to this response.</summary>
public int ResponseId { get; set; }
/// <summary>The stem text carried by this response.</summary>
public string LeadingStem { get; set; }
/// <summary>Set to true only if it's the final text node, and has no input after it.</summary>
public bool isLagging { get; set; }
}
事实证明,使用正则表达式遍历文档,并在需要时用 HtmlAgilityPack 读取属性,要容易得多。这样也可以把 Response 类改造得比之前的实现简单得多。
// Cut the raw HTML into "stems": the text that sits before each <input> tag, paired
// with that tag's data-id. A stem that has no associated input carries the id -1.
var pattern = @"(<input)(.*?)(>)";
// Singleline lets '.' cross line breaks so an <input> tag wrapped over several lines
// is still matched; Multiline only affects ^ and $, which this pattern does not use.
var options = RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;
var input = item.GetHtmlInput();
var matStems = new Queue<Tuple<string, int>>();
var matches = Regex.Matches(input, pattern, options);
var currentStringPosition = 0;
foreach (Match m in matches)
{
    if (m.Index == 0)
    {
        // The HTML opens with an input: emit an empty leading stem first.
        var leadingStem = new Tuple<string, int>("", -1);
        matStems.Enqueue(leadingStem);
    }

    // Parse just the matched tag so its attributes can be read reliably.
    var doc = new HtmlDocument();
    doc.LoadHtml(m.Value);

    // SelectNodes returns null when nothing matches, and the attribute indexer returns
    // null for a missing attribute; fall back to -1 instead of throwing in either case,
    // or when data-id is not a valid integer.
    var inputNodes = doc.DocumentNode.SelectNodes("//input");
    HtmlNode inputElement = inputNodes == null ? null : inputNodes.LastOrDefault();
    HtmlAttribute dataIdAttribute = inputElement == null ? null : inputElement.Attributes["data-id"];
    int inputResponseId;
    if (dataIdAttribute == null || !int.TryParse(dataIdAttribute.Value, out inputResponseId))
    {
        inputResponseId = -1;
    }

    var endingIndex = m.Index + m.Length;
    var stemLength = m.Index - currentStringPosition;
    var stemContent = input.Substring(currentStringPosition, stemLength);
    var stemToAdd = new Tuple<string, int>(stemContent, inputResponseId);
    matStems.Enqueue(stemToAdd);
    currentStringPosition = endingIndex;

    if (endingIndex != input.Length) continue;

    // The HTML ends exactly on this input: emit an empty lagging stem.
    var laggingStem = new Tuple<string, int>("", -1);
    matStems.Enqueue(laggingStem);
}
if (currentStringPosition < input.Length)
{
    // Whatever text follows the last input becomes a final stem with no response.
    var stemContent = input.Substring(currentStringPosition);
    var stemToAdd = new Tuple<string, int>(stemContent, -1);
    matStems.Enqueue(stemToAdd);
}
然后可以按如下方式使用队列中的项目:
// Drain the stem queue in document order, writing each stem followed by the
// response it belongs to (if any).
while (matStems.Any())
{
    // Dequeue from the same queue the loop condition tests; the previous code
    // referenced an undefined `MaterialStems`, so `matStems` was never emptied.
    var stem = matStems.Dequeue();
    item.WriteStem(stem.Item1);

    // Item2 is -1 for stems with no associated input; ElementAtOrDefault yields the
    // default value for an out-of-range index, so those stems write no response.
    // NOTE(review): assumes GetResponses() returns a sequence of reference types — confirm.
    var response = item.GetResponses().ElementAtOrDefault(stem.Item2);
    if (response != null) item.WriteResponse(response);
}
我从外部 API 传入了以下 5 个 Html 片段,我无法控制现有格式。
我需要将 HTML 拆分为若干个具有以下结构的 Response 类对象:
/// <summary>
/// One chunk of stem text extracted from an HTML fragment, together with flags
/// describing where that text sits relative to the surrounding input elements.
/// </summary>
// NOTE(review): the type name looks like a misspelling of "Response" (the runnable
// sample further down in this file declares `Response`); it is left unchanged here
// so the declared name stays stable.
public class Resposne{
    /// <summary>Set to true only if there was an input detected after the last response was read.</summary>
    public bool isLeading {get;set;}

    /// <summary>Identifier of this response.</summary>
    public int ResponseId {get;set;}

    /// <summary>The stem text carried by this response.</summary>
    public string LeadingStem {get;set;}

    /// <summary>Set to true only if it's the final text node, and has no input after it.</summary>
    public bool isLagging{get;set;}
}
预期输出列在每个 HTML 块下方(请原谅其中的伪代码)。
HTML 1:
<tr style="height: 179px;">
<td style="width: 421px; height: 179px;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../../Content/uploads/319e1bf6-9b79-4009-8108-c1d2ed77ffbb/85c232e7-33e8-49bb-acbe-374f5ad49361.jpg" alt="" width="400" height="209" /></td>
<td style="width: 692px; height: 179px; vertical-align: top;">
<div style="color: #626262;">
<span style="color: #626262;">The United States </span>
<input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="Constitution" data-id="0" />
<span style="color: #626262;"> has 27 amendments, of which the first </span>
<input id="response[1]_box" class="response" style="border: 2px double #3cb44b;" contenteditable="false" disabled="disabled" name="response[1]_box" type="text" value="10" data-id="1" />
<input id="response[1]_box" style="border: 2px double #3cb44b;" contenteditable="false" disabled="disabled" name="response[1]_box" type="text" value="BAD INPUT BAD INPUT" data-id="1" />
<span style="color: #626262;"> amendments are called the Bill of Rights. The purpose of the Bill of Rights is to provide specific freedoms to citizens and limit the power of the </span>
<input id="response[2]_box" class="response" style="border: 2px double #e6194b;" contenteditable="false" disabled="disabled" name="response[2]_box" type="text" value="government" data-id="2" />
<span style="color: #626262;">.</span></div>
<br style="color: #626262;" /><span style="color: #626262;">The United States Bill of Rights plays a central role in American law and government, and remains a fundamental symbol of the freedoms and culture of the nation. One of the original fourteen copies of the U.S. Bill of Rights is on public display at the National </span><input id="response[3]_box" class="response" style="border: 2px double #ffe119;" contenteditable="false" disabled="disabled" name="response[3]_box" type="text" value="Archives" data-id="3" /><span style="color: #626262;"> in Washington, D.C.</span></td>
</tr>
回复 1:
{false, 1, "The United States", false}
{true, 2, "has 27 amendments, of which the first", false}
{true, 3, "amendments are called the Bill of Rights. The purpose of the Bill of Rights is to provide specific freedoms to citizens and limit the power of the", false}
{true, 4, "The United States Bill of Rights plays a central role in American law and government, and remains a fundamental symbol of the freedoms and culture of the nation. One of the original fourteen copies of the U.S. Bill of Rights is on public display at the National", false}
{true, 5, "in Washington, D.C.", true}
HTML 2:
South <input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" /> is the largest province in the Southern Hemisphere.
回复 2:
{false, 1, "South", false}
{true, 2, "is the largest province in the Southern Hemisphere", true}
HTML 3:
<input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" /> Smith
回复 3:
{true, 1, "Smith", true}
HTML 4:
<span>John <input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" />
planted apples.</span>
回复 4:
{false, 1, "John", false}
{true, 2, "Planted Apples", true}
HTML 5:
Smith <input id="response[0]_box" class="response" style="border: 2px double #4363d8;" contenteditable="false" disabled="disabled" name="response[0]_box" type="text" value="FIB" data-id="0" />
回复 5:
{false, 1, "Smith", false}
最初,我查找的是所有 span 节点,这对最初的 HTML blob 来说是可行的;但随着新需求的到来,这种做法很快就行不通了,因为它对 HTML blob 的结构做了太多假设。
让我费尽心思的是如何重新构建 HtmlAgilityPack 搜索。部分原因是我们需要知道节点在 HTML blob 中的位置,并且如果输入节点之间有多个文本节点,我们还需要能够组合文本节点。
我认为有意义的是搜索每个输入节点。
获取输入节点之前的所有内部文本(如果有)并将其作为响应。
然后转到下一个输入节点,获取它与前一个输入节点之间的所有内部文本,如此循环往复。
然后对最后一个输入后的最后一个节点的内部文本进行最终搜索,并创建一个响应(如果存在)。
/// <summary>
/// Loads a sample HTML fragment, builds one <c>Response</c> per &lt;span&gt; node and
/// echoes the values of the matching answer inputs to the console.
/// </summary>
void Main()
{
    HtmlDocument doc = new HtmlDocument();
    var htmlToLoad = "Short <input id=\"response[0]_box\" class=\"response\" style=\"border: 2px double #4363d8;\" contenteditable=\"false\" disabled=\"disabled\" name=\"response[0]_box\" type=\"text\" value=\"FIB\" data-id=\"0\" /> item";
    doc.LoadHtml (htmlToLoad);
    var tableNode = doc.DocumentNode.SelectNodes ("//span");
    var answerNodes = doc.DocumentNode.SelectNodes ("//input[@class='response']");
    if (tableNode == null)
    {
        // The fragment has no <span> at all: wrap everything in one so the loop
        // below always has at least one node to read text from.
        htmlToLoad = string.Format ("<span> {0} </span>", htmlToLoad);
        doc.LoadHtml (htmlToLoad);
        tableNode = doc.DocumentNode.SelectNodes ("//span");
    }

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so treat a missing answer list as "zero answers" instead of dereferencing it.
    var answerCount = answerNodes == null ? 0 : answerNodes.Count;

    var responses = new List<Response>();
    for (var i = 0; i < tableNode.Count; i++)
    {
        var response = new Response
        {
            isLeading = false,
            ResponseId = i,
            LeadingStem = tableNode [i].InnerText,
            isLagging = false
        };
        responses.Add (response);

        if (i == tableNode.Count - 2)
        {
            // Second-to-last span: consume the final span here as the lagging response.
            ++i;

            // i - 2 is negative when there are only two spans; skip the echo in that
            // case rather than throwing on an out-of-range index.
            var answerIndex = i - 2;
            if (answerIndex >= 0 && answerIndex < answerCount)
            {
                Console.WriteLine (answerNodes [answerIndex].GetAttributeValue ("value", ""));
            }
            Console.WriteLine (tableNode [i].InnerText);

            var laggingResponse = new Response
            {
                isLeading = true,
                ResponseId = i,
                LeadingStem = tableNode [i].InnerText,
                isLagging = true
            };
            // Previously this object was built and then dropped; keep it with the rest.
            responses.Add (laggingResponse);
        }
        else if (i < answerCount - 1)
        {
            Console.WriteLine (answerNodes [i].GetAttributeValue ("value", ""));
        }
    }
}
/// <summary>
/// One piece of stem text produced while splitting an HTML fragment, with flags
/// recording its position relative to the input elements.
/// </summary>
public class Response
{
/// <summary>Set to true only if there was an input detected after the last response was read.</summary>
public bool isLeading { get; set; }
/// <summary>Identifier assigned to this response.</summary>
public int ResponseId { get; set; }
/// <summary>The stem text carried by this response.</summary>
public string LeadingStem { get; set; }
/// <summary>Set to true only if it's the final text node, and has no input after it.</summary>
public bool isLagging { get; set; }
}
事实证明,使用正则表达式遍历文档,并在需要时用 HtmlAgilityPack 读取属性,要容易得多。这样也可以把 Response 类改造得比之前的实现简单得多。
// Cut the raw HTML into "stems": the text that sits before each <input> tag, paired
// with that tag's data-id. A stem that has no associated input carries the id -1.
var pattern = @"(<input)(.*?)(>)";
// Singleline lets '.' cross line breaks so an <input> tag wrapped over several lines
// is still matched; Multiline only affects ^ and $, which this pattern does not use.
var options = RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;
var input = item.GetHtmlInput();
var matStems = new Queue<Tuple<string, int>>();
var matches = Regex.Matches(input, pattern, options);
var currentStringPosition = 0;
foreach (Match m in matches)
{
    if (m.Index == 0)
    {
        // The HTML opens with an input: emit an empty leading stem first.
        var leadingStem = new Tuple<string, int>("", -1);
        matStems.Enqueue(leadingStem);
    }

    // Parse just the matched tag so its attributes can be read reliably.
    var doc = new HtmlDocument();
    doc.LoadHtml(m.Value);

    // SelectNodes returns null when nothing matches, and the attribute indexer returns
    // null for a missing attribute; fall back to -1 instead of throwing in either case,
    // or when data-id is not a valid integer.
    var inputNodes = doc.DocumentNode.SelectNodes("//input");
    HtmlNode inputElement = inputNodes == null ? null : inputNodes.LastOrDefault();
    HtmlAttribute dataIdAttribute = inputElement == null ? null : inputElement.Attributes["data-id"];
    int inputResponseId;
    if (dataIdAttribute == null || !int.TryParse(dataIdAttribute.Value, out inputResponseId))
    {
        inputResponseId = -1;
    }

    var endingIndex = m.Index + m.Length;
    var stemLength = m.Index - currentStringPosition;
    var stemContent = input.Substring(currentStringPosition, stemLength);
    var stemToAdd = new Tuple<string, int>(stemContent, inputResponseId);
    matStems.Enqueue(stemToAdd);
    currentStringPosition = endingIndex;

    if (endingIndex != input.Length) continue;

    // The HTML ends exactly on this input: emit an empty lagging stem.
    var laggingStem = new Tuple<string, int>("", -1);
    matStems.Enqueue(laggingStem);
}
if (currentStringPosition < input.Length)
{
    // Whatever text follows the last input becomes a final stem with no response.
    var stemContent = input.Substring(currentStringPosition);
    var stemToAdd = new Tuple<string, int>(stemContent, -1);
    matStems.Enqueue(stemToAdd);
}
然后可以按如下方式使用队列中的项目:
// Drain the stem queue in document order, writing each stem followed by the
// response it belongs to (if any).
while (matStems.Any())
{
    // Dequeue from the same queue the loop condition tests; the previous code
    // referenced an undefined `MaterialStems`, so `matStems` was never emptied.
    var stem = matStems.Dequeue();
    item.WriteStem(stem.Item1);

    // Item2 is -1 for stems with no associated input; ElementAtOrDefault yields the
    // default value for an out-of-range index, so those stems write no response.
    // NOTE(review): assumes GetResponses() returns a sequence of reference types — confirm.
    var response = item.GetResponses().ElementAtOrDefault(stem.Item2);
    if (response != null) item.WriteResponse(response);
}