使用敏捷包清理未知数量的后代不起作用
Sanitizing unknown number of descendants with agility pack doesn't work
下面这段代码的目的是能够接受来自客户端的可能包含 HTML 的字符串,并删除样式、脚本、某些标签并将 H 标签替换为 B 标签。
private IDictionary<string, string[]> Whitelist;
public vacatures PostPutVacancy(vacancy vacancy)
{
//List of allowed tags
Whitelist = new Dictionary<string, string[]> {
{ "p", null },
{ "ul", null },
{ "li", null },
{ "br", null },
{ "b", null },
{ "table", null },
{ "tr", null },
{ "th", null },
{ "td", null },
{ "strong", null }
};
foreach (var item in vacancy.GetType().GetProperties())
{
if (vacancy.GetType().GetProperty(item.Name).PropertyType.FullName.Contains("String"))
{
var value = item.GetValue(vacancy, null);
if (value != null)
{
item.SetValue(vacancy, CallSanitizers(item.GetValue(vacancy, null)));
var test1 = item.GetValue(vacancy);
}
}
}
return vacancy;
}
private List<string> hList = new List<string>
{
{ "h1"},
{ "h2"},
{ "h3"},
{ "h4"},
{ "h5"},
{ "h6"}
};
private string CallSanitizers(object obj)//==Sanitize()
{
string str = obj.ToString();
if (str != HttpUtility.HtmlEncode(str))
{
doc.LoadHtml(str);
SanitizeNode(doc.DocumentNode);
string test = doc.DocumentNode.WriteTo().Trim();
return doc.DocumentNode.WriteTo().Trim();
}
else
{
return str;
}
}
private void SanitizeChildren(HtmlNode parentNode)
{
for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
{
SanitizeNode(parentNode.ChildNodes[i]);
}
}
private void SanitizeNode(HtmlNode node)
{
if (node.NodeType == HtmlNodeType.Element)
{
if (!Whitelist.ContainsKey(node.Name))
{
if (hList.Contains(node.Name))
{
HtmlNode b = doc.CreateElement("b");
b.InnerHtml = node.InnerHtml;
node.ParentNode.ReplaceChild(b, node);
}
else
{
node.ParentNode.RemoveChild(node, true);
}
}
if (node.HasAttributes)
{
for (int i = node.Attributes.Count - 1; i >= 0; i--)
{
HtmlAttribute currentAttribute = node.Attributes[i];
node.Attributes.Remove(currentAttribute);
}
}
}
if (node.HasChildNodes)
{
SanitizeChildren(node);
}
}
它可以工作,但有一个问题,子节点的子节点没有被清理,请参见示例。
输入:
"Lorem ipsum<h1 style='font-size:38px;'><p style='font-size:38px;'>dolor sit</p></h1> amet <h1 style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></h1>"
结果:
"Lorem ipsum<b><p style='font-size:38px;'>dolor sit</p></b> amet <b style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></b>"
问题一定是由于无法将子项放回已更改的父项,因为由于标记类型的更改,父项不再被识别。
有人知道如何解决这个问题吗?
如果问题不清楚或表述不当,请post发表评论。
提前致谢
这修复了它
private string CallSanitizers(string str)
{
if (str != HttpUtility.HtmlEncode(str))
{
doc.LoadHtml(str);
str = Sanitizers();
return doc.DocumentNode.WriteTo().Trim();
}
else
{
return str;
}
}
private string Sanitizers()
{
doc.DocumentNode.Descendants().Where(l => l.Name == "script" || l.Name == "style").ToList().ForEach(l => l.Remove());
doc.DocumentNode.Descendants().Where(l => hList.Contains(l.Name)).ToList().ForEach(l => l.Name = "b");
doc.DocumentNode.Descendants().Where(l => l.Attributes != null).ToList().ForEach(l => l.Attributes.ToList().ForEach(a => a.Remove()));
doc.DocumentNode.Descendants().Where(l => !Whitelist.Contains(l.Name) && l.NodeType == HtmlNodeType.Element).ToList().ForEach(l => l.ParentNode.RemoveChild(l, true));
return doc.DocumentNode.OuterHtml;
}
//lijst van tags die worden vervangen door <b></b>
private List<string> hList = new List<string>
{
{ "h1"},
{ "h2"},
{ "h3"},
{ "h4"},
{ "h5"},
{ "h6"}
};
List<string> Whitelist = new List<string>
{
{ "p"},
{ "ul"},
{ "li"},
{ "br"},
{ "b"},
{ "table"},
{ "tr"},
{ "th"},
{ "td"},
{ "strong"}
};
输入是
"<head><script>alert('Hello!');</script></head><div><div><h1>Lorem ipsum </h1></div></div> <h1 style='font-size:38px;'><p style='font-size:38px;'>dolor </p></h1> sit <h1 style='font-size:38px;'><strong style='font-size:38px;'>amet</strong></h1>"
输出为
"<b>Lorem ipsum</b> <b><p>dolor</p></b> sit <b><strong>amet</strong></b>"
下面这段代码的目的是能够接受来自客户端的可能包含 HTML 的字符串,并删除样式、脚本、某些标签并将 H 标签替换为 B 标签。
private IDictionary<string, string[]> Whitelist;
public vacatures PostPutVacancy(vacancy vacancy)
{
//List of allowed tags
Whitelist = new Dictionary<string, string[]> {
{ "p", null },
{ "ul", null },
{ "li", null },
{ "br", null },
{ "b", null },
{ "table", null },
{ "tr", null },
{ "th", null },
{ "td", null },
{ "strong", null }
};
foreach (var item in vacancy.GetType().GetProperties())
{
if (vacancy.GetType().GetProperty(item.Name).PropertyType.FullName.Contains("String"))
{
var value = item.GetValue(vacancy, null);
if (value != null)
{
item.SetValue(vacancy, CallSanitizers(item.GetValue(vacancy, null)));
var test1 = item.GetValue(vacancy);
}
}
}
return vacancy;
}
private List<string> hList = new List<string>
{
{ "h1"},
{ "h2"},
{ "h3"},
{ "h4"},
{ "h5"},
{ "h6"}
};
private string CallSanitizers(object obj)//==Sanitize()
{
string str = obj.ToString();
if (str != HttpUtility.HtmlEncode(str))
{
doc.LoadHtml(str);
SanitizeNode(doc.DocumentNode);
string test = doc.DocumentNode.WriteTo().Trim();
return doc.DocumentNode.WriteTo().Trim();
}
else
{
return str;
}
}
private void SanitizeChildren(HtmlNode parentNode)
{
for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
{
SanitizeNode(parentNode.ChildNodes[i]);
}
}
private void SanitizeNode(HtmlNode node)
{
if (node.NodeType == HtmlNodeType.Element)
{
if (!Whitelist.ContainsKey(node.Name))
{
if (hList.Contains(node.Name))
{
HtmlNode b = doc.CreateElement("b");
b.InnerHtml = node.InnerHtml;
node.ParentNode.ReplaceChild(b, node);
}
else
{
node.ParentNode.RemoveChild(node, true);
}
}
if (node.HasAttributes)
{
for (int i = node.Attributes.Count - 1; i >= 0; i--)
{
HtmlAttribute currentAttribute = node.Attributes[i];
node.Attributes.Remove(currentAttribute);
}
}
}
if (node.HasChildNodes)
{
SanitizeChildren(node);
}
}
它可以工作,但有一个问题,子节点的子节点没有被清理,请参见示例。
输入:
"Lorem ipsum<h1 style='font-size:38px;'><p style='font-size:38px;'>dolor sit</p></h1> amet <h1 style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></h1>"
结果:
"Lorem ipsum<b><p style='font-size:38px;'>dolor sit</p></b> amet <b style='font-size:38px;'><strong style='font-size:38px;'>consectetur adipiscing</strong></b>"
问题一定是由于无法将子项放回已更改的父项,因为由于标记类型的更改,父项不再被识别。
有人知道如何解决这个问题吗?
如果问题不清楚或表述不当,请post发表评论。
提前致谢
这修复了它
private string CallSanitizers(string str)
{
if (str != HttpUtility.HtmlEncode(str))
{
doc.LoadHtml(str);
str = Sanitizers();
return doc.DocumentNode.WriteTo().Trim();
}
else
{
return str;
}
}
private string Sanitizers()
{
doc.DocumentNode.Descendants().Where(l => l.Name == "script" || l.Name == "style").ToList().ForEach(l => l.Remove());
doc.DocumentNode.Descendants().Where(l => hList.Contains(l.Name)).ToList().ForEach(l => l.Name = "b");
doc.DocumentNode.Descendants().Where(l => l.Attributes != null).ToList().ForEach(l => l.Attributes.ToList().ForEach(a => a.Remove()));
doc.DocumentNode.Descendants().Where(l => !Whitelist.Contains(l.Name) && l.NodeType == HtmlNodeType.Element).ToList().ForEach(l => l.ParentNode.RemoveChild(l, true));
return doc.DocumentNode.OuterHtml;
}
//lijst van tags die worden vervangen door <b></b>
private List<string> hList = new List<string>
{
{ "h1"},
{ "h2"},
{ "h3"},
{ "h4"},
{ "h5"},
{ "h6"}
};
List<string> Whitelist = new List<string>
{
{ "p"},
{ "ul"},
{ "li"},
{ "br"},
{ "b"},
{ "table"},
{ "tr"},
{ "th"},
{ "td"},
{ "strong"}
};
输入是
"<head><script>alert('Hello!');</script></head><div><div><h1>Lorem ipsum </h1></div></div> <h1 style='font-size:38px;'><p style='font-size:38px;'>dolor </p></h1> sit <h1 style='font-size:38px;'><strong style='font-size:38px;'>amet</strong></h1>"
输出为
"<b>Lorem ipsum</b> <b><p>dolor</p></b> sit <b><strong>amet</strong></b>"