如何防止 .NET XML 解析器扩展 XML 中的参数实体?
How do you keep .NET XML parsers from expanding parameter entities in XML?
当我尝试解析下面的 xml(使用下面的代码)时,我不断得到 <sgml>&question;&signature;</sgml>
扩展到
<sgml>Why couldn’t I publish my books directly in standard SGML? — William Shakespeare.</sgml>
或
<sgml></sgml>
因为我正在研究 XML 三向合并算法,所以我想检索未扩展的
<sgml>&question;&signature;</sgml>
我试过:
- 正常解析 xml(这导致扩展的 sgml 标签)
- 从 xml 开头删除 Doctype(这会导致空 sgml 标记)
- 各种 XmlReader DTD 设置
我有以下 XML 文件:
<!DOCTYPE sgml [
<!ELEMENT sgml ANY>
<!ENTITY std "standard SGML">
<!ENTITY signature " — &author;.">
<!ENTITY question "Why couldn’t I publish my books directly in &std;?">
<!ENTITY author "William Shakespeare">
]>
<sgml>&question;&signature;</sgml>
这是我尝试过的代码(多次尝试):
using System.IO;
using System.Xml;
using System.Xml.Linq;
using System.Reflection;
/// <summary>
/// Repro program: loads the sample document three different ways through
/// LINQ to XML. Per the inline notes, none of the attempts returns the
/// unexpanded <c>&amp;question;&amp;signature;</c> references.
/// </summary>
class Program
{
    // The DOCTYPE declaration occupies the first seven lines of the sample file.
    private const int DocTypeLineCount = 7;

    static void Main(string[] args)
    {
        string xml = @"C:\src\Apps\Wit\MergingAlgorithmTest\MergingAlgorithmTest\Tests\XMLMerge-DocTypeExpansion\DocTypeExpansion.0.xml";

        var ignoreDtdSettings = new XmlReaderSettings();
        ignoreDtdSettings.CheckCharacters = false;
        ignoreDtdSettings.DtdProcessing = DtdProcessing.Ignore;

        var parseDtdSettings = new XmlReaderSettings();
        parseDtdSettings.CheckCharacters = false;
        parseDtdSettings.DtdProcessing = DtdProcessing.Parse;

        using (var fs = File.Open(xml, FileMode.Open, FileAccess.Read))
        {
            // Attempt 1: read the file while ignoring the DTD.
            using (var ignoringReader = XmlReader.Create(fs, ignoreDtdSettings))
            {
                SuppressUndeclaredEntityCheck(ignoringReader);
                var ignoredDoc = XDocument.Load(ignoringReader);
                Console.WriteLine(ignoredDoc.Root.ToString()); // outputs <sgml></sgml> not <sgml>&question;&signature;</sgml>
            }

            // Attempt 2: rewind and read the same stream with the DTD parsed.
            fs.Position = 0;
            using (var parsingReader = XmlReader.Create(fs, parseDtdSettings))
            {
                var parsedDoc = XDocument.Load(parsingReader);
                Console.WriteLine(parsedDoc.Root.ToString()); // outputs <sgml>Why couldn't I publish my books directly in standard SGML? - William Shakespeare.</sgml> not <sgml>&question;&signature;</sgml>
            }

            // Attempt 3: rewind, skip the DOCTYPE lines and parse only the line that follows them.
            fs.Position = 0;
            string rootElementLine = String.Empty;
            using (StreamReader lineReader = new StreamReader(fs))
            {
                int skipped = 0;
                while (skipped < DocTypeLineCount)
                {
                    lineReader.ReadLine();
                    skipped++;
                }
                rootElementLine = lineReader.ReadLine();
            }

            using (XmlReader strippedReader = XmlReader.Create(new StringReader(rootElementLine), parseDtdSettings))
            {
                SuppressUndeclaredEntityCheck(strippedReader);
                var strippedDoc = XDocument.Load(strippedReader); // Empty sgml tag
            }
        }
    }

    /// <summary>
    /// Sets the reader's "DisableUndeclaredEntityCheck" property to true via reflection.
    /// Prevents Exception "Reference to undeclared entity 'question'".
    /// </summary>
    /// <param name="reader">Reader whose runtime type is searched for the property.</param>
    private static void SuppressUndeclaredEntityCheck(XmlReader reader)
    {
        const BindingFlags anyInstanceMember = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
        PropertyInfo entityCheckSwitch = reader.GetType().GetProperty("DisableUndeclaredEntityCheck", anyInstanceMember);
        entityCheckSwitch.SetValue(reader, true, null);
    }
}
LINQ to XML 不支持对实体引用建模 -- 它们会被自动扩展为各自的值(source 1, source 2)。根本没有为一般实体引用定义的 XObject 子类。
但是,假设您的 XML 有效(即实体引用已在 DTD 中声明,您的示例正是如此),您可以使用旧的 XML 文档对象模型(XmlDocument)来解析 XML,并将 XmlEntityReference 节点插入 XML DOM 树,而不是将实体引用扩展为纯文本:
// Load the file through the legacy XmlTextReader / XmlDocument pair so that
// general entity references stay in the tree instead of being replaced by text.
using (var fileReader = new StreamReader(xml))
{
    using (var entityReader = new XmlTextReader(fileReader))
    {
        // Expands character entities and returns general entities as System.Xml.XmlNodeType.EntityReference
        entityReader.EntityHandling = EntityHandling.ExpandCharEntities;

        var dom = new XmlDocument();
        dom.Load(entityReader);

        var root = dom.DocumentElement;
        Debug.WriteLine(root.OuterXml); // Outputs <sgml>&question;&signature;</sgml>

        // Verify that both entity references are still there - neither assert fires.
        Debug.Assert(root.OuterXml.Contains("&question;"));
        Debug.Assert(root.OuterXml.Contains("&signature;"));
    }
}
每个 XmlEntityReference 的 ChildNodes 将包含该一般实体的文本值。如果一个一般实体引用了其他一般实体(您的情况正是如此),相应的内部 XmlEntityReference 将嵌套在外部 XmlEntityReference 的 ChildNodes 中。然后,您可以使用旧的 XmlDocument API 比较新旧 XML。
请注意,您还需要使用旧的 XmlTextReader,并设置 EntityHandling = EntityHandling.ExpandCharEntities。
当我尝试解析下面的 xml(使用下面的代码)时,我不断得到 <sgml>&question;&signature;</sgml>
扩展到
<sgml>Why couldn’t I publish my books directly in standard SGML? — William Shakespeare.</sgml>
或
<sgml></sgml>
因为我正在研究 XML 三向合并算法,所以我想检索未扩展的
<sgml>&question;&signature;</sgml>
我试过:
- 正常解析 xml(这导致扩展的 sgml 标签)
- 从 xml 开头删除 Doctype(这会导致空 sgml 标记)
- 各种 XmlReader DTD 设置
我有以下 XML 文件:
<!DOCTYPE sgml [
<!ELEMENT sgml ANY>
<!ENTITY std "standard SGML">
<!ENTITY signature " — &author;.">
<!ENTITY question "Why couldn’t I publish my books directly in &std;?">
<!ENTITY author "William Shakespeare">
]>
<sgml>&question;&signature;</sgml>
这是我尝试过的代码(多次尝试):
using System.IO;
using System.Xml;
using System.Xml.Linq;
using System.Reflection;
/// <summary>
/// Repro program: loads the sample document three different ways through
/// LINQ to XML. Per the inline notes, none of the attempts returns the
/// unexpanded <c>&amp;question;&amp;signature;</c> references.
/// </summary>
class Program
{
    // The DOCTYPE declaration occupies the first seven lines of the sample file.
    private const int DocTypeLineCount = 7;

    static void Main(string[] args)
    {
        string xml = @"C:\src\Apps\Wit\MergingAlgorithmTest\MergingAlgorithmTest\Tests\XMLMerge-DocTypeExpansion\DocTypeExpansion.0.xml";

        var ignoreDtdSettings = new XmlReaderSettings();
        ignoreDtdSettings.CheckCharacters = false;
        ignoreDtdSettings.DtdProcessing = DtdProcessing.Ignore;

        var parseDtdSettings = new XmlReaderSettings();
        parseDtdSettings.CheckCharacters = false;
        parseDtdSettings.DtdProcessing = DtdProcessing.Parse;

        using (var fs = File.Open(xml, FileMode.Open, FileAccess.Read))
        {
            // Attempt 1: read the file while ignoring the DTD.
            using (var ignoringReader = XmlReader.Create(fs, ignoreDtdSettings))
            {
                SuppressUndeclaredEntityCheck(ignoringReader);
                var ignoredDoc = XDocument.Load(ignoringReader);
                Console.WriteLine(ignoredDoc.Root.ToString()); // outputs <sgml></sgml> not <sgml>&question;&signature;</sgml>
            }

            // Attempt 2: rewind and read the same stream with the DTD parsed.
            fs.Position = 0;
            using (var parsingReader = XmlReader.Create(fs, parseDtdSettings))
            {
                var parsedDoc = XDocument.Load(parsingReader);
                Console.WriteLine(parsedDoc.Root.ToString()); // outputs <sgml>Why couldn't I publish my books directly in standard SGML? - William Shakespeare.</sgml> not <sgml>&question;&signature;</sgml>
            }

            // Attempt 3: rewind, skip the DOCTYPE lines and parse only the line that follows them.
            fs.Position = 0;
            string rootElementLine = String.Empty;
            using (StreamReader lineReader = new StreamReader(fs))
            {
                int skipped = 0;
                while (skipped < DocTypeLineCount)
                {
                    lineReader.ReadLine();
                    skipped++;
                }
                rootElementLine = lineReader.ReadLine();
            }

            using (XmlReader strippedReader = XmlReader.Create(new StringReader(rootElementLine), parseDtdSettings))
            {
                SuppressUndeclaredEntityCheck(strippedReader);
                var strippedDoc = XDocument.Load(strippedReader); // Empty sgml tag
            }
        }
    }

    /// <summary>
    /// Sets the reader's "DisableUndeclaredEntityCheck" property to true via reflection.
    /// Prevents Exception "Reference to undeclared entity 'question'".
    /// </summary>
    /// <param name="reader">Reader whose runtime type is searched for the property.</param>
    private static void SuppressUndeclaredEntityCheck(XmlReader reader)
    {
        const BindingFlags anyInstanceMember = BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic;
        PropertyInfo entityCheckSwitch = reader.GetType().GetProperty("DisableUndeclaredEntityCheck", anyInstanceMember);
        entityCheckSwitch.SetValue(reader, true, null);
    }
}
LINQ to XML 不支持对实体引用建模 -- 它们会被自动扩展为各自的值(source 1, source 2)。根本没有为一般实体引用定义的 XObject 子类。
但是,假设您的 XML 有效(即实体引用已在 DTD 中声明,您的示例正是如此),您可以使用旧的 XML 文档对象模型(XmlDocument)来解析 XML,并将 XmlEntityReference 节点插入 XML DOM 树,而不是将实体引用扩展为纯文本:
// Load the file through the legacy XmlTextReader / XmlDocument pair so that
// general entity references stay in the tree instead of being replaced by text.
using (var fileReader = new StreamReader(xml))
{
    using (var entityReader = new XmlTextReader(fileReader))
    {
        // Expands character entities and returns general entities as System.Xml.XmlNodeType.EntityReference
        entityReader.EntityHandling = EntityHandling.ExpandCharEntities;

        var dom = new XmlDocument();
        dom.Load(entityReader);

        var root = dom.DocumentElement;
        Debug.WriteLine(root.OuterXml); // Outputs <sgml>&question;&signature;</sgml>

        // Verify that both entity references are still there - neither assert fires.
        Debug.Assert(root.OuterXml.Contains("&question;"));
        Debug.Assert(root.OuterXml.Contains("&signature;"));
    }
}
每个 XmlEntityReference 的 ChildNodes 将包含该一般实体的文本值。如果一个一般实体引用了其他一般实体(您的情况正是如此),相应的内部 XmlEntityReference 将嵌套在外部 XmlEntityReference 的 ChildNodes 中。然后,您可以使用旧的 XmlDocument API 比较新旧 XML。
请注意,您还需要使用旧的 XmlTextReader,并设置 EntityHandling = EntityHandling.ExpandCharEntities。