如何防止 .NET XML 解析器扩展 XML 中的参数实体?

How do you keep .NET XML parsers from expanding parameter entities in XML?

当我尝试解析下面的 xml(使用下面的代码)时,我不断得到 <sgml>&question;&signature;</sgml>

扩展到

<sgml>Why couldn’t I publish my books directly in standard SGML? — William Shakespeare.</sgml>

或者空的 <sgml></sgml>。

因为我正在研究 XML 三向合并算法,所以我想检索未扩展的 <sgml>&question;&signature;</sgml>

我试过:

我有以下 XML 文件:

<!DOCTYPE sgml [
  <!ELEMENT sgml ANY>
  <!ENTITY  std       "standard SGML">
  <!ENTITY  signature " &#x2014; &author;.">
  <!ENTITY  question  "Why couldn&#x2019;t I publish my books directly in &std;?">
  <!ENTITY  author    "William Shakespeare">
]>
<sgml>&question;&signature;</sgml>

这是我尝试过的代码(多次尝试):

using System;
using System.IO;
using System.Reflection;
using System.Xml;
using System.Xml.Linq;

/// <summary>
/// Demonstrates three attempts at loading the sample document with LINQ to XML
/// while trying to keep the general entity references
/// (<c>&amp;question;&amp;signature;</c>) unexpanded. None of them succeeds:
/// the references are either expanded or dropped.
/// </summary>
class Program
{
    /// <summary>
    /// Number of leading lines occupied by the DOCTYPE declaration in the sample file.
    /// The root element is expected on the line that follows them.
    /// </summary>
    private const int DocTypeLineCount = 7;

    /// <summary>
    /// Runs the three parsing attempts against the sample file.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string xml = @"C:\src\Apps\Wit\MergingAlgorithmTest\MergingAlgorithmTest\Tests\XMLMerge-DocTypeExpansion\DocTypeExpansion.0.xml";
        var xmlSettingsIgnore = new XmlReaderSettings
        {
            CheckCharacters = false,
            DtdProcessing = DtdProcessing.Ignore
        };

        var xmlSettingsParse = new XmlReaderSettings
        {
            CheckCharacters = false,
            DtdProcessing = DtdProcessing.Parse
        };

        using (var fs = File.Open(xml, FileMode.Open, FileAccess.Read))
        {
            // Attempt 1: ignore the DTD altogether.
            using (var ignoreDtdReader = XmlReader.Create(fs, xmlSettingsIgnore))
            {
                // Prevents Exception "Reference to undeclared entity 'question'"
                DisableUndeclaredEntityCheck(ignoreDtdReader);

                var doc = XDocument.Load(ignoreDtdReader);

                Console.WriteLine(doc.Root.ToString()); // outputs <sgml></sgml> not <sgml>&question;&signature;</sgml>
            }

            // Attempt 2: let the reader parse the DTD. Rewind first, the previous reader consumed the stream.
            fs.Position = 0;
            using (var parseDtdReader = XmlReader.Create(fs, xmlSettingsParse))
            {
                var doc = XDocument.Load(parseDtdReader);
                Console.WriteLine(doc.Root.ToString()); // outputs <sgml>Why couldn't I publish my books directly in standard SGML? - William Shakespeare.</sgml> not <sgml>&question;&signature;</sgml>
            }

            // Attempt 3: skip the DOCTYPE lines by hand and parse only the root element line.
            fs.Position = 0;
            string parseXmlString;
            using (StreamReader sr = new StreamReader(fs))
            {
                for (int i = 0; i < DocTypeLineCount; ++i) // Skip DocType
                {
                    sr.ReadLine();
                }

                parseXmlString = sr.ReadLine();
            }

            // ReadLine returns null at end of stream; fail with a clear message instead of
            // an ArgumentNullException from the StringReader constructor.
            if (parseXmlString == null)
            {
                throw new InvalidDataException("Expected the root element after the " + DocTypeLineCount + "-line DOCTYPE declaration, but the file ended early.");
            }

            using (XmlReader skippedDocTypeReader = XmlReader.Create(new StringReader(parseXmlString), xmlSettingsParse))
            {
                // Prevents Exception "Reference to undeclared entity 'question'"
                DisableUndeclaredEntityCheck(skippedDocTypeReader);

                var doc2 = XDocument.Load(skippedDocTypeReader); // Empty sgml tag
            }
        }
    }

    /// <summary>
    /// Sets the property named <c>DisableUndeclaredEntityCheck</c> on the concrete reader
    /// instance to <c>true</c> via reflection. The property is looked up among both public
    /// and non-public instance members.
    /// </summary>
    /// <param name="reader">The reader whose undeclared-entity check should be switched off.</param>
    /// <exception cref="InvalidOperationException">
    /// The concrete reader type does not expose the property.
    /// </exception>
    private static void DisableUndeclaredEntityCheck(XmlReader reader)
    {
        PropertyInfo propertyInfo = reader.GetType().GetProperty("DisableUndeclaredEntityCheck", BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic);

        // GetProperty returns null when no matching property exists; report that explicitly
        // rather than failing with a NullReferenceException on SetValue.
        if (propertyInfo == null)
        {
            throw new InvalidOperationException("Reader type '" + reader.GetType().FullName + "' has no DisableUndeclaredEntityCheck property.");
        }

        propertyInfo.SetValue(reader, true, null);
    }
}

LINQ to XML 不支持对实体引用进行建模 -- 它们会被自动展开为各自的值(来源 1, 来源 2)。XObject 根本没有为通用实体引用定义任何子类。

但是,假设您的 XML 有效(即实体引用已在 DTD 中声明,您的示例正是如此),您可以使用旧的 XML 文档对象模型(XmlDocument)来解析 XML,并在 XML DOM 树中插入 XmlEntityReference 节点,而不是将实体引用展开为纯文本:

        // Load the file into the legacy XmlDocument DOM through an XmlTextReader so that
        // general entity references are kept as nodes instead of being replaced by text.
        using (var fileReader = new StreamReader(xml))
        using (var entityPreservingReader = new XmlTextReader(fileReader))
        {
            // Only character entities are expanded; general entities are returned as
            // System.Xml.XmlNodeType.EntityReference.
            entityPreservingReader.EntityHandling = EntityHandling.ExpandCharEntities;

            var domDocument = new XmlDocument();
            domDocument.Load(entityPreservingReader);

            Debug.WriteLine(domDocument.DocumentElement.OuterXml); // Outputs <sgml>&question;&signature;</sgml>

            // Both entity references must still be present in the serialized root element - no assert
            Debug.Assert(domDocument.DocumentElement.OuterXml.Contains("&question;"));
            Debug.Assert(domDocument.DocumentElement.OuterXml.Contains("&signature;"));
        }

每个 XmlEntityReference 的 ChildNodes 将包含该通用实体的文本值。如果一个通用实体引用了其他通用实体(就像您的情况一样),相应的内部 XmlEntityReference 将嵌套在外部 XmlEntityReference 的 ChildNodes 中。然后,您可以使用旧的 XmlDocument API 比较新旧 XML。

请注意,您还需要使用旧的 XmlTextReader,并设置 EntityHandling = EntityHandling.ExpandCharEntities。