在 XDocument.SetAttributeValue 中写入一个巨大的字符串

Write a huge string in XDocument.SetAttributeValue

我有一个很大的 StringBuilder (~140MB),我需要在 XML 属性中写入。我正在使用 XDocument 来处理 XML 操作。

当尝试将 string 写入 XAttribute 时,我得到一个 System.OutOfMemoryException(因为我需要调用 StringBuilder.ToString(),我想这会将整个字符串加载到内存中)。

var length = value.RawArtifact.Content.Length;
StringBuilder b = new StringBuilder();
int pos = 0;
// Hex-encode the byte array in 1000-byte slices so that no single intermediate string is huge.
while (pos < length - 1000)
{
    b.Append(BitConverter.ToString(value.RawArtifact.Content, pos, 1000).Replace("-", ""));
    pos += 1000;
}
// Encode the remaining tail. The dashes are stripped from the tail string itself: calling Replace()
// on the StringBuilder (as the closing parenthesis was previously placed) rescans the entire
// accumulated buffer even though every earlier slice has already been cleaned.
b.Append(BitConverter.ToString(value.RawArtifact.Content, pos).Replace("-", ""));
var buffer = b.ToString(); // This throws an exception
myAttribute.SetAttributeValue("my-attribute", buffer);

我找不到 SetAttributeValue 的任何可以接受 StreamReader 之类参数的重载,所以我现在感觉有点卡住了。

有什么建议吗?

如果您查看 XAttribute 的参考源代码 (reference source),您会看到 XAttribute 的值只是以一个 internal string value; 字段保存的字符串,因此无法使用 StringBuilder 或 StreamReader 作为值。

相反,您可以考虑采用流式处理的方法:在写出 XDocument 的同时,把所需的属性注入到 XML 流中。这样,您就可以结合使用 XmlWriter.WriteStartAttribute() 和 XmlWriter.WriteChars(),把巨大的属性值分块写出。WriteChars() 方法:

can be used to write large amounts of text one buffer at a time.

正是为这种情况而设计的。有两种基本方法可以实现属性值的流式注入:

  1. 使用 Mark Fussell 的文章 “Combining the XmlReader and XmlWriter classes for simple streaming transformations” 中的算法,在把 XDocument.CreateReader() 返回的 XmlReader 流式复制到 XmlWriter 的过程中注入该属性。

    有关示例,请参阅 “File size restriction or limitation in C#” 和 “Edit a large XML file” 等问答。

  2. 直接派生 XmlWriter 的子类,并在写出目标元素时注入属性。

    有关示例,请参阅

采用第二种方式,首先创建如下扩展方法:

/// <summary>
/// <see cref="XmlWriter"/> extensions for writing an attribute whose value is supplied in chunks,
/// so that the complete value never has to exist as a single string.
/// </summary>
public static partial class XmlExtensions
{
    /// <summary>Writes an attribute with no prefix and no namespace from a sequence of character segments.</summary>
    /// <param name="writer">The writer to write to; it must be positioned where an attribute may be written.</param>
    /// <param name="localName">The local name of the attribute.</param>
    /// <param name="valueSegments">The value chunks; only the first <c>Length</c> characters of each <c>Buffer</c> are written.</param>
    public static void WriteAttribute(this XmlWriter writer, string localName, IEnumerable<(char [] Buffer, int Length)> valueSegments) =>
        WriteAttribute(writer, null, localName, null, valueSegments);

    /// <summary>Writes an attribute with no prefix in the given namespace from a sequence of character segments.</summary>
    /// <param name="writer">The writer to write to; it must be positioned where an attribute may be written.</param>
    /// <param name="localName">The local name of the attribute.</param>
    /// <param name="namespaceUri">The namespace URI of the attribute, or null.</param>
    /// <param name="valueSegments">The value chunks; only the first <c>Length</c> characters of each <c>Buffer</c> are written.</param>
    public static void WriteAttribute(this XmlWriter writer, string localName, string namespaceUri, IEnumerable<(char [] Buffer, int Length)> valueSegments) =>
        WriteAttribute(writer, null, localName, namespaceUri, valueSegments);

    /// <summary>
    /// Writes an attribute from a sequence of character segments, one <see cref="XmlWriter.WriteChars"/> call
    /// per segment. Each segment is consumed before the next one is requested, so the producer may reuse
    /// the same buffer for every segment.
    /// </summary>
    /// <param name="writer">The writer to write to; it must be positioned where an attribute may be written.</param>
    /// <param name="prefix">The namespace prefix of the attribute, or null.</param>
    /// <param name="localName">The local name of the attribute.</param>
    /// <param name="namespaceUri">The namespace URI of the attribute, or null.</param>
    /// <param name="valueSegments">The value chunks; only the first <c>Length</c> characters of each <c>Buffer</c> are written.</param>
    /// <exception cref="ArgumentNullException"><paramref name="writer"/> or <paramref name="valueSegments"/> is null.</exception>
    /// <exception cref="XmlException">The value ends with a high surrogate that has no matching low surrogate.</exception>
    public static void WriteAttribute(this XmlWriter writer, string prefix, string localName, string namespaceUri, IEnumerable<(char [] Buffer, int Length)> valueSegments)
    {
        if (writer == null)
            throw new ArgumentNullException(nameof(writer));
        if (valueSegments == null)
            throw new ArgumentNullException(nameof(valueSegments));

        writer.WriteStartAttribute(prefix, localName, namespaceUri);
        // Lazily allocated two-char scratch buffer; slot 0 holds a pending high surrogate, '\0' means "none pending".
        char [] surrogateBuffer = null;

        // According to the docs, surrogate pairs cannot be split across calls to WriteChars():
        // https://docs.microsoft.com/en-us/dotnet/api/system.xml.xmlwriter.writechars?view=net-5.0#remarks
        // So if the last character of a segment is a high surrogate, buffer it and write it with the first character of the next buffer.
        foreach (var segment in valueSegments)
        {
            if (segment.Length < 1)
                continue;
            int start = 0;
            if (surrogateBuffer != null && surrogateBuffer[0] != '\0')
            {
                // Complete the pair held over from the previous segment and write both halves together.
                surrogateBuffer[1] = segment.Buffer[start++];
                writer.WriteChars(surrogateBuffer, 0, 2);
                surrogateBuffer[0] = surrogateBuffer[1] = '\0';
            }
            int count = segment.Length - start;
            if (count > 0 && char.IsHighSurrogate(segment.Buffer[segment.Length-1]))
            {
                // Hold back a trailing high surrogate until its low half arrives with the next segment.
                (surrogateBuffer = surrogateBuffer ?? new char[2])[0] = segment.Buffer[segment.Length-1];
                count--;
            }
            writer.WriteChars(segment.Buffer, start, count);
        }
        // Close the attribute first so the writer is left in a consistent state even if the value was malformed.
        writer.WriteEndAttribute();
        if (surrogateBuffer != null && surrogateBuffer[0] != '\0')
            throw new XmlException(string.Format("Unterminated surrogate pair {0}", surrogateBuffer[0]));
    }
}

/// <summary>
/// Helpers for converting bytes to upper-case hexadecimal characters without building one large string.
/// </summary>
public static class ByteExtensions
{
    // Hex conversion adapted from an answer by CodesInChaos
    // (https://stackoverflow.com/users/445517/codesinchaos),
    // modified to populate a char span rather than return a string.

    /// <summary>
    /// Writes two upper-case hex digits per input byte into <paramref name="c"/>, high nibble first.
    /// </summary>
    /// <param name="bytes">The bytes to encode.</param>
    /// <param name="c">The destination; must hold at least <c>2 * bytes.Length</c> characters.</param>
    /// <exception cref="ArgumentException"><paramref name="c"/> is too short.</exception>
    public static void ByteToHexBitFiddle(ReadOnlySpan<byte> bytes, Span<char> c)
    {
        if (c.Length < 2* bytes.Length)
            throw new ArgumentException("c.Length < 2* bytes.Length");
        for (int i = 0; i < bytes.Length; i++)
        {
            // 55 + b maps 10..15 to 'A'..'F'. For b < 10 the sign bit of (b - 10) is smeared into a
            // mask of all ones, which selects the -7 correction and yields '0'..'9' instead.
            int b = bytes[i] >> 4;
            c[i * 2] = (char)(55 + b + (((b-10)>>31)&-7));
            b = bytes[i] & 0xF;
            c[i * 2 + 1] = (char)(55 + b + (((b-10)>>31)&-7));
        }
    }

    /// <summary>
    /// Lazily encodes <paramref name="bytes"/> as hex, <paramref name="chunkSize"/> bytes at a time.
    /// </summary>
    /// <remarks>
    /// The same char buffer is reused for every yielded segment, so each segment must be consumed
    /// before the enumeration is advanced. A single empty segment is yielded for empty input.
    /// </remarks>
    /// <param name="bytes">The bytes to encode.</param>
    /// <param name="chunkSize">The number of bytes encoded per segment; must be positive.</param>
    /// <returns>Segments of (buffer, number of valid chars in the buffer).</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is zero or negative.</exception>
    public static IEnumerable<(char [] segment, int length)> GetHexCharSegments(ReadOnlyMemory<byte> bytes, int chunkSize = 1000)
    {
        // Validate eagerly (outside the iterator): a chunk size of zero would otherwise never
        // advance the position and loop forever on non-empty input.
        if (chunkSize <= 0)
            throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "chunkSize must be positive.");
        return GetHexCharSegmentsIterator(bytes, chunkSize);
    }

    // Iterator half of GetHexCharSegments; assumes chunkSize has already been validated.
    private static IEnumerable<(char [] segment, int length)> GetHexCharSegmentsIterator(ReadOnlyMemory<byte> bytes, int chunkSize)
    {
        var buffer = new char[2*chunkSize];
        var length = bytes.Length;
        int pos = 0;
        // Full chunks: every one fills the buffer completely.
        while (pos < length - chunkSize)
        {
            ByteExtensions.ByteToHexBitFiddle(bytes.Span.Slice(pos, chunkSize), buffer);
            yield return (buffer, buffer.Length);
            pos += chunkSize;
        }
        // Final chunk: between 0 and chunkSize bytes remain.
        ByteExtensions.ByteToHexBitFiddle(bytes.Span.Slice(pos), buffer);
        yield return (buffer, 2*(length - pos));
    }
}

接下来,按如下方式派生 XmlWriter 的子类:

/// <summary>
/// Event data for the element notifications raised by <see cref="NotifyingXmlWriter"/>.
/// </summary>
public class ElementEventArgs : EventArgs
{
    /// <summary>The name of the element the notification is about.</summary>
    public XName Element { get; init; }
    /// <summary>
    /// The stack of element names supplied by the event source. <see cref="NotifyingXmlWriter"/> passes
    /// its own stack instance here rather than a copy.
    /// </summary>
    public Stack<XName> ElementStack { get; init; }
}

/// <summary>
/// An <see cref="XmlWriterProxy"/> that tracks the currently open elements and raises an event each time
/// an element is started or ended, so callers can inject content (e.g. extra attributes) while streaming.
/// </summary>
public class NotifyingXmlWriter : XmlWriterProxy
{
    // Names of the currently open elements, innermost on top.
    readonly Stack<XName> elements = new Stack<XName>();

    /// <summary>Creates a notifying writer that forwards all output to <paramref name="baseWriter"/>.</summary>
    public NotifyingXmlWriter(XmlWriter baseWriter) : base(baseWriter) { }

    /// <summary>
    /// Raised right after a start tag has been passed to the underlying writer, i.e. while attributes
    /// can still be written to the element. The sender is this writer.
    /// </summary>
    public event EventHandler<ElementEventArgs> OnElementStarted;

    /// <summary>Raised after an element has been closed and removed from the element stack.</summary>
    public event EventHandler<ElementEventArgs> OnElementEnded;

    /// <inheritdoc/>
    public override void WriteStartElement(string prefix, string localName, string ns)
    {
        base.WriteStartElement(prefix, localName, ns);
        // XmlWriter.WriteStartElement(localName) passes a null namespace, which XName.Get() rejects;
        // treat it as "no namespace".
        var name = XName.Get(localName, ns ?? string.Empty);
        elements.Push(name);
        OnElementStarted?.Invoke(this, new ElementEventArgs { Element = name, ElementStack = elements });
    }

    /// <inheritdoc/>
    public override void WriteEndElement()
    {
        base.WriteEndElement();
        NotifyElementEnded();
    }

    /// <inheritdoc/>
    public override void WriteFullEndElement()
    {
        // Must be tracked as well: XDocument.WriteTo() closes every non-empty element with
        // WriteFullEndElement(), so without this override the stack would never be popped for them.
        base.WriteFullEndElement();
        NotifyElementEnded();
    }

    // Pops the element that was just closed and raises OnElementEnded for it.
    void NotifyElementEnded()
    {
        var name = elements.Pop(); // Pop after the base call lets the base class throw an exception on a stack error.
        OnElementEnded?.Invoke(this, new ElementEventArgs { Element = name, ElementStack = elements });
    }
}

/// <summary>
/// A pass-through <see cref="XmlWriter"/> that forwards every call to a wrapped writer. Subclasses can
/// override individual members, or override <see cref="IsSuspended"/> to temporarily drop content writes.
/// </summary>
/// <remarks>
/// Taken from an answer by dbc (https://stackoverflow.com/users/3744182/dbc).
/// NOTE: async methods not implemented.
/// </remarks>
public class XmlWriterProxy : XmlWriter
{
    readonly XmlWriter _inner;

    /// <summary>Wraps <paramref name="baseWriter"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="baseWriter"/> is null.</exception>
    public XmlWriterProxy(XmlWriter baseWriter)
    {
        if (baseWriter is null)
        {
            throw new ArgumentNullException();
        }
        _inner = baseWriter;
    }

    /// <summary>
    /// When true, the guarded write methods below become no-ops. Close, Flush, LookupPrefix,
    /// WriteStartDocument and WriteState are always forwarded.
    /// </summary>
    protected virtual bool IsSuspended => false;

    // ---- Members that are always forwarded ----

    public override WriteState WriteState
    {
        get { return _inner.WriteState; }
    }

    public override void Close()
    {
        _inner.Close();
    }

    public override void Flush()
    {
        _inner.Flush();
    }

    public override string LookupPrefix(string ns)
    {
        return _inner.LookupPrefix(ns);
    }

    public override void WriteStartDocument()
    {
        _inner.WriteStartDocument();
    }

    public override void WriteStartDocument(bool standalone)
    {
        _inner.WriteStartDocument(standalone);
    }

    // ---- Members that are skipped while suspended ----

    public override void WriteEndDocument()
    {
        if (!IsSuspended)
        {
            _inner.WriteEndDocument();
        }
    }

    public override void WriteDocType(string name, string pubid, string sysid, string subset)
    {
        if (!IsSuspended)
        {
            _inner.WriteDocType(name, pubid, sysid, subset);
        }
    }

    public override void WriteStartElement(string prefix, string localName, string ns)
    {
        if (!IsSuspended)
        {
            _inner.WriteStartElement(prefix, localName, ns);
        }
    }

    public override void WriteEndElement()
    {
        if (!IsSuspended)
        {
            _inner.WriteEndElement();
        }
    }

    public override void WriteFullEndElement()
    {
        if (!IsSuspended)
        {
            _inner.WriteFullEndElement();
        }
    }

    public override void WriteStartAttribute(string prefix, string localName, string ns)
    {
        if (!IsSuspended)
        {
            _inner.WriteStartAttribute(prefix, localName, ns);
        }
    }

    public override void WriteEndAttribute()
    {
        if (!IsSuspended)
        {
            _inner.WriteEndAttribute();
        }
    }

    public override void WriteCData(string text)
    {
        if (!IsSuspended)
        {
            _inner.WriteCData(text);
        }
    }

    public override void WriteComment(string text)
    {
        if (!IsSuspended)
        {
            _inner.WriteComment(text);
        }
    }

    public override void WriteProcessingInstruction(string name, string text)
    {
        if (!IsSuspended)
        {
            _inner.WriteProcessingInstruction(name, text);
        }
    }

    public override void WriteEntityRef(string name)
    {
        if (!IsSuspended)
        {
            _inner.WriteEntityRef(name);
        }
    }

    public override void WriteCharEntity(char ch)
    {
        if (!IsSuspended)
        {
            _inner.WriteCharEntity(ch);
        }
    }

    public override void WriteSurrogateCharEntity(char lowChar, char highChar)
    {
        if (!IsSuspended)
        {
            _inner.WriteSurrogateCharEntity(lowChar, highChar);
        }
    }

    public override void WriteWhitespace(string ws)
    {
        if (!IsSuspended)
        {
            _inner.WriteWhitespace(ws);
        }
    }

    public override void WriteString(string text)
    {
        if (!IsSuspended)
        {
            _inner.WriteString(text);
        }
    }

    public override void WriteChars(char[] buffer, int index, int count)
    {
        if (!IsSuspended)
        {
            _inner.WriteChars(buffer, index, count);
        }
    }

    public override void WriteRaw(string data)
    {
        if (!IsSuspended)
        {
            _inner.WriteRaw(data);
        }
    }

    public override void WriteRaw(char[] buffer, int index, int count)
    {
        if (!IsSuspended)
        {
            _inner.WriteRaw(buffer, index, count);
        }
    }

    public override void WriteBase64(byte[] buffer, int index, int count)
    {
        if (!IsSuspended)
        {
            _inner.WriteBase64(buffer, index, count);
        }
    }
}

现在您可以执行以下操作:

string fileName = @"Question68941254.xml"; // or whatever

// Fully qualified name of the element that should receive the streamed attribute.
XNamespace targetNamespace = "";
XName targetName = targetNamespace + "TheNode";

using (var textWriter = new StreamWriter(fileName))
using (var innerXmlWriter = XmlWriter.Create(textWriter, new XmlWriterSettings { Indent = true }))
using (var xmlWriter = new NotifyingXmlWriter(innerXmlWriter))
{
    xmlWriter.OnElementStarted += (sender, args) =>
    {
        // Every element other than the target passes through untouched.
        if (args.Element != targetName)
            return;

        // The start tag is still open at this point, so the attribute can be appended to it.
        // The byte array is hex-encoded chunk by chunk instead of as one huge string.
        var attributeWriter = (XmlWriter)sender;
        var hexSegments = ByteExtensions.GetHexCharSegments(value.RawArtifact.Content.AsMemory());
        attributeWriter.WriteAttribute("TheAttribute", hexSegments);
    };
    // Stream the document through the notifying writer; the handler above fires as elements are written.
    xdocument.WriteTo(xmlWriter);
}

其中,xdocument 当然就是您要写出的那个 XDocument;在写出过程中,属性 TheAttribute 会被添加到节点 TheNode 上。

备注:

  • 由于您的代码显示,您是通过把一个大字节数组转换成一个巨大的十六进制字符串来填充 StringBuilder 的,因此我去掉了中间的 StringBuilder,直接把字节数组分块写出。

    如果确实需要把某个 StringBuilder b 的内容分块写出,请使用:

    /// <summary>Helpers for reading a <see cref="StringBuilder"/> in fixed-size chunks.</summary>
    public static partial class StringBuilderExtensions
    {
        /// <summary>
        /// Lazily copies the contents of <paramref name="sb"/> out in chunks of at most
        /// <paramref name="bufferSize"/> characters, without ever calling <c>ToString()</c> on it.
        /// </summary>
        /// <remarks>
        /// The same char buffer is reused for every yielded segment, so each segment must be consumed
        /// before the enumeration is advanced. Nothing is yielded for an empty builder.
        /// </remarks>
        /// <param name="sb">The builder to read from.</param>
        /// <param name="bufferSize">The maximum number of characters per segment; must be positive.</param>
        /// <returns>Segments of (buffer, number of valid chars in the buffer).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sb"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is zero or negative.</exception>
        public static IEnumerable<(char [] segment, int length)> GetSegments(this StringBuilder sb, int bufferSize = 1024)
        {
            // Validate eagerly (outside the iterator): a buffer size of zero would otherwise never
            // advance the index and loop forever on a non-empty builder.
            if (sb == null)
                throw new ArgumentNullException(nameof(sb));
            if (bufferSize <= 0)
                throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "bufferSize must be positive.");
            return GetSegmentsIterator(sb, bufferSize);
        }

        // Iterator half of GetSegments; assumes the arguments have already been validated.
        static IEnumerable<(char [] segment, int length)> GetSegmentsIterator(StringBuilder sb, int bufferSize)
        {
            var buffer = new char[bufferSize];
            for (int i = 0; i < sb.Length; i += buffer.Length)
            {
                int length = Math.Min(buffer.Length, sb.Length - i);
                // The four-argument char[] overload is available on every framework version.
                sb.CopyTo(i, buffer, 0, length);
                yield return (buffer, length);
            }
        }
    }
    

    并将b.GetSegments()传递给XmlExtensions.WriteAttribute()

演示 fiddle 见此处,输出结果如下:

<?xml version="1.0" encoding="utf-8"?>
<Root>
  <SomeOtherNode>some value</SomeOtherNode>
  <TheNode TheAttribute="000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7C8C9CACBCCCDCECFD0D1D2D3D4D5D6D7D8D9DADBDCDDDEDFE0E1E2E3E4E5E6E7E8E9EAEBECEDEEEFF0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B">
    <foo></foo>the node value</TheNode>
  <AnotherNode>another value</AnotherNode>
</Root>