如何读取、处理并写回非标准格式 XML 文件的内容

How do you read, process and write content of a non-standard formatted xml

我正在尝试用 C# 处理一个语言 XML 文件的内容,以便进行机器翻译。

<seg-source> 段的内容应翻译并写回 <target> 段。原文或译文句段内的标签格式应保持不变。

我的第一个问题是:由于开始和结束标记不是 <xml></xml>,我无法正确读取这个 XML 文件。把前两行文本替换成 <xml> 标签也行不通,因为原始 XML 文件的全部内容都写在同一行里(下面的示例只是为了便于阅读才做了格式化)。

有没有一种简单的方法可以复制所有应该转换为数组的源信息,并在我处理完后写回?

这是 XML-文件 (.sdlxliff) 的样子:

<?xml version="1.0" encoding="utf-8"?>
<xliff xmlns:sdl="http://sdl.com/FileTypes/SdlXliff/1.0" xmlns="urn:oasis:names:tc:xliff:document:1.2" version="1.2" sdl:version="1.0">
    <file original="" datatype="x-sdlfilterframework2" source-language="de-DE" target-language="en-US">
        <header>
            <file-info xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
                <value key="SDL:FileId">77260240-fccf-4e75-81e3-7a1ab00fe948</value>
                <value key="SDL:CreationDate">03/18/2022 16:00:07</value>
                <value key="SDL:OriginalFilePath"></value>
                <value key="SDL:FileTypeDllVersion">1.8.2.0</value>
                <value key="SDL:OriginalEncoding">utf-8</value>
                <value key="SDL:AutoClonedFlagSupported">True</value>
                <value key="HasUtf8Bom">False</value>
                <value key="LineBreakType">
</value>
                <value key="ParagraphTextDirections"/>
                <sniff-info>
                    <detected-encoding detection-level="Likely" encoding="utf-8"/>
                    <detected-source-lang detection-level="Guess" lang="de-DE"/>
                    <props>
                        <value key="HasUtf8Bom">False</value>
                        <value key="LineBreakType">
</value>
                    </props>
                </sniff-info>
            </file-info>
            <sdl:filetype-info>
                <sdl:filetype-id>Plain Text v 1.0.0.0</sdl:filetype-id>
            </sdl:filetype-info>
            <tag-defs xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
                <tag id="0">
                    <st name="^">^</st>
                </tag>
                <tag id="1">
                    <st name="$">$</st>
                </tag>
                <tag id="2">
                    <st name="^">^</st>
                </tag>
                <tag id="3">
                    <st name="$">$</st>
                </tag>
                <tag id="4">
                    <st name="^">^</st>
                </tag>
                <tag id="5">
                    <st name="$">$</st>
                </tag>
            </tag-defs>
        </header>
        <body>
            <trans-unit translate="no" id="08c58142-03fe-4aad-8bc6-64e45600e91c">
                <source>
                    <x id="0"/>
                </source>
            </trans-unit>
            <trans-unit id="038509df-7f97-4faa-867f-ec00a1290f62">
                <source>Ein Satz zu übersetzen</source>
                <seg-source>
                    <mrk mtype="seg" mid="1">Ein Satz zu übersetzen</mrk>
                </seg-source>
                <target>
                    <mrk mtype="seg" mid="1"/>
                </target>
                <sdl:seg-defs>
                    <sdl:seg id="1"/>
                </sdl:seg-defs>
            </trans-unit>
            <trans-unit translate="no" id="b3f5e43b-6bba-41e4-a9fd-b7e4077694cc">
                <source>
                    <x id="1"/>
                    <x id="2"/>
                </source>
            </trans-unit>
            <trans-unit id="4c7dcbe2-1ebe-4e56-bb9a-2fe647b12f1f">
                <source>Ein zweiter Satz zu übersetzen</source>
                <seg-source>
                    <mrk mtype="seg" mid="2">Ein zweiter Satz zu übersetzen</mrk>
                </seg-source>
                <target>
                    <mrk mtype="seg" mid="2"/>
                </target>
                <sdl:seg-defs>
                    <sdl:seg id="2"/>
                </sdl:seg-defs>
            </trans-unit>
            <trans-unit translate="no" id="0ca0c301-f5a2-44e8-8754-7618c98e14c6">
                <source>
                    <x id="3"/>
                    <x id="4"/>
                </source>
            </trans-unit>
            <trans-unit id="5b3973af-b0cf-4dcf-b66c-aea309389c2d">
                <source>Ein letzter weiterer Satz zu übersetzen</source>
                <seg-source>
                    <mrk mtype="seg" mid="3">Ein letzter weiterer Satz zu übersetzen</mrk>
                </seg-source>
                <target>
                    <mrk mtype="seg" mid="3"/>
                </target>
                <sdl:seg-defs>
                    <sdl:seg id="3"/>
                </sdl:seg-defs>
            </trans-unit>
            <trans-unit translate="no" id="1cced868-b401-45c5-be2b-ea1fede236c0">
                <source>
                    <x id="5"/>
                </source>
            </trans-unit>
        </body>
    </file>
</xliff>

这是我读取文件的代码,但我不知道如何处理源句段(source segment)中的标签,而且我想一定有更好的方法来替换开始标签:

    // Read the whole file selected through ofd_ToTranslate into one string
    // (ofd_ToTranslate is presumably an OpenFileDialog; it is declared outside this snippet).
    string fileContents = File.ReadAllText(ofd_ToTranslate.FileName);

    // Attempt to swap the XML declaration plus the <xliff ...> start tag for a plain <xml> start tag.
    // NOTE(review): the search literal spells the encoding as "utf - 8" (with spaces), while the sample
    // file uses "utf-8", so this Replace presumably never matches - confirm against the real file.
    fileContents = fileContents.Replace("<?xml version=\"1.0\" encoding=\"utf - 8\"?><xliff xmlns:sdl=\"http://sdl.com/FileTypes/SdlXliff/1.0\" xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\" sdl:version=\"1.0\">", "<xml>");
    // Swap the closing </xliff> tag for </xml> to pair with the replacement above.
    fileContents = fileContents.Replace("</xliff>", "</xml>");

    // Build a reader whose parser context already knows the "sdl" prefix (mapped to an empty namespace
    // URI), so prefixed names such as sdl:seg-defs can still be parsed once the xmlns:sdl declaration
    // on the root element has been removed.
    XmlReaderSettings settings = new XmlReaderSettings { NameTable = new NameTable() };
    XmlNamespaceManager xmlns = new XmlNamespaceManager(settings.NameTable);
    xmlns.AddNamespace("sdl", "");
    XmlParserContext context = new XmlParserContext(null, xmlns, "", XmlSpace.Default);
    XmlReader reader = XmlReader.Create(new StringReader(fileContents), settings, context);
    XmlDocument xmlDoc = new XmlDocument();

    // Parse the modified text into a DOM document.
    xmlDoc.Load(reader);

    // Collect every element whose tag name is "source" and every element whose tag name is "target".
    XmlNodeList sourceElements = xmlDoc.GetElementsByTagName("source");
    XmlNodeList targetElements = xmlDoc.GetElementsByTagName("target");

可以使用 XML 序列化(XmlSerializer)来读取该文件:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Serialization;

namespace ConsoleApp1
{
    /// <summary>
    /// Console entry point that deserializes an XLIFF document into the <see cref="Xliff"/> object model.
    /// </summary>
    class Program
    {
        // Path of the XML document that Main loads.
        const string FILENAME = @"c:\temp\test.xml";

        /// <summary>
        /// Opens <c>FILENAME</c> and deserializes its content into an <see cref="Xliff"/> instance.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            XmlSerializer serializer = new XmlSerializer(typeof(Xliff));

            // The reader owns the underlying file stream; the using block releases the
            // file handle even when deserialization throws.
            using (XmlReader reader = XmlReader.Create(FILENAME))
            {
                Xliff xliff = (Xliff)serializer.Deserialize(reader);
            }
        }
    }
    /// <summary>
    /// Root <c>xliff</c> element in the <c>urn:oasis:names:tc:xliff:document:1.2</c> namespace.
    /// </summary>
    [XmlRoot("xliff", Namespace = "urn:oasis:names:tc:xliff:document:1.2")]
    public class Xliff
    {
        /// <summary>The <c>file</c> child element.</summary>
        [XmlElement(ElementName = "file")]
        public File file { get; set; }
    }
    /// <summary>
    /// Maps the <c>file</c> element: two of its attributes plus the <c>header</c> and <c>body</c> children.
    /// </summary>
    public class File
    {
        /// <summary>Value of the <c>datatype</c> attribute.</summary>
        [XmlAttribute]
        public string datatype { get; set; }

        /// <summary>Value of the <c>source-language</c> attribute.</summary>
        [XmlAttribute(AttributeName = "source-language")]
        public string sourceLanguage { get; set; }

        /// <summary>The <c>header</c> child element.</summary>
        [XmlElement(ElementName = "header")]
        public Header header { get; set; }

        /// <summary>The <c>body</c> child element.</summary>
        [XmlElement(ElementName = "body")]
        public Body body { get; set; }
    }
    /// <summary>
    /// Maps the <c>header</c> element; all three children are bound in the
    /// <c>http://sdl.com/FileTypes/SdlXliff/1.0</c> namespace.
    /// </summary>
    public class Header
    {
        /// <summary>The <c>file-info</c> child element.</summary>
        [XmlElement("file-info", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public FileInfo fileInfo { get; set; }

        /// <summary>The <c>filetype-info</c> child element.</summary>
        [XmlElement("filetype-info", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public FileType fileType { get; set; }

        /// <summary>The <c>tag</c> items wrapped by the <c>tag-defs</c> element.</summary>
        [XmlArray("tag-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0"), XmlArrayItem("tag")]
        public List<Tag> tags { get; set; }
    }
    /// <summary>
    /// Maps the <c>file-info</c> element: a repeated list of <c>value</c> entries and one <c>sniff-info</c> child.
    /// </summary>
    public class FileInfo
    {
        /// <summary>All <c>value</c> child elements (not wrapped in a container element).</summary>
        [XmlElement("value", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public List<Value> values { get; set; }

        /// <summary>The <c>sniff-info</c> child element.</summary>
        [XmlElement("sniff-info")]
        public SniffInfo sniffInfo { get; set; }
    }
    /// <summary>
    /// Maps a <c>value</c> element: a <c>key</c> attribute plus the element's text content.
    /// </summary>
    public class Value
    {
        /// <summary>Value of the <c>key</c> attribute.</summary>
        [XmlAttribute]
        public string key { get; set; }

        /// <summary>Text content of the element.</summary>
        [XmlText()]
        public string value { get; set; }
    }
    /// <summary>
    /// Maps the <c>sniff-info</c> element: detected encoding, detected source language and the
    /// <c>value</c> entries wrapped by <c>props</c>.
    /// </summary>
    public class SniffInfo
    {
        /// <summary>The <c>detected-encoding</c> child element.</summary>
        [XmlElement("detected-encoding")]
        public Encoding encoding { get; set; }

        /// <summary>The <c>detected-source-lang</c> child element.</summary>
        [XmlElement("detected-source-lang")]
        public Source source { get; set; }

        /// <summary>The <c>value</c> items wrapped by the <c>props</c> element.</summary>
        [XmlArray(ElementName = "props"), XmlArrayItem(ElementName = "value")]
        public List<Value> values { get; set; }
    }
    /// <summary>
    /// Maps the <c>detected-encoding</c> element, which carries only attributes.
    /// </summary>
    public class Encoding
    {
        /// <summary>Value of the <c>detection-level</c> attribute.</summary>
        [XmlAttribute(AttributeName = "detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>Value of the <c>encoding</c> attribute.</summary>
        [XmlAttribute]
        public string encoding { get; set; }
    }
    /// <summary>
    /// Maps the <c>detected-source-lang</c> element, which carries only attributes.
    /// </summary>
    public class Source
    {
        /// <summary>Value of the <c>detection-level</c> attribute.</summary>
        [XmlAttribute(AttributeName = "detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>Value of the <c>lang</c> attribute.</summary>
        [XmlAttribute]
        public string lang { get; set; }
    }
    /// <summary>
    /// Maps the <c>filetype-info</c> element.
    /// </summary>
    public class FileType
    {
        /// <summary>Text of the <c>filetype-id</c> child element.</summary>
        [XmlElement("filetype-id")]
        public string id { get; set; }
    }
    /// <summary>
    /// Maps a <c>tag</c> element: a numeric <c>id</c> attribute and one <c>st</c> child.
    /// </summary>
    public class Tag
    {
        /// <summary>Value of the <c>id</c> attribute, parsed as an integer.</summary>
        [XmlAttribute(AttributeName = "id")]
        public int id { get; set; }

        /// <summary>The <c>st</c> child element.</summary>
        [XmlElement("st")]
        public St st { get; set; }
    }
    /// <summary>
    /// Maps an <c>st</c> element, e.g. <c>&lt;st name="^"&gt;^&lt;/st&gt;</c>:
    /// a <c>name</c> attribute plus the element's text content.
    /// </summary>
    public class St
    {
        /// <summary>Value of the <c>name</c> attribute.</summary>
        [XmlAttribute()]
        public string name { get; set; }

        /// <summary>
        /// Text content of the element. Marked with <see cref="XmlTextAttribute"/> so the inner text is
        /// bound to this property; without it the serializer maps the property to a child
        /// <c>value</c> element and the text is never read.
        /// </summary>
        [XmlText]
        public string value { get; set; }
    }
    /// <summary>
    /// Maps the <c>body</c> element, a container for repeated <c>trans-unit</c> children.
    /// </summary>
    public class Body
    {
        /// <summary>All <c>trans-unit</c> child elements (not wrapped in a container element).</summary>
        [XmlElement("trans-unit")]
        public List<TransUnit> transUnits { get; set; }
    }
    /// <summary>
    /// Maps a <c>trans-unit</c> element: its attributes, the <c>source</c> placeholders, the segmented
    /// source, the target and the segment definitions.
    /// </summary>
    public class TransUnit
    {
        /// <summary>Value of the <c>translate</c> attribute.</summary>
        [XmlAttribute]
        public string translate { get; set; }

        /// <summary>Value of the <c>id</c> attribute.</summary>
        [XmlAttribute]
        public string id { get; set; }

        /// <summary>
        /// The <c>x</c> items wrapped by the <c>source</c> element.
        /// NOTE(review): only <c>x</c> children are mapped here; if <c>source</c> holds plain text,
        /// that text is presumably not captured - confirm whether it is needed.
        /// </summary>
        [XmlArray(ElementName = "source"), XmlArrayItem(ElementName = "x")]
        public List<X> xs { get; set; }

        /// <summary>The <c>seg-source</c> child element.</summary>
        [XmlElement("seg-source")]
        public SegSource segSource { get; set; }

        /// <summary>The <c>target</c> child element (element name taken from the property name).</summary>
        public Target target { get; set; }

        /// <summary>
        /// The <c>seg-defs</c> child element in the <c>http://sdl.com/FileTypes/SdlXliff/1.0</c> namespace.
        /// </summary>
        [XmlElement("seg-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public SegDefs segDeg { get; set; }
    }
    /// <summary>
    /// Maps an <c>x</c> placeholder element.
    /// </summary>
    public class X
    {
        /// <summary>Value of the <c>id</c> attribute, parsed as an integer.</summary>
        [XmlAttribute(AttributeName = "id")]
        public int id { get; set; }
    }
    /// <summary>
    /// Maps the <c>seg-source</c> element.
    /// </summary>
    public class SegSource
    {
        /// <summary>The <c>mrk</c> child element.</summary>
        [XmlElement("mrk")]
        public Mrk mrk { get; set; }
    }
    /// <summary>
    /// Maps the <c>target</c> element.
    /// </summary>
    public class Target
    {
        /// <summary>The <c>mrk</c> child element.</summary>
        [XmlElement("mrk")]
        public Mrk mrk { get; set; }
    }
    /// <summary>
    /// Maps a <c>mrk</c> element: the <c>mtype</c> and <c>mid</c> attributes plus the element's text content.
    /// </summary>
    public class Mrk
    {
        /// <summary>Value of the <c>mtype</c> attribute.</summary>
        [XmlAttribute]
        public string mtype { get; set; }

        /// <summary>Value of the <c>mid</c> attribute.</summary>
        [XmlAttribute]
        public string mid { get; set; }

        /// <summary>Text content of the element.</summary>
        [XmlText()]
        public string value { get; set; }
    }
    /// <summary>
    /// Maps the <c>seg-defs</c> element.
    /// </summary>
    public class SegDefs
    {
        /// <summary>The <c>seg</c> child element.</summary>
        [XmlElement("seg")]
        public Seg seg { get; set; }
    }
    /// <summary>
    /// Maps a <c>seg</c> element.
    /// </summary>
    public class Seg
    {
        /// <summary>Value of the <c>id</c> attribute, parsed as an integer.</summary>
        [XmlAttribute(AttributeName = "id")]
        public int id { get; set; }
    }
}

您的 XML 完全没问题,但它有一个默认命名空间:

xmlns="urn:oasis:names:tc:xliff:document:1.2"

要访问其中的节点,您需要在查询时带上这个命名空间。

这是一个例子:

// The document's default namespace; element names must be qualified with it to match.
XNamespace xliffNs = "urn:oasis:names:tc:xliff:document:1.2";
var document = XDocument.Load(@"file.xml");

// Every trans-unit element below the root, at any depth.
var transUnits = document.Root?.Descendants(xliffNs + "trans-unit");

// Print how many trans-unit elements were found.
Console.WriteLine(transUnits.Count());

为我输出 7