如何读取、处理并写回非标准格式的 XML 内容
How do you read, process and write content of a non-standard formatted xml
我正在尝试用 C# 处理语言-XML-文件的内容以进行机器翻译。
<seg-source>
段的内容应翻译并写回 <target>
段。原文或译文句段内的标签格式应保持不变。
我的第一个问题是,由于开始和结束标记不是 <xml>
和 </xml>
,所以无法正确读取 xml 文件。用<xml>
-tag替换前两行文本是行不通的,因为原来的XML-file是全部写在一行中的(下面的例子是为了更好阅读而格式化的)
有没有一种简单的方法,可以把所有需要翻译的源信息复制到一个数组中,并在我处理完之后再写回去?
这是 XML-文件 (.sdlxliff) 的样子:
<?xml version="1.0" encoding="utf-8"?>
<xliff xmlns:sdl="http://sdl.com/FileTypes/SdlXliff/1.0" xmlns="urn:oasis:names:tc:xliff:document:1.2" version="1.2" sdl:version="1.0">
<file original="" datatype="x-sdlfilterframework2" source-language="de-DE" target-language="en-US">
<header>
<file-info xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
<value key="SDL:FileId">77260240-fccf-4e75-81e3-7a1ab00fe948</value>
<value key="SDL:CreationDate">03/18/2022 16:00:07</value>
<value key="SDL:OriginalFilePath"></value>
<value key="SDL:FileTypeDllVersion">1.8.2.0</value>
<value key="SDL:OriginalEncoding">utf-8</value>
<value key="SDL:AutoClonedFlagSupported">True</value>
<value key="HasUtf8Bom">False</value>
<value key="LineBreakType">
</value>
<value key="ParagraphTextDirections"/>
<sniff-info>
<detected-encoding detection-level="Likely" encoding="utf-8"/>
<detected-source-lang detection-level="Guess" lang="de-DE"/>
<props>
<value key="HasUtf8Bom">False</value>
<value key="LineBreakType">
</value>
</props>
</sniff-info>
</file-info>
<sdl:filetype-info>
<sdl:filetype-id>Plain Text v 1.0.0.0</sdl:filetype-id>
</sdl:filetype-info>
<tag-defs xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
<tag id="0">
<st name="^">^</st>
</tag>
<tag id="1">
<st name="$">$</st>
</tag>
<tag id="2">
<st name="^">^</st>
</tag>
<tag id="3">
<st name="$">$</st>
</tag>
<tag id="4">
<st name="^">^</st>
</tag>
<tag id="5">
<st name="$">$</st>
</tag>
</tag-defs>
</header>
<body>
<trans-unit translate="no" id="08c58142-03fe-4aad-8bc6-64e45600e91c">
<source>
<x id="0"/>
</source>
</trans-unit>
<trans-unit id="038509df-7f97-4faa-867f-ec00a1290f62">
<source>Ein Satz zu übersetzen</source>
<seg-source>
<mrk mtype="seg" mid="1">Ein Satz zu übersetzen</mrk>
</seg-source>
<target>
<mrk mtype="seg" mid="1"/>
</target>
<sdl:seg-defs>
<sdl:seg id="1"/>
</sdl:seg-defs>
</trans-unit>
<trans-unit translate="no" id="b3f5e43b-6bba-41e4-a9fd-b7e4077694cc">
<source>
<x id="1"/>
<x id="2"/>
</source>
</trans-unit>
<trans-unit id="4c7dcbe2-1ebe-4e56-bb9a-2fe647b12f1f">
<source>Ein zweiter Satz zu übersetzen</source>
<seg-source>
<mrk mtype="seg" mid="2">Ein zweiter Satz zu übersetzen</mrk>
</seg-source>
<target>
<mrk mtype="seg" mid="2"/>
</target>
<sdl:seg-defs>
<sdl:seg id="2"/>
</sdl:seg-defs>
</trans-unit>
<trans-unit translate="no" id="0ca0c301-f5a2-44e8-8754-7618c98e14c6">
<source>
<x id="3"/>
<x id="4"/>
</source>
</trans-unit>
<trans-unit id="5b3973af-b0cf-4dcf-b66c-aea309389c2d">
<source>Ein letzter weiterer Satz zu übersetzen</source>
<seg-source>
<mrk mtype="seg" mid="3">Ein letzter weiterer Satz zu übersetzen</mrk>
</seg-source>
<target>
<mrk mtype="seg" mid="3"/>
</target>
<sdl:seg-defs>
<sdl:seg id="3"/>
</sdl:seg-defs>
</trans-unit>
<trans-unit translate="no" id="1cced868-b401-45c5-be2b-ea1fede236c0">
<source>
<x id="5"/>
</source>
</trans-unit>
</body>
</file>
</xliff>
这是我读取文件的代码,但我不知道如何处理源句段中的标签,而且我想一定有更好的方法来替换起始标签:
// Read the whole .sdlxliff file into memory as one string.
string fileContents = File.ReadAllText(ofd_ToTranslate.FileName);
// Attempt to swap the XML declaration plus the <xliff ...> root start tag for a plain <xml> tag.
// NOTE(review): the search pattern spells the encoding as "utf - 8" (with spaces), while the sample
// file declares encoding="utf-8", so this Replace presumably never matches — confirm.
fileContents = fileContents.Replace("<?xml version=\"1.0\" encoding=\"utf - 8\"?><xliff xmlns:sdl=\"http://sdl.com/FileTypes/SdlXliff/1.0\" xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\" sdl:version=\"1.0\">", "<xml>");
// Rename the closing root tag to pair with the replaced start tag.
fileContents = fileContents.Replace("</xliff>", "</xml>");
// Reader settings with a fresh name table, shared with the namespace manager below.
XmlReaderSettings settings = new XmlReaderSettings { NameTable = new NameTable() };
XmlNamespaceManager xmlns = new XmlNamespaceManager(settings.NameTable);
// Register the "sdl" prefix with an empty namespace URI; presumably so that prefixed names such as
// sdl:seg-defs still resolve once the root's xmlns:sdl declaration has been replaced away.
xmlns.AddNamespace("sdl", "");
// Parser context that hands the namespace manager to the reader.
XmlParserContext context = new XmlParserContext(null, xmlns, "", XmlSpace.Default);
XmlReader reader = XmlReader.Create(new StringReader(fileContents), settings, context);
// Load the (modified) text into a DOM document.
XmlDocument xmlDoc = new XmlDocument();
xmlDoc.Load(reader);
// Collect the elements named "source" and "target" from the loaded document.
XmlNodeList sourceElements = xmlDoc.GetElementsByTagName("source");
XmlNodeList targetElements = xmlDoc.GetElementsByTagName("target");
可以使用xml序列化
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Serialization;
namespace ConsoleApp1
{
    /// <summary>
    /// Console entry point that deserializes an SDLXLIFF file into the <see cref="Xliff"/> object model.
    /// </summary>
    class Program
    {
        const string FILENAME = @"c:\temp\test.xml";

        /// <summary>Reads <see cref="FILENAME"/> and deserializes it with <see cref="XmlSerializer"/>.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            XmlSerializer serializer = new XmlSerializer(typeof(Xliff));

            // Dispose the reader so the underlying file handle is released
            // even when deserialization throws.
            using (XmlReader reader = XmlReader.Create(FILENAME))
            {
                Xliff xliff = (Xliff)serializer.Deserialize(reader);
            }
        }
    }

    /// <summary>Root <c>xliff</c> element in the XLIFF 1.2 namespace.</summary>
    [XmlRoot(ElementName = "xliff", Namespace = "urn:oasis:names:tc:xliff:document:1.2")]
    public class Xliff
    {
        /// <summary>The <c>file</c> child element.</summary>
        [XmlElement("file")]
        public File file { get; set; }
    }

    /// <summary>The <c>file</c> element: language attributes plus header and body.</summary>
    public class File
    {
        /// <summary>The <c>datatype</c> attribute.</summary>
        [XmlAttribute()]
        public string datatype { get; set; }

        /// <summary>The <c>source-language</c> attribute.</summary>
        [XmlAttribute("source-language")]
        public string sourceLanguage { get; set; }

        /// <summary>The <c>header</c> child element.</summary>
        [XmlElement("header")]
        public Header header { get; set; }

        /// <summary>The <c>body</c> child element.</summary>
        [XmlElement("body")]
        public Body body { get; set; }
    }

    /// <summary>The <c>header</c> element; its children live in the SDL namespace.</summary>
    public class Header
    {
        /// <summary>The SDL <c>file-info</c> element.</summary>
        [XmlElement(ElementName = "file-info", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public FileInfo fileInfo { get; set; }

        /// <summary>The SDL <c>filetype-info</c> element.</summary>
        [XmlElement(ElementName = "filetype-info", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public FileType fileType { get; set; }

        /// <summary>The <c>tag</c> entries wrapped by the SDL <c>tag-defs</c> element.</summary>
        [XmlArray(ElementName = "tag-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        [XmlArrayItem(ElementName = "tag")]
        public List<Tag> tags { get; set; }
    }

    /// <summary>The <c>file-info</c> element: keyed values plus sniffing results.</summary>
    public class FileInfo
    {
        /// <summary>The repeated <c>value</c> child elements.</summary>
        [XmlElement(ElementName = "value", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public List<Value> values { get; set; }

        /// <summary>The <c>sniff-info</c> child element.</summary>
        [XmlElement(ElementName = "sniff-info")]
        public SniffInfo sniffInfo { get; set; }
    }

    /// <summary>A <c>value</c> element: a <c>key</c> attribute and its text content.</summary>
    public class Value
    {
        /// <summary>The <c>key</c> attribute.</summary>
        [XmlAttribute()]
        public string key { get; set; }

        /// <summary>The element's text content.</summary>
        [XmlText]
        public string value { get; set; }
    }

    /// <summary>The <c>sniff-info</c> element.</summary>
    public class SniffInfo
    {
        /// <summary>The <c>detected-encoding</c> child element.</summary>
        [XmlElement(ElementName = "detected-encoding")]
        public Encoding encoding { get; set; }

        /// <summary>The <c>detected-source-lang</c> child element.</summary>
        [XmlElement(ElementName = "detected-source-lang")]
        public Source source { get; set; }

        /// <summary>The <c>value</c> entries wrapped by the <c>props</c> element.</summary>
        [XmlArray("props")]
        [XmlArrayItem("value")]
        public List<Value> values { get; set; }
    }

    /// <summary>The <c>detected-encoding</c> element.</summary>
    public class Encoding
    {
        /// <summary>The <c>detection-level</c> attribute.</summary>
        [XmlAttribute("detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>The <c>encoding</c> attribute.</summary>
        [XmlAttribute()]
        public string encoding { get; set; }
    }

    /// <summary>The <c>detected-source-lang</c> element.</summary>
    public class Source
    {
        /// <summary>The <c>detection-level</c> attribute.</summary>
        [XmlAttribute("detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>The <c>lang</c> attribute.</summary>
        [XmlAttribute()]
        public string lang { get; set; }
    }

    /// <summary>The <c>filetype-info</c> element.</summary>
    public class FileType
    {
        /// <summary>Text of the <c>filetype-id</c> child element.</summary>
        [XmlElement(ElementName = "filetype-id")]
        public string id { get; set; }
    }

    /// <summary>A <c>tag</c> definition inside <c>tag-defs</c>.</summary>
    public class Tag
    {
        /// <summary>The numeric <c>id</c> attribute.</summary>
        [XmlAttribute("id")]
        public int id { get; set; }

        /// <summary>The <c>st</c> child element.</summary>
        [XmlElement(ElementName = "st")]
        public St st { get; set; }
    }

    /// <summary>An <c>st</c> element: a <c>name</c> attribute and its text content.</summary>
    public class St
    {
        /// <summary>The <c>name</c> attribute.</summary>
        [XmlAttribute()]
        public string name { get; set; }

        /// <summary>
        /// The element's text content (e.g. the <c>^</c> in <c>&lt;st name="^"&gt;^&lt;/st&gt;</c>).
        /// Marked as text so it is read from the element body instead of a child element named <c>value</c>.
        /// </summary>
        [XmlText]
        public string value { get; set; }
    }

    /// <summary>The <c>body</c> element holding all translation units.</summary>
    public class Body
    {
        /// <summary>The repeated <c>trans-unit</c> child elements.</summary>
        [XmlElement(ElementName = "trans-unit")]
        public List<TransUnit> transUnits { get; set; }
    }

    /// <summary>A <c>trans-unit</c> element.</summary>
    public class TransUnit
    {
        /// <summary>The optional <c>translate</c> attribute (the sample uses <c>"no"</c>).</summary>
        [XmlAttribute()]
        public string translate { get; set; }

        /// <summary>The <c>id</c> attribute.</summary>
        [XmlAttribute()]
        public string id { get; set; }

        /// <summary>
        /// The <c>x</c> placeholder elements wrapped by <c>source</c>.
        /// NOTE(review): plain text directly inside <c>source</c> does not appear to be captured
        /// by this array mapping — confirm before relying on it for the source sentence.
        /// </summary>
        [XmlArray("source")]
        [XmlArrayItem("x")]
        public List<X> xs { get; set; }

        /// <summary>The <c>seg-source</c> child element.</summary>
        [XmlElement(ElementName = "seg-source")]
        public SegSource segSource { get; set; }

        /// <summary>The <c>target</c> child element (mapped by property name).</summary>
        public Target target { get; set; }

        /// <summary>The SDL <c>seg-defs</c> child element.</summary>
        [XmlElement(ElementName = "seg-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public SegDefs segDeg { get; set; }
    }

    /// <summary>An <c>x</c> placeholder element.</summary>
    public class X
    {
        /// <summary>The numeric <c>id</c> attribute.</summary>
        [XmlAttribute("id")]
        public int id { get; set; }
    }

    /// <summary>The <c>seg-source</c> element.</summary>
    public class SegSource
    {
        /// <summary>The <c>mrk</c> child element (mapped by property name).</summary>
        public Mrk mrk { get; set; }
    }

    /// <summary>The <c>target</c> element.</summary>
    public class Target
    {
        /// <summary>The <c>mrk</c> child element (mapped by property name).</summary>
        public Mrk mrk { get; set; }
    }

    /// <summary>A <c>mrk</c> segment marker: type and id attributes plus the segment text.</summary>
    public class Mrk
    {
        /// <summary>The <c>mtype</c> attribute.</summary>
        [XmlAttribute()]
        public string mtype { get; set; }

        /// <summary>The <c>mid</c> attribute.</summary>
        [XmlAttribute()]
        public string mid { get; set; }

        /// <summary>The element's text content.</summary>
        [XmlText]
        public string value { get; set; }
    }

    /// <summary>The <c>seg-defs</c> element.</summary>
    public class SegDefs
    {
        /// <summary>The <c>seg</c> child element (mapped by property name).</summary>
        public Seg seg { get; set; }
    }

    /// <summary>A <c>seg</c> element.</summary>
    public class Seg
    {
        /// <summary>The numeric <c>id</c> attribute.</summary>
        [XmlAttribute()]
        public int id { get; set; }
    }
}
您的 XML 完全没问题,但它有一个默认命名空间:
xmlns="urn:oasis:names:tc:xliff:document:1.2"
要访问这些节点,您需要使用该命名空间。
这是一个例子:
// Count every <trans-unit> element that lives in the XLIFF default namespace.
XNamespace xliffNamespace = "urn:oasis:names:tc:xliff:document:1.2";
var document = XDocument.Load(@"file.xml");
var transUnits = document.Root?.Descendants(xliffNamespace + "trans-unit");
var transUnitCount = transUnits.Count();
Console.WriteLine(transUnitCount);
为我输出 7
。
我正在尝试用 C# 处理语言-XML-文件的内容以进行机器翻译。
<seg-source>
段的内容应翻译并写回 <target>
段。原文或译文句段内的标签格式应保持不变。
我的第一个问题是,由于开始和结束标记不是 <xml>
和 </xml>
,所以无法正确读取 xml 文件。用<xml>
-tag替换前两行文本是行不通的,因为原来的XML-file是全部写在一行中的(下面的例子是为了更好阅读而格式化的)
有没有一种简单的方法,可以把所有需要翻译的源信息复制到一个数组中,并在我处理完之后再写回去?
这是 XML-文件 (.sdlxliff) 的样子:
<?xml version="1.0" encoding="utf-8"?>
<xliff xmlns:sdl="http://sdl.com/FileTypes/SdlXliff/1.0" xmlns="urn:oasis:names:tc:xliff:document:1.2" version="1.2" sdl:version="1.0">
<file original="" datatype="x-sdlfilterframework2" source-language="de-DE" target-language="en-US">
<header>
<file-info xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
<value key="SDL:FileId">77260240-fccf-4e75-81e3-7a1ab00fe948</value>
<value key="SDL:CreationDate">03/18/2022 16:00:07</value>
<value key="SDL:OriginalFilePath"></value>
<value key="SDL:FileTypeDllVersion">1.8.2.0</value>
<value key="SDL:OriginalEncoding">utf-8</value>
<value key="SDL:AutoClonedFlagSupported">True</value>
<value key="HasUtf8Bom">False</value>
<value key="LineBreakType">
</value>
<value key="ParagraphTextDirections"/>
<sniff-info>
<detected-encoding detection-level="Likely" encoding="utf-8"/>
<detected-source-lang detection-level="Guess" lang="de-DE"/>
<props>
<value key="HasUtf8Bom">False</value>
<value key="LineBreakType">
</value>
</props>
</sniff-info>
</file-info>
<sdl:filetype-info>
<sdl:filetype-id>Plain Text v 1.0.0.0</sdl:filetype-id>
</sdl:filetype-info>
<tag-defs xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
<tag id="0">
<st name="^">^</st>
</tag>
<tag id="1">
<st name="$">$</st>
</tag>
<tag id="2">
<st name="^">^</st>
</tag>
<tag id="3">
<st name="$">$</st>
</tag>
<tag id="4">
<st name="^">^</st>
</tag>
<tag id="5">
<st name="$">$</st>
</tag>
</tag-defs>
</header>
<body>
<trans-unit translate="no" id="08c58142-03fe-4aad-8bc6-64e45600e91c">
<source>
<x id="0"/>
</source>
</trans-unit>
<trans-unit id="038509df-7f97-4faa-867f-ec00a1290f62">
<source>Ein Satz zu übersetzen</source>
<seg-source>
<mrk mtype="seg" mid="1">Ein Satz zu übersetzen</mrk>
</seg-source>
<target>
<mrk mtype="seg" mid="1"/>
</target>
<sdl:seg-defs>
<sdl:seg id="1"/>
</sdl:seg-defs>
</trans-unit>
<trans-unit translate="no" id="b3f5e43b-6bba-41e4-a9fd-b7e4077694cc">
<source>
<x id="1"/>
<x id="2"/>
</source>
</trans-unit>
<trans-unit id="4c7dcbe2-1ebe-4e56-bb9a-2fe647b12f1f">
<source>Ein zweiter Satz zu übersetzen</source>
<seg-source>
<mrk mtype="seg" mid="2">Ein zweiter Satz zu übersetzen</mrk>
</seg-source>
<target>
<mrk mtype="seg" mid="2"/>
</target>
<sdl:seg-defs>
<sdl:seg id="2"/>
</sdl:seg-defs>
</trans-unit>
<trans-unit translate="no" id="0ca0c301-f5a2-44e8-8754-7618c98e14c6">
<source>
<x id="3"/>
<x id="4"/>
</source>
</trans-unit>
<trans-unit id="5b3973af-b0cf-4dcf-b66c-aea309389c2d">
<source>Ein letzter weiterer Satz zu übersetzen</source>
<seg-source>
<mrk mtype="seg" mid="3">Ein letzter weiterer Satz zu übersetzen</mrk>
</seg-source>
<target>
<mrk mtype="seg" mid="3"/>
</target>
<sdl:seg-defs>
<sdl:seg id="3"/>
</sdl:seg-defs>
</trans-unit>
<trans-unit translate="no" id="1cced868-b401-45c5-be2b-ea1fede236c0">
<source>
<x id="5"/>
</source>
</trans-unit>
</body>
</file>
</xliff>
这是我读取文件的代码,但我不知道如何处理源句段中的标签,而且我想一定有更好的方法来替换起始标签:
// Read the whole .sdlxliff file into memory as one string.
string fileContents = File.ReadAllText(ofd_ToTranslate.FileName);
// Attempt to swap the XML declaration plus the <xliff ...> root start tag for a plain <xml> tag.
// NOTE(review): the search pattern spells the encoding as "utf - 8" (with spaces), while the sample
// file declares encoding="utf-8", so this Replace presumably never matches — confirm.
fileContents = fileContents.Replace("<?xml version=\"1.0\" encoding=\"utf - 8\"?><xliff xmlns:sdl=\"http://sdl.com/FileTypes/SdlXliff/1.0\" xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\" sdl:version=\"1.0\">", "<xml>");
// Rename the closing root tag to pair with the replaced start tag.
fileContents = fileContents.Replace("</xliff>", "</xml>");
// Reader settings with a fresh name table, shared with the namespace manager below.
XmlReaderSettings settings = new XmlReaderSettings { NameTable = new NameTable() };
XmlNamespaceManager xmlns = new XmlNamespaceManager(settings.NameTable);
// Register the "sdl" prefix with an empty namespace URI; presumably so that prefixed names such as
// sdl:seg-defs still resolve once the root's xmlns:sdl declaration has been replaced away.
xmlns.AddNamespace("sdl", "");
// Parser context that hands the namespace manager to the reader.
XmlParserContext context = new XmlParserContext(null, xmlns, "", XmlSpace.Default);
XmlReader reader = XmlReader.Create(new StringReader(fileContents), settings, context);
// Load the (modified) text into a DOM document.
XmlDocument xmlDoc = new XmlDocument();
xmlDoc.Load(reader);
// Collect the elements named "source" and "target" from the loaded document.
XmlNodeList sourceElements = xmlDoc.GetElementsByTagName("source");
XmlNodeList targetElements = xmlDoc.GetElementsByTagName("target");
可以使用xml序列化
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Serialization;
namespace ConsoleApp1
{
    /// <summary>
    /// Console entry point that deserializes an SDLXLIFF file into the <see cref="Xliff"/> object model.
    /// </summary>
    class Program
    {
        const string FILENAME = @"c:\temp\test.xml";

        /// <summary>Reads <see cref="FILENAME"/> and deserializes it with <see cref="XmlSerializer"/>.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            XmlSerializer serializer = new XmlSerializer(typeof(Xliff));

            // Dispose the reader so the underlying file handle is released
            // even when deserialization throws.
            using (XmlReader reader = XmlReader.Create(FILENAME))
            {
                Xliff xliff = (Xliff)serializer.Deserialize(reader);
            }
        }
    }

    /// <summary>Root <c>xliff</c> element in the XLIFF 1.2 namespace.</summary>
    [XmlRoot(ElementName = "xliff", Namespace = "urn:oasis:names:tc:xliff:document:1.2")]
    public class Xliff
    {
        /// <summary>The <c>file</c> child element.</summary>
        [XmlElement("file")]
        public File file { get; set; }
    }

    /// <summary>The <c>file</c> element: language attributes plus header and body.</summary>
    public class File
    {
        /// <summary>The <c>datatype</c> attribute.</summary>
        [XmlAttribute()]
        public string datatype { get; set; }

        /// <summary>The <c>source-language</c> attribute.</summary>
        [XmlAttribute("source-language")]
        public string sourceLanguage { get; set; }

        /// <summary>The <c>header</c> child element.</summary>
        [XmlElement("header")]
        public Header header { get; set; }

        /// <summary>The <c>body</c> child element.</summary>
        [XmlElement("body")]
        public Body body { get; set; }
    }

    /// <summary>The <c>header</c> element; its children live in the SDL namespace.</summary>
    public class Header
    {
        /// <summary>The SDL <c>file-info</c> element.</summary>
        [XmlElement(ElementName = "file-info", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public FileInfo fileInfo { get; set; }

        /// <summary>The SDL <c>filetype-info</c> element.</summary>
        [XmlElement(ElementName = "filetype-info", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public FileType fileType { get; set; }

        /// <summary>The <c>tag</c> entries wrapped by the SDL <c>tag-defs</c> element.</summary>
        [XmlArray(ElementName = "tag-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        [XmlArrayItem(ElementName = "tag")]
        public List<Tag> tags { get; set; }
    }

    /// <summary>The <c>file-info</c> element: keyed values plus sniffing results.</summary>
    public class FileInfo
    {
        /// <summary>The repeated <c>value</c> child elements.</summary>
        [XmlElement(ElementName = "value", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public List<Value> values { get; set; }

        /// <summary>The <c>sniff-info</c> child element.</summary>
        [XmlElement(ElementName = "sniff-info")]
        public SniffInfo sniffInfo { get; set; }
    }

    /// <summary>A <c>value</c> element: a <c>key</c> attribute and its text content.</summary>
    public class Value
    {
        /// <summary>The <c>key</c> attribute.</summary>
        [XmlAttribute()]
        public string key { get; set; }

        /// <summary>The element's text content.</summary>
        [XmlText]
        public string value { get; set; }
    }

    /// <summary>The <c>sniff-info</c> element.</summary>
    public class SniffInfo
    {
        /// <summary>The <c>detected-encoding</c> child element.</summary>
        [XmlElement(ElementName = "detected-encoding")]
        public Encoding encoding { get; set; }

        /// <summary>The <c>detected-source-lang</c> child element.</summary>
        [XmlElement(ElementName = "detected-source-lang")]
        public Source source { get; set; }

        /// <summary>The <c>value</c> entries wrapped by the <c>props</c> element.</summary>
        [XmlArray("props")]
        [XmlArrayItem("value")]
        public List<Value> values { get; set; }
    }

    /// <summary>The <c>detected-encoding</c> element.</summary>
    public class Encoding
    {
        /// <summary>The <c>detection-level</c> attribute.</summary>
        [XmlAttribute("detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>The <c>encoding</c> attribute.</summary>
        [XmlAttribute()]
        public string encoding { get; set; }
    }

    /// <summary>The <c>detected-source-lang</c> element.</summary>
    public class Source
    {
        /// <summary>The <c>detection-level</c> attribute.</summary>
        [XmlAttribute("detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>The <c>lang</c> attribute.</summary>
        [XmlAttribute()]
        public string lang { get; set; }
    }

    /// <summary>The <c>filetype-info</c> element.</summary>
    public class FileType
    {
        /// <summary>Text of the <c>filetype-id</c> child element.</summary>
        [XmlElement(ElementName = "filetype-id")]
        public string id { get; set; }
    }

    /// <summary>A <c>tag</c> definition inside <c>tag-defs</c>.</summary>
    public class Tag
    {
        /// <summary>The numeric <c>id</c> attribute.</summary>
        [XmlAttribute("id")]
        public int id { get; set; }

        /// <summary>The <c>st</c> child element.</summary>
        [XmlElement(ElementName = "st")]
        public St st { get; set; }
    }

    /// <summary>An <c>st</c> element: a <c>name</c> attribute and its text content.</summary>
    public class St
    {
        /// <summary>The <c>name</c> attribute.</summary>
        [XmlAttribute()]
        public string name { get; set; }

        /// <summary>
        /// The element's text content (e.g. the <c>^</c> in <c>&lt;st name="^"&gt;^&lt;/st&gt;</c>).
        /// Marked as text so it is read from the element body instead of a child element named <c>value</c>.
        /// </summary>
        [XmlText]
        public string value { get; set; }
    }

    /// <summary>The <c>body</c> element holding all translation units.</summary>
    public class Body
    {
        /// <summary>The repeated <c>trans-unit</c> child elements.</summary>
        [XmlElement(ElementName = "trans-unit")]
        public List<TransUnit> transUnits { get; set; }
    }

    /// <summary>A <c>trans-unit</c> element.</summary>
    public class TransUnit
    {
        /// <summary>The optional <c>translate</c> attribute (the sample uses <c>"no"</c>).</summary>
        [XmlAttribute()]
        public string translate { get; set; }

        /// <summary>The <c>id</c> attribute.</summary>
        [XmlAttribute()]
        public string id { get; set; }

        /// <summary>
        /// The <c>x</c> placeholder elements wrapped by <c>source</c>.
        /// NOTE(review): plain text directly inside <c>source</c> does not appear to be captured
        /// by this array mapping — confirm before relying on it for the source sentence.
        /// </summary>
        [XmlArray("source")]
        [XmlArrayItem("x")]
        public List<X> xs { get; set; }

        /// <summary>The <c>seg-source</c> child element.</summary>
        [XmlElement(ElementName = "seg-source")]
        public SegSource segSource { get; set; }

        /// <summary>The <c>target</c> child element (mapped by property name).</summary>
        public Target target { get; set; }

        /// <summary>The SDL <c>seg-defs</c> child element.</summary>
        [XmlElement(ElementName = "seg-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public SegDefs segDeg { get; set; }
    }

    /// <summary>An <c>x</c> placeholder element.</summary>
    public class X
    {
        /// <summary>The numeric <c>id</c> attribute.</summary>
        [XmlAttribute("id")]
        public int id { get; set; }
    }

    /// <summary>The <c>seg-source</c> element.</summary>
    public class SegSource
    {
        /// <summary>The <c>mrk</c> child element (mapped by property name).</summary>
        public Mrk mrk { get; set; }
    }

    /// <summary>The <c>target</c> element.</summary>
    public class Target
    {
        /// <summary>The <c>mrk</c> child element (mapped by property name).</summary>
        public Mrk mrk { get; set; }
    }

    /// <summary>A <c>mrk</c> segment marker: type and id attributes plus the segment text.</summary>
    public class Mrk
    {
        /// <summary>The <c>mtype</c> attribute.</summary>
        [XmlAttribute()]
        public string mtype { get; set; }

        /// <summary>The <c>mid</c> attribute.</summary>
        [XmlAttribute()]
        public string mid { get; set; }

        /// <summary>The element's text content.</summary>
        [XmlText]
        public string value { get; set; }
    }

    /// <summary>The <c>seg-defs</c> element.</summary>
    public class SegDefs
    {
        /// <summary>The <c>seg</c> child element (mapped by property name).</summary>
        public Seg seg { get; set; }
    }

    /// <summary>A <c>seg</c> element.</summary>
    public class Seg
    {
        /// <summary>The numeric <c>id</c> attribute.</summary>
        [XmlAttribute()]
        public int id { get; set; }
    }
}
您的 XML 完全没问题,但它有一个默认命名空间:
xmlns="urn:oasis:names:tc:xliff:document:1.2"
要访问这些节点,您需要使用该命名空间。
这是一个例子:
// Count every <trans-unit> element that lives in the XLIFF default namespace.
XNamespace xliffNamespace = "urn:oasis:names:tc:xliff:document:1.2";
var document = XDocument.Load(@"file.xml");
var transUnits = document.Root?.Descendants(xliffNamespace + "trans-unit");
var transUnitCount = transUnits.Count();
Console.WriteLine(transUnitCount);
为我输出 7
。