如何使用 XmlSerializer 反序列化包含 \u0000 的 XML 文件?
How to deserialize an XML file containing an \u0000 using XmlSerializer?
我对 XmlSerializer 有疑问。在我巨大的 XML 文件中,有一些空字符 (\u0000),所以 XmlSerializer (Deserializer) 给我一个错误。我发现我需要将 Normalization 设置为 false(通过:https://msdn.microsoft.com/en-us/library/aa302290.aspx),所以我尝试了这个:
XmlSerializer deserializer = new XmlSerializer(typeof(T));
// Dispose the reader (and the file it opened) even when Deserialize throws.
using (XmlTextReader reader = new XmlTextReader(filename))
{
    // First attempted workaround: switch normalization off before deserializing.
    reader.Normalization = false;
    return (T)deserializer.Deserialize(reader);
}
我也尝试了第二种可能性,当我使用 XmlReader 时,因为 MSDN 也建议我尝试将 CheckCharacters 设置为 false,如下所示:
// Second attempted workaround: create the reader with character checking disabled.
var settings = new XmlReaderSettings();
settings.CheckCharacters = false;
var deserializer = new XmlSerializer(typeof(T));
using (var reader = XmlReader.Create(filename, settings))
{
    object result = deserializer.Deserialize(reader);
    return (T)result;
}
但是两种方案的结果相同:都抛出 InvalidOperationException,异常指向的行和列正是 XML 中空字符所在的位置。
你能给我一些建议吗?我需要把 XML 结构加载("load")到我定义的类中。对于不包含这些字符的行,它工作正常。
谢谢! :)
编辑: 我忘了说,我试过把文件内容加载到一个字符串中再修改该字符串,但是文件内容太大,所以我得到了 System.OutOfMemoryException。如果我尝试逐行解析文件,又会太慢。 :(
您可以下沉到 reader 这一层:编写一个 TextReader 的子类来执行清理,然后把它传给 XmlSerializer。
var deserializer = new XmlSerializer(typeof(T));
T instance;
// 'reader' is the TextReader over the raw XML; it is not defined in this snippet
// (presumably a StreamReader opened on the file).
using (var cleanupTextReader = new CleanupTextReader(reader))
{
    // Deserialize returns object, so the result has to be cast to T.
    instance = (T)deserializer.Deserialize(cleanupTextReader);
}
其中 CleanupTextReader 类似于:
/// <summary>
/// A <see cref="TextReader"/> wrapper that removes every NUL character (U+0000)
/// from the text produced by the wrapped reader, so the result can be handed to
/// consumers that reject that character.
/// </summary>
internal sealed class CleanupTextReader : TextReader
{
    private const char NullChar = '\u0000';

    private readonly TextReader _in;

    /// <summary>Wraps <paramref name="t"/>; this instance disposes it when disposed.</summary>
    /// <param name="t">The reader whose output is filtered.</param>
    /// <exception cref="ArgumentNullException"><paramref name="t"/> is null.</exception>
    internal CleanupTextReader(TextReader t)
    {
        if (t == null)
        {
            throw new ArgumentNullException("t");
        }

        _in = t;
    }

    /// <summary>Closes the wrapped reader.</summary>
    public override void Close()
    {
        _in.Close();
    }

    /// <summary>Disposes the wrapped reader when called from <c>Dispose()</c>.</summary>
    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            _in.Dispose();
        }

        base.Dispose(disposing);
    }

    /// <summary>
    /// Returns the next non-NUL character without consuming it, or -1 when the
    /// wrapped reader reports that nothing is available.
    /// </summary>
    public override int Peek()
    {
        int next = _in.Peek();

        // Discard pending NULs so that Peek never reports a character Read would skip.
        while (next == NullChar)
        {
            _in.Read();
            next = _in.Peek();
        }

        return next;
    }

    /// <summary>Reads the next non-NUL character, or returns -1 at end of input.</summary>
    public override int Read()
    {
        int result;
        do
        {
            result = _in.Read();
        }
        while (result == NullChar);

        return result;
    }

    /// <summary>Returns <paramref name="value"/> with all NUL characters removed.</summary>
    private static string CleanupString(string value)
    {
        // IndexOf(char) is used here; "new char['\u0000']" would allocate an EMPTY
        // array (the char is taken as the array length), making the check always fail.
        if (string.IsNullOrEmpty(value) || value.IndexOf(NullChar) < 0)
        {
            return value;
        }

        var builder = new StringBuilder(value.Length);
        foreach (var ch in value)
        {
            if (ch != NullChar)
            {
                builder.Append(ch);
            }
        }

        return builder.ToString();
    }

    /// <summary>
    /// Compacts <paramref name="count"/> characters starting at <paramref name="index"/>
    /// in place, dropping NULs, and returns how many characters remain.
    /// </summary>
    private static int CleanupBuffer(char[] buffer, int index, int count)
    {
        int writeIndex = index;
        int end = index + count;

        for (int readIndex = index; readIndex < end; readIndex++)
        {
            char ch = buffer[readIndex];
            if (ch != NullChar)
            {
                buffer[writeIndex] = ch;
                writeIndex++;
            }
        }

        return writeIndex - index;
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> non-NUL characters into
    /// <paramref name="buffer"/>. Returns the number of characters stored, or the
    /// wrapped reader's non-positive result when it has nothing more to give.
    /// </summary>
    public override int Read(char[] buffer, int index, int count)
    {
        while (true)
        {
            int reallyRead = _in.Read(buffer, index, count);
            if (reallyRead <= 0)
            {
                return reallyRead;
            }

            // A chunk consisting only of NULs must not be reported as "0 read",
            // so keep reading until something survives or the input ends.
            int cleanRead = CleanupBuffer(buffer, index, reallyRead);
            if (cleanRead != 0)
            {
                return cleanRead;
            }
        }
    }

    /// <summary>
    /// Reads non-NUL characters until <paramref name="count"/> characters have been
    /// stored or the input ends, and returns the number stored.
    /// </summary>
    public override int ReadBlock(char[] buffer, int index, int count)
    {
        int total = 0;
        int read;

        // Removing NULs shrinks each chunk, so top the buffer up until it is full
        // or the wrapped reader is exhausted.
        do
        {
            read = Read(buffer, index + total, count - total);
            if (read > 0)
            {
                total += read;
            }
        }
        while (read > 0 && total < count);

        return total;
    }

    /// <summary>Reads one line from the wrapped reader with NULs removed; null at end of input.</summary>
    public override string ReadLine()
    {
        return CleanupString(_in.ReadLine());
    }

    /// <summary>Reads the remaining text from the wrapped reader with NULs removed.</summary>
    public override string ReadToEnd()
    {
        return CleanupString(_in.ReadToEnd());
    }
}
我对 XmlSerializer 有疑问。在我巨大的 XML 文件中,有一些空字符 (\u0000),所以 XmlSerializer (Deserializer) 给我一个错误。我发现我需要将 Normalization 设置为 false(通过:https://msdn.microsoft.com/en-us/library/aa302290.aspx),所以我尝试了这个:
XmlSerializer deserializer = new XmlSerializer(typeof(T));
// Dispose the reader (and the file it opened) even when Deserialize throws.
using (XmlTextReader reader = new XmlTextReader(filename))
{
    // First attempted workaround: switch normalization off before deserializing.
    reader.Normalization = false;
    return (T)deserializer.Deserialize(reader);
}
我也尝试了第二种可能性,当我使用 XmlReader 时,因为 MSDN 也建议我尝试将 CheckCharacters 设置为 false,如下所示:
// Second attempted workaround: create the reader with character checking disabled.
var settings = new XmlReaderSettings();
settings.CheckCharacters = false;
var deserializer = new XmlSerializer(typeof(T));
using (var reader = XmlReader.Create(filename, settings))
{
    object result = deserializer.Deserialize(reader);
    return (T)result;
}
但是两种方案的结果相同:都抛出 InvalidOperationException,异常指向的行和列正是 XML 中空字符所在的位置。
你能给我一些建议吗?我需要把 XML 结构加载("load")到我定义的类中。对于不包含这些字符的行,它工作正常。
谢谢! :)
编辑: 我忘了说,我试过把文件内容加载到一个字符串中再修改该字符串,但是文件内容太大,所以我得到了 System.OutOfMemoryException。如果我尝试逐行解析文件,又会太慢。 :(
您可以下沉到 reader 这一层:编写一个 TextReader 的子类来执行清理,然后把它传给 XmlSerializer。
var deserializer = new XmlSerializer(typeof(T));
T instance;
// 'reader' is the TextReader over the raw XML; it is not defined in this snippet
// (presumably a StreamReader opened on the file).
using (var cleanupTextReader = new CleanupTextReader(reader))
{
    // Deserialize returns object, so the result has to be cast to T.
    instance = (T)deserializer.Deserialize(cleanupTextReader);
}
其中 CleanupTextReader 类似于:
/// <summary>
/// A <see cref="TextReader"/> wrapper that removes every NUL character (U+0000)
/// from the text produced by the wrapped reader, so the result can be handed to
/// consumers that reject that character.
/// </summary>
internal sealed class CleanupTextReader : TextReader
{
    private const char NullChar = '\u0000';

    private readonly TextReader _in;

    /// <summary>Wraps <paramref name="t"/>; this instance disposes it when disposed.</summary>
    /// <param name="t">The reader whose output is filtered.</param>
    /// <exception cref="ArgumentNullException"><paramref name="t"/> is null.</exception>
    internal CleanupTextReader(TextReader t)
    {
        if (t == null)
        {
            throw new ArgumentNullException("t");
        }

        _in = t;
    }

    /// <summary>Closes the wrapped reader.</summary>
    public override void Close()
    {
        _in.Close();
    }

    /// <summary>Disposes the wrapped reader when called from <c>Dispose()</c>.</summary>
    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            _in.Dispose();
        }

        base.Dispose(disposing);
    }

    /// <summary>
    /// Returns the next non-NUL character without consuming it, or -1 when the
    /// wrapped reader reports that nothing is available.
    /// </summary>
    public override int Peek()
    {
        int next = _in.Peek();

        // Discard pending NULs so that Peek never reports a character Read would skip.
        while (next == NullChar)
        {
            _in.Read();
            next = _in.Peek();
        }

        return next;
    }

    /// <summary>Reads the next non-NUL character, or returns -1 at end of input.</summary>
    public override int Read()
    {
        int result;
        do
        {
            result = _in.Read();
        }
        while (result == NullChar);

        return result;
    }

    /// <summary>Returns <paramref name="value"/> with all NUL characters removed.</summary>
    private static string CleanupString(string value)
    {
        // IndexOf(char) is used here; "new char['\u0000']" would allocate an EMPTY
        // array (the char is taken as the array length), making the check always fail.
        if (string.IsNullOrEmpty(value) || value.IndexOf(NullChar) < 0)
        {
            return value;
        }

        var builder = new StringBuilder(value.Length);
        foreach (var ch in value)
        {
            if (ch != NullChar)
            {
                builder.Append(ch);
            }
        }

        return builder.ToString();
    }

    /// <summary>
    /// Compacts <paramref name="count"/> characters starting at <paramref name="index"/>
    /// in place, dropping NULs, and returns how many characters remain.
    /// </summary>
    private static int CleanupBuffer(char[] buffer, int index, int count)
    {
        int writeIndex = index;
        int end = index + count;

        for (int readIndex = index; readIndex < end; readIndex++)
        {
            char ch = buffer[readIndex];
            if (ch != NullChar)
            {
                buffer[writeIndex] = ch;
                writeIndex++;
            }
        }

        return writeIndex - index;
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> non-NUL characters into
    /// <paramref name="buffer"/>. Returns the number of characters stored, or the
    /// wrapped reader's non-positive result when it has nothing more to give.
    /// </summary>
    public override int Read(char[] buffer, int index, int count)
    {
        while (true)
        {
            int reallyRead = _in.Read(buffer, index, count);
            if (reallyRead <= 0)
            {
                return reallyRead;
            }

            // A chunk consisting only of NULs must not be reported as "0 read",
            // so keep reading until something survives or the input ends.
            int cleanRead = CleanupBuffer(buffer, index, reallyRead);
            if (cleanRead != 0)
            {
                return cleanRead;
            }
        }
    }

    /// <summary>
    /// Reads non-NUL characters until <paramref name="count"/> characters have been
    /// stored or the input ends, and returns the number stored.
    /// </summary>
    public override int ReadBlock(char[] buffer, int index, int count)
    {
        int total = 0;
        int read;

        // Removing NULs shrinks each chunk, so top the buffer up until it is full
        // or the wrapped reader is exhausted.
        do
        {
            read = Read(buffer, index + total, count - total);
            if (read > 0)
            {
                total += read;
            }
        }
        while (read > 0 && total < count);

        return total;
    }

    /// <summary>Reads one line from the wrapped reader with NULs removed; null at end of input.</summary>
    public override string ReadLine()
    {
        return CleanupString(_in.ReadLine());
    }

    /// <summary>Reads the remaining text from the wrapped reader with NULs removed.</summary>
    public override string ReadToEnd()
    {
        return CleanupString(_in.ReadToEnd());
    }
}