如何将字符串从一种编码转换为另一种编码
How to convert string from one encoding to another
我有一个以 Windows-1252 编码的字符串,如何将它转换为 UTF-8 编码?
我尝试了 Encoding.Convert,但打印时得到的仍然是同样的 1252 编码字符串。
// Source text is in code page 1252 (Windows-1252); the target encoding is UTF-8.
var srcEncoding = Encoding.GetEncoding(1252);
var destEncoding = Encoding.UTF8;
// Encode the string with the source code page, then transcode those bytes to the target encoding.
var srcBytes = srcEncoding.GetBytes(srcString);
var destBytes = Encoding.Convert(srcEncoding, destEncoding, srcBytes);
// From here either persist the raw bytes, e.g. File.WriteAllBytes("myFile", destBytes), ...
// ... or decode them back into a .NET string.
var destString = destEncoding.GetString(destBytes);
代码页 1252 是 8 位编码。您看到的转义序列(%DC)看起来更像是 URL 编码(百分号编码),参见 RFC 3986。您可以这样解码:
using System.Web;
// %DC is a single code page 1252 byte, so the escapes are decoded with that code page.
string inputString = "C:/Users/%DCser";
var windows1252 = Encoding.GetEncoding(1252);
string decoded = HttpUtility.UrlDecode(inputString, windows1252);
Console.WriteLine(decoded);
上面的代码应该输出 C:/Users/Üser(不带引号)。此示例中得到的字符串在内存中采用 UTF-16 编码,因为 .NET 的字符串内部始终使用 UTF-16。因此,您可以从这里再将其转换为目标编码。
正如我试图在评论中解释的那样,这里真正的问题是:您有一个 % 编码的字符串值,但它所使用的编码与您预期的编码不同;要解决此问题,您需要:
1. 识别源数据中的 % 编码标记
2. 从源 % 编码块中解析出字节
3. 使用源编码解码这些字节
4. 使用目标编码对解码结果重新编码
5. 对重新编码后的字节再次应用 % 编码
6. 将这些值替换回原始字符串
例如(将 "C:/Users/%C5%92ser" 更改为 "C:/Users/%8Cser"):
using System;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Sample program that rewrites the percent-encoded (%XX) parts of a string
/// from one byte encoding to another.
/// </summary>
internal static class P
{
    /// <summary>
    /// Rewrites the UTF-8 escapes in "C:/Users/%C5%92ser" as code page 1252 escapes
    /// and prints the result.
    /// </summary>
    private static void Main()
    {
        // NOTE(review): on .NET Core / .NET 5+ code page 1252 is presumably only available
        // after Encoding.RegisterProvider(CodePagesEncodingProvider.Instance) has been
        // called - confirm for the target runtime.
        var result = RewriteUrlPercentEncoding("C:/Users/%C5%92ser",
            Encoding.UTF8, Encoding.GetEncoding(1252));
        Console.WriteLine(result);
    }

    /// <summary>
    /// Re-encodes every run of consecutive percent-escapes in <paramref name="value"/>:
    /// the escaped bytes are decoded as text using <paramref name="from"/>, that text is
    /// encoded again using <paramref name="to"/>, and the resulting bytes are written back
    /// as upper-case %XX escapes. Text outside the escapes is left untouched.
    /// </summary>
    /// <param name="value">The string containing percent-escapes.</param>
    /// <param name="from">The encoding the existing escapes were produced with.</param>
    /// <param name="to">The encoding the rewritten escapes should use.</param>
    /// <returns><paramref name="value"/> with each run of escapes rewritten.</returns>
    internal static string RewriteUrlPercentEncoding(string value, Encoding from, Encoding to)
        => Regex.Replace(value, @"(\%[0-9a-fA-F]{2})+", match => // #1: locate each run of %XX escapes
        {
            var s = match.Value;

            // #2: every escape is exactly three chars ("%XX"), so the run holds s.Length / 3 bytes
            var bytes = new byte[s.Length / 3];
            for (int i = 0; i < bytes.Length; i++)
            {
                byte hi = ParseNibble(s[(i * 3) + 1]),
                     lo = ParseNibble(s[(i * 3) + 2]);
                bytes[i] = (byte)((hi << 4) | lo);
            }

            // #3 and #4: decode with the source encoding, re-encode with the destination encoding
            var reencoded = to.GetBytes(from.GetString(bytes));

            // #5: write each byte back as '%' followed by two upper-case hex digits
            var chars = new char[3 * reencoded.Length];
            int index = 0;
            foreach (var b in reencoded)
            {
                chars[index++] = '%';
                chars[index++] = WriteNibble((byte)(b >> 4));
                chars[index++] = WriteNibble((byte)(b & 0b1111));
            }

            // #6: the returned string replaces the matched run in the original value
            return new string(chars);

            // Maps one hex digit (either case) to its 4-bit value.
            static byte ParseNibble(char c) => c switch
            {
                '0' => 0x0,
                '1' => 0x1,
                '2' => 0x2,
                '3' => 0x3,
                '4' => 0x4,
                '5' => 0x5,
                '6' => 0x6,
                '7' => 0x7,
                '8' => 0x8,
                '9' => 0x9,
                'A' => 0xA,
                'B' => 0xB,
                'C' => 0xC,
                'D' => 0xD,
                'E' => 0xE,
                'F' => 0xF,
                'a' => 0xA,
                'b' => 0xB,
                'c' => 0xC,
                'd' => 0xD,
                'e' => 0xE, // was 0xF, which decoded escapes such as %e9 to the wrong byte
                'f' => 0xF,
                _ => throw new ArgumentOutOfRangeException(nameof(c)),
            };

            // Maps a 4-bit value to its upper-case hex digit.
            static char WriteNibble(byte b) => b switch
            {
                0x0 => '0',
                0x1 => '1',
                0x2 => '2',
                0x3 => '3',
                0x4 => '4',
                0x5 => '5',
                0x6 => '6',
                0x7 => '7',
                0x8 => '8',
                0x9 => '9',
                0xA => 'A',
                0xB => 'B',
                0xC => 'C',
                0xD => 'D',
                0xE => 'E',
                0xF => 'F',
                _ => throw new ArgumentOutOfRangeException(nameof(b)),
            };
        });
}
请注意,以上代码追求的是简单而非高效;对于大批量处理,有很多方法可以改进这一点。
类似地,把两种编码对调,就可以从 "C:/Users/%DCser" 得到 "C:/Users/%C3%9Cser":
// Reverse direction: the escape is a code page 1252 byte and is rewritten as UTF-8 escapes.
var sourceEncoding = Encoding.GetEncoding(1252);
var result = RewriteUrlPercentEncoding("C:/Users/%DCser", sourceEncoding, Encoding.UTF8);
Console.WriteLine(result);
我有一个以 Windows-1252 编码的字符串,如何将它转换为 UTF-8 编码?我尝试了 Encoding.Convert,但打印时得到的仍然是同样的 1252 编码字符串。
// Source text is in code page 1252 (Windows-1252); the target encoding is UTF-8.
var srcEncoding = Encoding.GetEncoding(1252);
var destEncoding = Encoding.UTF8;
// Encode the string with the source code page, then transcode those bytes to the target encoding.
var srcBytes = srcEncoding.GetBytes(srcString);
var destBytes = Encoding.Convert(srcEncoding, destEncoding, srcBytes);
// From here either persist the raw bytes, e.g. File.WriteAllBytes("myFile", destBytes), ...
// ... or decode them back into a .NET string.
var destString = destEncoding.GetString(destBytes);
代码页 1252 是 8 位编码。您看到的转义序列(%DC)看起来更像是 URL 编码(百分号编码),参见 RFC 3986。您可以这样解码:
using System.Web;
// %DC is a single code page 1252 byte, so the escapes are decoded with that code page.
string inputString = "C:/Users/%DCser";
var windows1252 = Encoding.GetEncoding(1252);
string decoded = HttpUtility.UrlDecode(inputString, windows1252);
Console.WriteLine(decoded);
上面的代码应该输出 C:/Users/Üser(不带引号)。此示例中得到的字符串在内存中采用 UTF-16 编码,因为 .NET 的字符串内部始终使用 UTF-16。因此,您可以从这里再将其转换为目标编码。
正如我试图在评论中解释的那样,这里真正的问题是:您有一个 % 编码的字符串值,但它所使用的编码与您预期的编码不同;要解决此问题,您需要:
1. 识别源数据中的 % 编码标记
2. 从源 % 编码块中解析出字节
3. 使用源编码解码这些字节
4. 使用目标编码对解码结果重新编码
5. 对重新编码后的字节再次应用 % 编码
6. 将这些值替换回原始字符串
例如(将 "C:/Users/%C5%92ser" 更改为 "C:/Users/%8Cser"):
using System;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Sample program that rewrites the percent-encoded (%XX) parts of a string
/// from one byte encoding to another.
/// </summary>
internal static class P
{
    /// <summary>
    /// Rewrites the UTF-8 escapes in "C:/Users/%C5%92ser" as code page 1252 escapes
    /// and prints the result.
    /// </summary>
    private static void Main()
    {
        // NOTE(review): on .NET Core / .NET 5+ code page 1252 is presumably only available
        // after Encoding.RegisterProvider(CodePagesEncodingProvider.Instance) has been
        // called - confirm for the target runtime.
        var result = RewriteUrlPercentEncoding("C:/Users/%C5%92ser",
            Encoding.UTF8, Encoding.GetEncoding(1252));
        Console.WriteLine(result);
    }

    /// <summary>
    /// Re-encodes every run of consecutive percent-escapes in <paramref name="value"/>:
    /// the escaped bytes are decoded as text using <paramref name="from"/>, that text is
    /// encoded again using <paramref name="to"/>, and the resulting bytes are written back
    /// as upper-case %XX escapes. Text outside the escapes is left untouched.
    /// </summary>
    /// <param name="value">The string containing percent-escapes.</param>
    /// <param name="from">The encoding the existing escapes were produced with.</param>
    /// <param name="to">The encoding the rewritten escapes should use.</param>
    /// <returns><paramref name="value"/> with each run of escapes rewritten.</returns>
    internal static string RewriteUrlPercentEncoding(string value, Encoding from, Encoding to)
        => Regex.Replace(value, @"(\%[0-9a-fA-F]{2})+", match => // #1: locate each run of %XX escapes
        {
            var s = match.Value;

            // #2: every escape is exactly three chars ("%XX"), so the run holds s.Length / 3 bytes
            var bytes = new byte[s.Length / 3];
            for (int i = 0; i < bytes.Length; i++)
            {
                byte hi = ParseNibble(s[(i * 3) + 1]),
                     lo = ParseNibble(s[(i * 3) + 2]);
                bytes[i] = (byte)((hi << 4) | lo);
            }

            // #3 and #4: decode with the source encoding, re-encode with the destination encoding
            var reencoded = to.GetBytes(from.GetString(bytes));

            // #5: write each byte back as '%' followed by two upper-case hex digits
            var chars = new char[3 * reencoded.Length];
            int index = 0;
            foreach (var b in reencoded)
            {
                chars[index++] = '%';
                chars[index++] = WriteNibble((byte)(b >> 4));
                chars[index++] = WriteNibble((byte)(b & 0b1111));
            }

            // #6: the returned string replaces the matched run in the original value
            return new string(chars);

            // Maps one hex digit (either case) to its 4-bit value.
            static byte ParseNibble(char c) => c switch
            {
                '0' => 0x0,
                '1' => 0x1,
                '2' => 0x2,
                '3' => 0x3,
                '4' => 0x4,
                '5' => 0x5,
                '6' => 0x6,
                '7' => 0x7,
                '8' => 0x8,
                '9' => 0x9,
                'A' => 0xA,
                'B' => 0xB,
                'C' => 0xC,
                'D' => 0xD,
                'E' => 0xE,
                'F' => 0xF,
                'a' => 0xA,
                'b' => 0xB,
                'c' => 0xC,
                'd' => 0xD,
                'e' => 0xE, // was 0xF, which decoded escapes such as %e9 to the wrong byte
                'f' => 0xF,
                _ => throw new ArgumentOutOfRangeException(nameof(c)),
            };

            // Maps a 4-bit value to its upper-case hex digit.
            static char WriteNibble(byte b) => b switch
            {
                0x0 => '0',
                0x1 => '1',
                0x2 => '2',
                0x3 => '3',
                0x4 => '4',
                0x5 => '5',
                0x6 => '6',
                0x7 => '7',
                0x8 => '8',
                0x9 => '9',
                0xA => 'A',
                0xB => 'B',
                0xC => 'C',
                0xD => 'D',
                0xE => 'E',
                0xF => 'F',
                _ => throw new ArgumentOutOfRangeException(nameof(b)),
            };
        });
}
请注意,以上代码追求的是简单而非高效;对于大批量处理,有很多方法可以改进这一点。
类似地,把两种编码对调,就可以从 "C:/Users/%DCser" 得到 "C:/Users/%C3%9Cser":
// Reverse direction: the escape is a code page 1252 byte and is rewritten as UTF-8 escapes.
var sourceEncoding = Encoding.GetEncoding(1252);
var result = RewriteUrlPercentEncoding("C:/Users/%DCser", sourceEncoding, Encoding.UTF8);
Console.WriteLine(result);