使用编码 ISO-8859-1 读取电子邮件正文
Reading email body with encoding ISO-8859-1
我正在使用 Mailkit 通过 IMAP 阅读一些电子邮件的正文内容。
其中一些电子邮件带有内容类型 text/plain
和字符集 ISO-8859-1
,这导致我的代码替换了一些拉丁字符 á é í ó ú
并且显然还替换了 CR
和 LF
替换成了奇怪的字符,例如 =E1
=FA
=F3
=
...
// Pick the first text/plain part out of the message's body parts.
// NOTE(review): FirstOrDefault yields null when no part matches; nothing below guards against that.
var body = message.BodyParts.OfType<BodyPart>().FirstOrDefault(x => x.ContentType.IsMimeType("text", "plain"));
// Fetch that part from the folder, addressed by the message's unique id, and treat it as a TextPart.
var bodyText = (TextPart)folder.GetBodyPart(message.UniqueId, body);
// Read the part's text; this is the value that still contains sequences such as =E1.
var bodyContent = bodyText.Text;
用Thunderbird或Outlook等邮件客户端打开这些邮件没有问题。他们按原样显示这些字符。我希望能够检索这些拉丁字符。
我试过一些编码选项但没有成功。
// Two alternative attempts, tried one at a time (they are not meant to coexist in one scope).
// Attempt 1: read the text using the ASCII encoding.
var bodyContent = bodyText.GetText(System.Text.Encoding.ASCII);
// Attempt 2: read the text using UTF-8. The property is named UTF8; "UTF-8" is not a valid identifier.
var bodyContent = bodyText.GetText(System.Text.Encoding.UTF8);
消息正文使用quoted printable编码。
你必须先解码它。
在 MailKit 中,应该使用 DecodeTo 方法。
我最终使用 MimeKit 库中的 QuotedPrintableDecoder 解决了这个问题。
// Pick the first text/plain part out of the message's body parts.
var body = message.BodyParts.OfType<BodyPart>().FirstOrDefault(x => x.ContentType.IsMimeType("text", "plain"));
// If it's encoded using quoted-printable we'll need to decode it first. To do so, we'll need the charset.
var charset = body.ContentType.Charset;
// Fetch the selected part from the folder and treat it as a TextPart.
var bodyText = (TextPart)folder.GetBodyPart(message.UniqueId, body);
// Decode the part's text with the DecodeQuotedPrintable helper, which uses
// QuotedPrintableDecoder from the MimeKit library.
var bodyContent = DecodeQuotedPrintable(bodyText.Text, charset);
// Decodes quoted-printable text and converts the decoded bytes to a string.
//
// input:   the quoted-printable encoded text, e.g. "=E1=E9".
// charset: the charset name used to interpret the decoded bytes, e.g. "iso-8859-1".
//          When it is null or empty, UTF-8 is used instead.
// returns: the decoded text.
static string DecodeQuotedPrintable (string input, string charset)
{
    var decoder = new QuotedPrintableDecoder ();

    // UTF-8 rather than ASCII: ASCII.GetBytes would replace any non-ASCII
    // character already present in the input with '?'.
    var buffer = Encoding.UTF8.GetBytes (input);
    var output = new byte[decoder.EstimateOutputLength (buffer.Length)];
    int used = decoder.Decode (buffer, 0, buffer.Length, output);

    // Encoding.GetEncoding rejects a null name, so fall back to UTF-8 when
    // no charset was supplied.
    var encoding = string.IsNullOrEmpty (charset)
        ? Encoding.UTF8
        : Encoding.GetEncoding (charset);

    return encoding.GetString (output, 0, used);
}
通常您不需要自己解码 quoted-printable 编码的内容,但我猜测发送此消息的客户端使用 quoted-printable 对内容进行了编码,却没有正确设置 Content-Transfer-Encoding 头。
我可能会将您的代码更改为更像这样的代码:
// figure out which body part we need
// NOTE(review): FirstOrDefault yields null when no text/plain part exists; nothing below guards against that.
var body = message.BodyParts.OfType<BodyPartText>().FirstOrDefault(x => x.ContentType.IsMimeType("text", "plain"));
// download the body part we need
var bodyText = (TextPart)folder.GetBodyPart(message.UniqueId, body);
// If it's encoded using quoted-printable we'll need to decode it first.
// To do so, we'll need the charset.
//
// The reason I would get it from the `bodyText.ContentType` is because
// this will work even if you used MessageSummaryItems.Body instead of
// MessageSummaryItems.BodyStructure.
var charset = bodyText.ContentType.Charset;
// Pass the part's content (not its Text) to DecodeQuotedPrintable, which
// quoted-printable-decodes it and converts the bytes to a string using the charset.
var bodyContent = DecodeQuotedPrintable(bodyText.Content, charset);
// The main changes I'm making to this function compared to what you have is
// using the stream/filter interfaces rather than using the low-level decoder
// directly. You can do it either way, but if you continue using your
// method - I would recommend using Encoding.UTF8.GetBytes() rather than
// Encoding.ASCII.GetBytes() because UTF-8 can handle all strings while
// ASCII cannot.
// Decodes quoted-printable MIME content and converts the decoded bytes to a string.
//
// content: the MIME content to pump through the quoted-printable decoder.
// charset: the charset name used to interpret the decoded bytes, e.g. "iso-8859-1".
//          When it is null or empty, UTF-8 is used instead.
// returns: the decoded text.
static string DecodeQuotedPrintable (IMimeContent content, string charset)
{
    using (var output = new MemoryStream ()) {
        // `var` was missing here, which left `filtered` undeclared.
        using (var filtered = new FilteredStream (output)) {
            // add a quoted-printable decoder
            filtered.Add (DecoderFilter.Create (ContentEncoding.QuotedPrintable));

            // pump the content through the decoder
            content.DecodeTo (filtered);

            // flush the filtered stream so all decoded bytes reach `output`
            filtered.Flush ();

            // Encoding.GetEncoding rejects a null name, so fall back to UTF-8
            // when no charset was supplied.
            var encoding = string.IsNullOrEmpty (charset)
                ? Encoding.UTF8
                : Encoding.GetEncoding (charset);

            // Read the result while `filtered` is still open, in case disposing
            // it also closes `output` (MemoryStream.Length throws once closed).
            return encoding.GetString (output.GetBuffer (), 0, (int) output.Length);
        }
    }
}
我正在使用 Mailkit 通过 IMAP 阅读一些电子邮件的正文内容。
其中一些电子邮件带有内容类型 text/plain
和字符集 ISO-8859-1
,这导致我的代码替换了一些拉丁字符 á é í ó ú
并且显然还替换了 CR
和 LF
替换成了奇怪的字符,例如 =E1
=FA
=F3
=
...
// Pick the first text/plain part out of the message's body parts.
// NOTE(review): FirstOrDefault yields null when no part matches; nothing below guards against that.
var body = message.BodyParts.OfType<BodyPart>().FirstOrDefault(x => x.ContentType.IsMimeType("text", "plain"));
// Fetch that part from the folder, addressed by the message's unique id, and treat it as a TextPart.
var bodyText = (TextPart)folder.GetBodyPart(message.UniqueId, body);
// Read the part's text; this is the value that still contains sequences such as =E1.
var bodyContent = bodyText.Text;
用Thunderbird或Outlook等邮件客户端打开这些邮件没有问题。他们按原样显示这些字符。我希望能够检索这些拉丁字符。
我试过一些编码选项但没有成功。
// Two alternative attempts, tried one at a time (they are not meant to coexist in one scope).
// Attempt 1: read the text using the ASCII encoding.
var bodyContent = bodyText.GetText(System.Text.Encoding.ASCII);
// Attempt 2: read the text using UTF-8. The property is named UTF8; "UTF-8" is not a valid identifier.
var bodyContent = bodyText.GetText(System.Text.Encoding.UTF8);
消息正文使用quoted printable编码。 你必须先解码它。
在 MailKit 中,应该使用 DecodeTo 方法。
我最终使用 MimeKit 库中的 QuotedPrintableDecoder 解决了这个问题。
// Pick the first text/plain part out of the message's body parts.
var body = message.BodyParts.OfType<BodyPart>().FirstOrDefault(x => x.ContentType.IsMimeType("text", "plain"));
// If it's encoded using quoted-printable we'll need to decode it first. To do so, we'll need the charset.
var charset = body.ContentType.Charset;
// Fetch the selected part from the folder and treat it as a TextPart.
var bodyText = (TextPart)folder.GetBodyPart(message.UniqueId, body);
// Decode the part's text with the DecodeQuotedPrintable helper, which uses
// QuotedPrintableDecoder from the MimeKit library.
var bodyContent = DecodeQuotedPrintable(bodyText.Text, charset);
// Decodes quoted-printable text and converts the decoded bytes to a string.
//
// input:   the quoted-printable encoded text, e.g. "=E1=E9".
// charset: the charset name used to interpret the decoded bytes, e.g. "iso-8859-1".
//          When it is null or empty, UTF-8 is used instead.
// returns: the decoded text.
static string DecodeQuotedPrintable (string input, string charset)
{
    var decoder = new QuotedPrintableDecoder ();

    // UTF-8 rather than ASCII: ASCII.GetBytes would replace any non-ASCII
    // character already present in the input with '?'.
    var buffer = Encoding.UTF8.GetBytes (input);
    var output = new byte[decoder.EstimateOutputLength (buffer.Length)];
    int used = decoder.Decode (buffer, 0, buffer.Length, output);

    // Encoding.GetEncoding rejects a null name, so fall back to UTF-8 when
    // no charset was supplied.
    var encoding = string.IsNullOrEmpty (charset)
        ? Encoding.UTF8
        : Encoding.GetEncoding (charset);

    return encoding.GetString (output, 0, used);
}
通常您不需要自己解码 quoted-printable 编码的内容,但我猜测发送此消息的客户端使用 quoted-printable 对内容进行了编码,却没有正确设置 Content-Transfer-Encoding 头。
我可能会将您的代码更改为更像这样的代码:
// figure out which body part we need
// NOTE(review): FirstOrDefault yields null when no text/plain part exists; nothing below guards against that.
var body = message.BodyParts.OfType<BodyPartText>().FirstOrDefault(x => x.ContentType.IsMimeType("text", "plain"));
// download the body part we need
var bodyText = (TextPart)folder.GetBodyPart(message.UniqueId, body);
// If it's encoded using quoted-printable we'll need to decode it first.
// To do so, we'll need the charset.
//
// The reason I would get it from the `bodyText.ContentType` is because
// this will work even if you used MessageSummaryItems.Body instead of
// MessageSummaryItems.BodyStructure.
var charset = bodyText.ContentType.Charset;
// Pass the part's content (not its Text) to DecodeQuotedPrintable, which
// quoted-printable-decodes it and converts the bytes to a string using the charset.
var bodyContent = DecodeQuotedPrintable(bodyText.Content, charset);
// The main changes I'm making to this function compared to what you have is
// using the stream/filter interfaces rather than using the low-level decoder
// directly. You can do it either way, but if you continue using your
// method - I would recommend using Encoding.UTF8.GetBytes() rather than
// Encoding.ASCII.GetBytes() because UTF-8 can handle all strings while
// ASCII cannot.
// Decodes quoted-printable MIME content and converts the decoded bytes to a string.
//
// content: the MIME content to pump through the quoted-printable decoder.
// charset: the charset name used to interpret the decoded bytes, e.g. "iso-8859-1".
//          When it is null or empty, UTF-8 is used instead.
// returns: the decoded text.
static string DecodeQuotedPrintable (IMimeContent content, string charset)
{
    using (var output = new MemoryStream ()) {
        // `var` was missing here, which left `filtered` undeclared.
        using (var filtered = new FilteredStream (output)) {
            // add a quoted-printable decoder
            filtered.Add (DecoderFilter.Create (ContentEncoding.QuotedPrintable));

            // pump the content through the decoder
            content.DecodeTo (filtered);

            // flush the filtered stream so all decoded bytes reach `output`
            filtered.Flush ();

            // Encoding.GetEncoding rejects a null name, so fall back to UTF-8
            // when no charset was supplied.
            var encoding = string.IsNullOrEmpty (charset)
                ? Encoding.UTF8
                : Encoding.GetEncoding (charset);

            // Read the result while `filtered` is still open, in case disposing
            // it also closes `output` (MemoryStream.Length throws once closed).
            return encoding.GetString (output.GetBuffer (), 0, (int) output.Length);
        }
    }
}