C++解码电子邮件的主题
C++ decode e-mail's subject
我用 Poco/Net/POP3ClientSession 下载了邮件,我想将电子邮件主题转换为人类可读的,所以我尝试使用这里的 neagoegab's 解决方案:
不幸的是它不起作用:
#include <Poco/Net/POP3ClientSession.h>
#include <Poco/Net/MailMessage.h>
#include <iostream>
#include <string>
using namespace std;
using namespace Poco::Net;
#include <iconv.h>
const size_t BUF_SIZE=1024;
class IConv {
iconv_t ic_;
public:
IConv(const char* to, const char* from)
: ic_(iconv_open(to,from)) { }
~IConv() { iconv_close(ic_); }
bool convert(char* input, char* output, size_t& out_size) {
size_t inbufsize = strlen(input)+1;
return iconv(ic_, &input, &inbufsize, &output, &out_size);
}
};
int main()
{
POP3ClientSession session("poczta.o2.pl");
session.login("my mail", "my password");
POP3ClientSession::MessageInfoVec messages;
session.listMessages(messages);
cout << "id: " << messages[0].id << " size: " << messages[0].size << endl;
MailMessage message;
session.retrieveMessage(messages[0].id, message);
const string subject = message.getSubject();
cout << "Original subject: " << subject << endl;
IConv iconv_("UTF8","ISO-8859-2");
char from[BUF_SIZE];// "=?ISO-8859-2?Q?Re: M=F3j sen o JP II?=";
subject.copy(from, sizeof(from));
char to[BUF_SIZE] = "bye";
size_t outsize = BUF_SIZE;//you will need it
iconv_.convert(from, to, outsize);
cout << "converted: " << to << endl;
}
输出为:
id: 1 size: 2792
Original subject: =?ISO-8859-2?Q?Re: M=F3j sen o JP II?=
converted: =?ISO-8859-2?Q?Re: M=F3j sen o JP II?=
有趣的是,当我尝试使用 POCO 转换主题时,它失败了:
cout << "Encoded with POCO: " << MailMessage::encodeWord("Re: Mój sen o JP II", "ISO-8859-2") << endl; // output: Encoded with POCO: =?ISO-8859-2?q?Re=3A_M=C3=B3j_sen_o_JP_II?=
但是我要接收的主题是:
"Re: Mój sen o JP II"
我发现转换主题的唯一成功方法是:
https://docs.python.org/2/library/email.header.html#email.header.decode_header
所以我的问题是 - 如何将 C++ 中的电子邮件主题转换为 UTF-8 等格式?
与您的情况相关的 RFC 是 RFC 2047。该 RFC 指定了非 ASCII 数据应如何在邮件消息中编码。基本要点是除了可打印的 ASCII 字符之外的所有字节都被转义为“=”字符后跟两个十六进制数字。由于“ó”在 ISO-8859-2 中由字节 0xF3
表示,而 0xF3
不是可打印的 ASCII 字符,因此它被编码为“=F3”。您需要解码邮件中的所有编码字符。
我找到了解决问题的方法(我不确定它是否是 100% 正确的解决方案),但看起来它足以使用:
Poco::UTF8Encoding::convert 将 characterCode 转换为 utf8:
#include <Poco/Net/POP3ClientSession.h>
#include <Poco/Net/MessageHeader.h>
#include <Poco/Net/MailMessage.h>
#include <Poco/UTF8Encoding.h>
#include <iostream>
#include <string>
using namespace std;
using namespace Poco::Net;
class EncoderLatin2
{
public:
EncoderLatin2(const string& encodedSubject)
{
/// encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
int charsetBeginPosition = strlen("=?");
int charsetEndPosition = encodedSubject.find("?", charsetBeginPosition);
charset = encodedSubject.substr(charsetBeginPosition, charsetEndPosition-charsetBeginPosition);
int encodingPosition = charsetEndPosition + strlen("?");
encoding = encodedSubject[encodingPosition];
if ("ISO-8859-2" != charset)
throw std::invalid_argument("Invalid encoding!");
const int lenghtOfEncodedText = encodedSubject.length() - encodingPosition-strlen("?=")-2;
extractedEncodedSubjectToConvert = encodedSubject.substr(encodingPosition+2, lenghtOfEncodedText);
}
string convert()
{
size_t positionOfAssignment = -1;
while (true)
{
positionOfAssignment = extractedEncodedSubjectToConvert.find('=', positionOfAssignment+1);
if (string::npos != positionOfAssignment)
{
const string& charHexCode = extractedEncodedSubjectToConvert.substr(positionOfAssignment + 1, 2);
replaceAllSubstringsWithUnicode(extractedEncodedSubjectToConvert, charHexCode);
}
else
break;
}
return extractedEncodedSubjectToConvert;
}
void replaceAllSubstringsWithUnicode(string& s, const string& charHexCode)
{
const int charCode = stoi(charHexCode, nullptr, 16);
char buffer[10] = {};
encodingConverter.convert(charCode, (unsigned char*)buffer, sizeof(buffer));
replaceAll(s, '=' + charHexCode, buffer);
}
void replaceAll(string& s, const string& replaceFrom, const string& replaceTo)
{
size_t needlePosition = -1;
while (true)
{
needlePosition = s.find(replaceFrom, needlePosition + 1);
if (string::npos == needlePosition)
break;
s.replace(needlePosition, replaceFrom.length(), replaceTo);
}
}
private:
string charset;
char encoding;
Poco::UTF8Encoding encodingConverter;
string extractedEncodedSubjectToConvert;
};
int main()
{
POP3ClientSession session("poczta.o2.pl");
session.login("my mail", "my password");
POP3ClientSession::MessageInfoVec messages;
session.listMessages(messages);
MessageHeader header;
MailMessage message;
auto currentMessage = messages[0];
session.retrieveHeader(currentMessage.id, header);
session.retrieveMessage(currentMessage.id, message);
const string subject = message.getSubject();
EncoderLatin2 encoder(subject);
cout << "Original subject: " << subject << endl;
cout << "Encoded: " << encoder.convert() << endl;
}
我找到了另一个解决方案,比以前更好。
一些电子邮件主题有不同的编码,我注意到:
- Latin2,编码如下:=?ISO-8859-2?Q?...?=
- UTF-8 Base64 像:
=?utf-8?B?Wm9iYWN6Y2llIGNvIGRsYSBXYXMgcHJ6eWdvdG93YWxpxZtteSAvIHN0eWN6ZcWEIHcgTGFzZXJwYXJrdQ==?=
- UTF-8 引用可打印,如:
=?utf-8?Q?...?=
- 无编码(如果只有 ASCII 字符)如:...
因此,使用 POCO(Base64Decoder、Latin2Encoding、UTF8Encoding、QuotedPrintableDecoder)我设法转换了所有大小写:
#include <iostream>
#include <string>
#include <sstream>
#include <Poco/Net/POP3ClientSession.h>
#include <Poco/Net/MessageHeader.h>
#include <Poco/Net/MailMessage.h>
#include <Poco/Base64Decoder.h>
#include <Poco/Latin2Encoding.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Net/QuotedPrintableDecoder.h>
using namespace std;
class Encoder
{
public:
Encoder(const string& encodedText)
{
isStringEncoded = isEncoded(encodedText);
if (!isStringEncoded)
{
extractedEncodedSubjectToConvert = encodedText;
return;
}
splitEncodedText(encodedText);
}
string convert()
{
if (isStringEncoded)
{
if (Poco::Latin2Encoding().isA(charset))
return decodeFromLatin2();
if (Poco::UTF8Encoding().isA(charset))
return decodeFromUtf8();
}
return extractedEncodedSubjectToConvert;
}
private:
void splitEncodedText(const string& encodedText)
{
/// encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
const int charsetBeginPosition = strlen(sequenceBeginEncodedText);
const int charsetEndPosition = encodedText.find("?", charsetBeginPosition);
charset = encodedText.substr(charsetBeginPosition, charsetEndPosition-charsetBeginPosition);
const int encodingPosition = charsetEndPosition + strlen("?");
encoding = encodedText[encodingPosition];
const int lenghtOfEncodedText = encodedText.length() - encodingPosition-strlen(sequenceBeginEncodedText)-strlen(sequenceEndEncodedText);
extractedEncodedSubjectToConvert = encodedText.substr(encodingPosition+2, lenghtOfEncodedText);
}
bool isEncoded(const string& encodedSubject)
{
if (encodedSubject.size() < 4)
return false;
if (0 != encodedSubject.find(sequenceBeginEncodedText))
return false;
const unsigned positionOfLastTwoCharacters = encodedSubject.size() - strlen(sequenceEndEncodedText);
return positionOfLastTwoCharacters == encodedSubject.rfind(sequenceEndEncodedText);
}
string decodeFromLatin2()
{
size_t positionOfAssignment = -1;
while (true)
{
positionOfAssignment = extractedEncodedSubjectToConvert.find('=', positionOfAssignment+1);
if (string::npos != positionOfAssignment)
{
const string& charHexCode = extractedEncodedSubjectToConvert.substr(positionOfAssignment + 1, 2);
replaceAllSubstringsWithUnicode(extractedEncodedSubjectToConvert, charHexCode);
}
else
break;
}
return extractedEncodedSubjectToConvert;
}
void replaceAllSubstringsWithUnicode(string& s, const string& charHexCode)
{
static Poco::UTF8Encoding encodingConverter;
const int charCode = stoi(charHexCode, nullptr, 16);
char buffer[10] = {};
encodingConverter.convert(charCode, (unsigned char*)buffer, sizeof(buffer));
replaceAll(s, '=' + charHexCode, buffer);
}
void replaceAll(string& s, const string& replaceFrom, const string& replaceTo)
{
size_t needlePosition = -1;
while (true)
{
needlePosition = s.find(replaceFrom, needlePosition + 1);
if (string::npos == needlePosition)
break;
s.replace(needlePosition, replaceFrom.length(), replaceTo);
}
}
string decodeFromUtf8()
{
if('B' == toupper(encoding))
{
return decodeFromBase64();
}
else // if Q:
{
return decodeFromQuatedPrintable();
}
}
string decodeFromBase64()
{
istringstream is(extractedEncodedSubjectToConvert);
Poco::Base64Decoder e64(is);
extractedEncodedSubjectToConvert.clear();
string buffer;
while(getline(e64, buffer))
extractedEncodedSubjectToConvert += buffer;
return extractedEncodedSubjectToConvert;
}
string decodeFromQuatedPrintable()
{
replaceAll(extractedEncodedSubjectToConvert, "_", " ");
istringstream is(extractedEncodedSubjectToConvert);
Poco::Net::QuotedPrintableDecoder qp(is);
extractedEncodedSubjectToConvert.clear();
string buffer;
while(getline(qp, buffer))
extractedEncodedSubjectToConvert += buffer;
return extractedEncodedSubjectToConvert;
}
private:
string charset;
char encoding;
string extractedEncodedSubjectToConvert;
bool isStringEncoded;
static constexpr const char* sequenceBeginEncodedText = "=?";
static constexpr const char* sequenceEndEncodedText = "?=";
};
int main()
{
Poco::Net::POP3ClientSession session("poczta.o2.pl");
session.login("my mail", "my password");
Poco::Net::POP3ClientSession::MessageInfoVec messages;
session.listMessages(messages);
Poco::Net::MessageHeader header;
Poco::Net::MailMessage message;
auto currentMessage = messages[0];
session.retrieveHeader(currentMessage.id, header);
session.retrieveMessage(currentMessage.id, message);
const string subject = message.getSubject();
Encoder encoder(subject);
cout << "Original subject: " << subject << endl;
cout << "Encoded: " << encoder.convert() << endl;
}
我用 Poco/Net/POP3ClientSession 下载了邮件,我想将电子邮件主题转换为人类可读的,所以我尝试使用这里的 neagoegab's 解决方案: 不幸的是它不起作用:
#include <Poco/Net/POP3ClientSession.h>
#include <Poco/Net/MailMessage.h>
#include <iostream>
#include <string>
using namespace std;
using namespace Poco::Net;
#include <iconv.h>
const size_t BUF_SIZE=1024;
class IConv {
iconv_t ic_;
public:
IConv(const char* to, const char* from)
: ic_(iconv_open(to,from)) { }
~IConv() { iconv_close(ic_); }
bool convert(char* input, char* output, size_t& out_size) {
size_t inbufsize = strlen(input)+1;
return iconv(ic_, &input, &inbufsize, &output, &out_size);
}
};
int main()
{
POP3ClientSession session("poczta.o2.pl");
session.login("my mail", "my password");
POP3ClientSession::MessageInfoVec messages;
session.listMessages(messages);
cout << "id: " << messages[0].id << " size: " << messages[0].size << endl;
MailMessage message;
session.retrieveMessage(messages[0].id, message);
const string subject = message.getSubject();
cout << "Original subject: " << subject << endl;
IConv iconv_("UTF8","ISO-8859-2");
char from[BUF_SIZE];// "=?ISO-8859-2?Q?Re: M=F3j sen o JP II?=";
subject.copy(from, sizeof(from));
char to[BUF_SIZE] = "bye";
size_t outsize = BUF_SIZE;//you will need it
iconv_.convert(from, to, outsize);
cout << "converted: " << to << endl;
}
输出为:
id: 1 size: 2792
Original subject: =?ISO-8859-2?Q?Re: M=F3j sen o JP II?=
converted: =?ISO-8859-2?Q?Re: M=F3j sen o JP II?=
有趣的是,当我尝试使用 POCO 转换主题时,它失败了:
cout << "Encoded with POCO: " << MailMessage::encodeWord("Re: Mój sen o JP II", "ISO-8859-2") << endl; // output: Encoded with POCO: =?ISO-8859-2?q?Re=3A_M=C3=B3j_sen_o_JP_II?=
但是我要接收的主题是: "Re: Mój sen o JP II" 我发现转换主题的唯一成功方法是: https://docs.python.org/2/library/email.header.html#email.header.decode_header
所以我的问题是 - 如何将 C++ 中的电子邮件主题转换为 UTF-8 等格式?
与您的情况相关的 RFC 是 RFC 2047。该 RFC 指定了非 ASCII 数据应如何在邮件消息中编码。基本要点是除了可打印的 ASCII 字符之外的所有字节都被转义为“=”字符后跟两个十六进制数字。由于“ó”在 ISO-8859-2 中由字节 0xF3
表示,而 0xF3
不是可打印的 ASCII 字符,因此它被编码为“=F3”。您需要解码邮件中的所有编码字符。
我找到了解决问题的方法(我不确定它是否是 100% 正确的解决方案),但看起来它足以使用: Poco::UTF8Encoding::convert 将 characterCode 转换为 utf8:
#include <Poco/Net/POP3ClientSession.h>
#include <Poco/Net/MessageHeader.h>
#include <Poco/Net/MailMessage.h>
#include <Poco/UTF8Encoding.h>
#include <iostream>
#include <string>
using namespace std;
using namespace Poco::Net;
class EncoderLatin2
{
public:
EncoderLatin2(const string& encodedSubject)
{
/// encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
int charsetBeginPosition = strlen("=?");
int charsetEndPosition = encodedSubject.find("?", charsetBeginPosition);
charset = encodedSubject.substr(charsetBeginPosition, charsetEndPosition-charsetBeginPosition);
int encodingPosition = charsetEndPosition + strlen("?");
encoding = encodedSubject[encodingPosition];
if ("ISO-8859-2" != charset)
throw std::invalid_argument("Invalid encoding!");
const int lenghtOfEncodedText = encodedSubject.length() - encodingPosition-strlen("?=")-2;
extractedEncodedSubjectToConvert = encodedSubject.substr(encodingPosition+2, lenghtOfEncodedText);
}
string convert()
{
size_t positionOfAssignment = -1;
while (true)
{
positionOfAssignment = extractedEncodedSubjectToConvert.find('=', positionOfAssignment+1);
if (string::npos != positionOfAssignment)
{
const string& charHexCode = extractedEncodedSubjectToConvert.substr(positionOfAssignment + 1, 2);
replaceAllSubstringsWithUnicode(extractedEncodedSubjectToConvert, charHexCode);
}
else
break;
}
return extractedEncodedSubjectToConvert;
}
void replaceAllSubstringsWithUnicode(string& s, const string& charHexCode)
{
const int charCode = stoi(charHexCode, nullptr, 16);
char buffer[10] = {};
encodingConverter.convert(charCode, (unsigned char*)buffer, sizeof(buffer));
replaceAll(s, '=' + charHexCode, buffer);
}
void replaceAll(string& s, const string& replaceFrom, const string& replaceTo)
{
size_t needlePosition = -1;
while (true)
{
needlePosition = s.find(replaceFrom, needlePosition + 1);
if (string::npos == needlePosition)
break;
s.replace(needlePosition, replaceFrom.length(), replaceTo);
}
}
private:
string charset;
char encoding;
Poco::UTF8Encoding encodingConverter;
string extractedEncodedSubjectToConvert;
};
int main()
{
POP3ClientSession session("poczta.o2.pl");
session.login("my mail", "my password");
POP3ClientSession::MessageInfoVec messages;
session.listMessages(messages);
MessageHeader header;
MailMessage message;
auto currentMessage = messages[0];
session.retrieveHeader(currentMessage.id, header);
session.retrieveMessage(currentMessage.id, message);
const string subject = message.getSubject();
EncoderLatin2 encoder(subject);
cout << "Original subject: " << subject << endl;
cout << "Encoded: " << encoder.convert() << endl;
}
我找到了另一个解决方案,比以前更好。 一些电子邮件主题有不同的编码,我注意到:
- Latin2,编码如下:=?ISO-8859-2?Q?...?=
- UTF-8 Base64 像: =?utf-8?B?Wm9iYWN6Y2llIGNvIGRsYSBXYXMgcHJ6eWdvdG93YWxpxZtteSAvIHN0eWN6ZcWEIHcgTGFzZXJwYXJrdQ==?=
- UTF-8 引用可打印,如: =?utf-8?Q?...?=
- 无编码(如果只有 ASCII 字符)如:...
因此,使用 POCO(Base64Decoder、Latin2Encoding、UTF8Encoding、QuotedPrintableDecoder)我设法转换了所有大小写:
#include <iostream>
#include <string>
#include <sstream>
#include <Poco/Net/POP3ClientSession.h>
#include <Poco/Net/MessageHeader.h>
#include <Poco/Net/MailMessage.h>
#include <Poco/Base64Decoder.h>
#include <Poco/Latin2Encoding.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Net/QuotedPrintableDecoder.h>
using namespace std;
class Encoder
{
public:
Encoder(const string& encodedText)
{
isStringEncoded = isEncoded(encodedText);
if (!isStringEncoded)
{
extractedEncodedSubjectToConvert = encodedText;
return;
}
splitEncodedText(encodedText);
}
string convert()
{
if (isStringEncoded)
{
if (Poco::Latin2Encoding().isA(charset))
return decodeFromLatin2();
if (Poco::UTF8Encoding().isA(charset))
return decodeFromUtf8();
}
return extractedEncodedSubjectToConvert;
}
private:
void splitEncodedText(const string& encodedText)
{
/// encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
const int charsetBeginPosition = strlen(sequenceBeginEncodedText);
const int charsetEndPosition = encodedText.find("?", charsetBeginPosition);
charset = encodedText.substr(charsetBeginPosition, charsetEndPosition-charsetBeginPosition);
const int encodingPosition = charsetEndPosition + strlen("?");
encoding = encodedText[encodingPosition];
const int lenghtOfEncodedText = encodedText.length() - encodingPosition-strlen(sequenceBeginEncodedText)-strlen(sequenceEndEncodedText);
extractedEncodedSubjectToConvert = encodedText.substr(encodingPosition+2, lenghtOfEncodedText);
}
bool isEncoded(const string& encodedSubject)
{
if (encodedSubject.size() < 4)
return false;
if (0 != encodedSubject.find(sequenceBeginEncodedText))
return false;
const unsigned positionOfLastTwoCharacters = encodedSubject.size() - strlen(sequenceEndEncodedText);
return positionOfLastTwoCharacters == encodedSubject.rfind(sequenceEndEncodedText);
}
string decodeFromLatin2()
{
size_t positionOfAssignment = -1;
while (true)
{
positionOfAssignment = extractedEncodedSubjectToConvert.find('=', positionOfAssignment+1);
if (string::npos != positionOfAssignment)
{
const string& charHexCode = extractedEncodedSubjectToConvert.substr(positionOfAssignment + 1, 2);
replaceAllSubstringsWithUnicode(extractedEncodedSubjectToConvert, charHexCode);
}
else
break;
}
return extractedEncodedSubjectToConvert;
}
void replaceAllSubstringsWithUnicode(string& s, const string& charHexCode)
{
static Poco::UTF8Encoding encodingConverter;
const int charCode = stoi(charHexCode, nullptr, 16);
char buffer[10] = {};
encodingConverter.convert(charCode, (unsigned char*)buffer, sizeof(buffer));
replaceAll(s, '=' + charHexCode, buffer);
}
void replaceAll(string& s, const string& replaceFrom, const string& replaceTo)
{
size_t needlePosition = -1;
while (true)
{
needlePosition = s.find(replaceFrom, needlePosition + 1);
if (string::npos == needlePosition)
break;
s.replace(needlePosition, replaceFrom.length(), replaceTo);
}
}
string decodeFromUtf8()
{
if('B' == toupper(encoding))
{
return decodeFromBase64();
}
else // if Q:
{
return decodeFromQuatedPrintable();
}
}
string decodeFromBase64()
{
istringstream is(extractedEncodedSubjectToConvert);
Poco::Base64Decoder e64(is);
extractedEncodedSubjectToConvert.clear();
string buffer;
while(getline(e64, buffer))
extractedEncodedSubjectToConvert += buffer;
return extractedEncodedSubjectToConvert;
}
string decodeFromQuatedPrintable()
{
replaceAll(extractedEncodedSubjectToConvert, "_", " ");
istringstream is(extractedEncodedSubjectToConvert);
Poco::Net::QuotedPrintableDecoder qp(is);
extractedEncodedSubjectToConvert.clear();
string buffer;
while(getline(qp, buffer))
extractedEncodedSubjectToConvert += buffer;
return extractedEncodedSubjectToConvert;
}
private:
string charset;
char encoding;
string extractedEncodedSubjectToConvert;
bool isStringEncoded;
static constexpr const char* sequenceBeginEncodedText = "=?";
static constexpr const char* sequenceEndEncodedText = "?=";
};
int main()
{
Poco::Net::POP3ClientSession session("poczta.o2.pl");
session.login("my mail", "my password");
Poco::Net::POP3ClientSession::MessageInfoVec messages;
session.listMessages(messages);
Poco::Net::MessageHeader header;
Poco::Net::MailMessage message;
auto currentMessage = messages[0];
session.retrieveHeader(currentMessage.id, header);
session.retrieveMessage(currentMessage.id, message);
const string subject = message.getSubject();
Encoder encoder(subject);
cout << "Original subject: " << subject << endl;
cout << "Encoded: " << encoder.convert() << endl;
}