如何将 wstring 转换为 u16string?
how can I convert wstring to u16string?
我想在 C++ 中将 wstring
转换为 u16string
。
我可以将 wstring
转换为字符串,或反向。但我不知道如何转换为 u16string
.
u16string CTextConverter::convertWstring2U16(wstring str)
{
int iSize;
u16string szDest[256] = {};
memset(szDest, 0, 256);
iSize = WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, NULL, 0,0,0);
WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, szDest, iSize,0,0);
u16string s16 = szDest;
return s16;
}
WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, szDest, iSize,0,0); 的 szDest
中出错。 u16string
的原因不能与 LPSTR
.
一起使用
如何修复此代码?
对于独立于平台的解决方案,请参阅。
如果您只需要Windows平台的解决方案,下面的代码就足够了:
std::wstring wstr( L"foo" );
std::u16string u16str( wstr.begin(), wstr.end() );
在 Windows 平台上,std::wstring
可与 std::u16string
互换,因为 sizeof(wstring::value_type) == sizeof(u16string::value_type)
两者都是 UTF-16(小端)编码。
wstring::value_type = wchar_t
u16string::value_type = char16_t
唯一的区别是 wchar_t
是有符号的,而 char16_t
是无符号的。所以你只需要做符号转换,这可以通过使用以迭代器对作为参数的 u16string
构造函数来执行。此构造函数会将 wchar_t
隐式转换为 char16_t
.
完整示例控制台应用程序:
#include <windows.h>
#include <string>
int main()
{
static_assert( sizeof(std::wstring::value_type) == sizeof(std::u16string::value_type),
"std::wstring and std::u16string are expected to have the same character size" );
std::wstring wstr( L"foo" );
std::u16string u16str( wstr.begin(), wstr.end() );
// The u16string constructor performs an implicit conversion like:
wchar_t wch = L'A';
char16_t ch16 = wch;
// Need to reinterpret_cast because char16_t const* is not implicitly convertible
// to LPCWSTR (aka wchar_t const*).
::MessageBoxW( 0, reinterpret_cast<LPCWSTR>( u16str.c_str() ), L"test", 0 );
return 0;
}
更新
原以为标准版不行,其实是因为Visual C++和libstdc++ 3.4.21运行时库有bug。它确实适用于 clang++ -std=c++14 -stdlib=libc++
。这是一个测试标准方法是否适用于您的编译器的版本:
#include <codecvt>
#include <cstdlib>
#include <cstring>
#include <cwctype>
#include <iostream>
#include <locale>
#include <clocale>
#include <vector>
using std::cout;
using std::endl;
using std::exit;
using std::memcmp;
using std::size_t;
using std::wcout;
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Does magic so that wcout can work.
{
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic.
constexpr char cp_utf16le[] = ".1200";
setlocale( LC_ALL, cp_utf16le );
_setmode( _fileno(stdout), _O_U16TEXT );
#else
// The correct locale name may vary by OS, e.g., "en_US.utf8".
constexpr char locale_name[] = "";
std::locale::global(std::locale(locale_name));
std::wcout.imbue(std::locale());
#endif
}
int main(void)
{
constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600"; // Shouldn't assume endianness.
constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";
init_locale();
const std::codecvt_utf16<wchar_t, 0x1FFFF, std::little_endian> converter_w;
const size_t max_len = sizeof(msg_utf16);
std::vector<char> out(max_len);
std::mbstate_t state;
const wchar_t* from_w = nullptr;
char* to_next = nullptr;
converter_w.out( state, msg_w, msg_w+sizeof(msg_w)/sizeof(wchar_t), from_w, out.data(), out.data() + out.size(), to_next );
if (memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << endl;
} else if ( memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << endl;
} else {
wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << endl;
}
out.clear();
out.resize(max_len);
const std::codecvt_utf16<char32_t, 0x1FFFF, std::little_endian> converter_u32;
const char32_t* from_u32 = nullptr;
converter_u32.out( state, msg_utf32, msg_utf32+sizeof(msg_utf32)/sizeof(char32_t), from_u32, out.data(), out.data() + out.size(), to_next );
if ( memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << endl;
} else {
wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << endl;
}
wcout << msg_w << endl;
return EXIT_SUCCESS;
}
上一个
有点晚了,但这里有一个版本会额外检查 wchar_t
是否为 32 位(因为它在 Linux 上),如果是,则执行代理对转换.我建议将此源保存为带有 BOM 的 UTF-8。 Here is a link to it on ideone.
#include <cassert>
#include <cwctype>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Does magic so that wcout can work.
{
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic.
constexpr char cp_utf16le[] = ".1200";
setlocale( LC_ALL, cp_utf16le );
_setmode( _fileno(stdout), _O_U16TEXT );
#else
// The correct locale name may vary by OS, e.g., "en_US.utf8".
constexpr char locale_name[] = "";
std::locale::global(std::locale(locale_name));
std::wcout.imbue(std::locale());
#endif
}
std::u16string make_u16string( const std::wstring& ws )
/* Creates a UTF-16 string from a wide-character string. Any wide characters
* outside the allowed range of UTF-16 are mapped to the sentinel value U+FFFD,
* per the Unicode documentation. (http://www.unicode.org/faq/private_use.html
* retrieved 12 March 2017.) Unpaired surrogates in ws are also converted to
* sentinel values. Noncharacters, however, are left intact. As a fallback,
* if wide characters are the same size as char16_t, this does a more trivial
* construction using that implicit conversion.
*/
{
/* We assume that, if this test passes, a wide-character string is already
* UTF-16, or at least converts to it implicitly without needing surrogate
* pairs.
*/
if ( sizeof(wchar_t) == sizeof(char16_t) ) {
return std::u16string( ws.begin(), ws.end() );
} else {
/* The conversion from UTF-32 to UTF-16 might possibly require surrogates.
* A surrogate pair suffices to represent all wide characters, because all
* characters outside the range will be mapped to the sentinel value
* U+FFFD. Add one character for the terminating NUL.
*/
const size_t max_len = 2 * ws.length() + 1;
// Our temporary UTF-16 string.
std::u16string result;
result.reserve(max_len);
for ( const wchar_t& wc : ws ) {
const std::wint_t chr = wc;
if ( chr < 0 || chr > 0x10FFFF || (chr >= 0xD800 && chr <= 0xDFFF) ) {
// Invalid code point. Replace with sentinel, per Unicode standard:
constexpr char16_t sentinel = u'\uFFFD';
result.push_back(sentinel);
} else if ( chr < 0x10000UL ) { // In the BMP.
result.push_back(static_cast<char16_t>(wc));
} else {
const char16_t leading = static_cast<char16_t>(
((chr-0x10000UL) / 0x400U) + 0xD800U );
const char16_t trailing = static_cast<char16_t>(
((chr-0x10000UL) % 0x400U) + 0xDC00U );
result.append({leading, trailing});
} // end if
} // end for
/* The returned string is shrunken to fit, which might not be the Right
* Thing if there is more to be added to the string.
*/
result.shrink_to_fit();
// We depend here on the compiler to optimize the move constructor.
return result;
} // end if
// Not reached.
}
int main(void)
{
static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
const std::u16string converted = make_u16string(wtest);
init_locale();
std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";
for( size_t i = 0; i <= u16test.length(); ++i ) {
if ( u16test[i] != converted[i] ) {
std::wcout << std::hex << std::showbase
<< std::right << std::setfill(L'0')
<< std::setw(4) << (unsigned)converted[i] << L" ≠ "
<< std::setw(4) << (unsigned)u16test[i] << L" at "
<< i << L'.' << std::endl;
return EXIT_FAILURE;
} // end if
} // end for
std::wcout << wtest << std::endl;
return EXIT_SUCCESS;
}
脚注
因为有人问:我建议使用带 BOM 的 UTF-8 的原因是某些编译器(包括 MSVC 2015)会假定源文件根据当前代码页进行编码,除非有 BOM 或您指定编码在命令行上。不幸的是,没有一种编码适用于所有工具链,但我使用的每一个足够现代以支持 C++14 的工具也能理解 BOM。
- To convert CString to std:wstring and string
string CString2string(CString str)
{
int bufLen = WideCharToMultiByte(CP_UTF8, 0, (LPCTSTR)str, -1, NULL, 0, NULL,NULL);
char *buf = new char[bufLen];
WideCharToMultiByte(CP_UTF8, 0, (LPCTSTR)str, -1, buf, bufLen, NULL, NULL);
string sRet(buf);
delete[] buf;
return sRet;
}
CString strFileName = "test.txt";
wstring wFileName(strFileName.GetBuffer());
strFileName.ReleaseBuffer();
string sFileName = CString2string(strFileName);
- To convert string to CString
CString string2CString(string s)
{
int bufLen = MultiByteToWideChar(CP_ACP, 0, s.c_str(), -1, NULL, 0);
WCHAR *buf = new WCHAR[bufLen];
MultiByteToWideChar(CP_ACP, 0, s.c_str(), -1, buf, bufLen);
CString strRet(buf);
delete[] buf;
return strRet;
}
string sFileName = "test.txt";
CString strFileName = string2CString(sFileName);
我想在 C++ 中将 wstring
转换为 u16string
。
我可以将 wstring
转换为字符串,或反向。但我不知道如何转换为 u16string
.
u16string CTextConverter::convertWstring2U16(wstring str)
{
int iSize;
u16string szDest[256] = {};
memset(szDest, 0, 256);
iSize = WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, NULL, 0,0,0);
WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, szDest, iSize,0,0);
u16string s16 = szDest;
return s16;
}
WideCharToMultiByte(CP_UTF8, NULL, str.c_str(), -1, szDest, iSize,0,0); 的 szDest
中出错。 u16string
的原因不能与 LPSTR
.
如何修复此代码?
对于独立于平台的解决方案,请参阅
如果您只需要Windows平台的解决方案,下面的代码就足够了:
std::wstring wstr( L"foo" );
std::u16string u16str( wstr.begin(), wstr.end() );
在 Windows 平台上,std::wstring
可与 std::u16string
互换,因为 sizeof(wstring::value_type) == sizeof(u16string::value_type)
两者都是 UTF-16(小端)编码。
wstring::value_type = wchar_t
u16string::value_type = char16_t
唯一的区别是 wchar_t
是有符号的,而 char16_t
是无符号的。所以你只需要做符号转换,这可以通过使用以迭代器对作为参数的 u16string
构造函数来执行。此构造函数会将 wchar_t
隐式转换为 char16_t
.
完整示例控制台应用程序:
#include <windows.h>
#include <string>
int main()
{
static_assert( sizeof(std::wstring::value_type) == sizeof(std::u16string::value_type),
"std::wstring and std::u16string are expected to have the same character size" );
std::wstring wstr( L"foo" );
std::u16string u16str( wstr.begin(), wstr.end() );
// The u16string constructor performs an implicit conversion like:
wchar_t wch = L'A';
char16_t ch16 = wch;
// Need to reinterpret_cast because char16_t const* is not implicitly convertible
// to LPCWSTR (aka wchar_t const*).
::MessageBoxW( 0, reinterpret_cast<LPCWSTR>( u16str.c_str() ), L"test", 0 );
return 0;
}
更新
原以为标准版不行,其实是因为Visual C++和libstdc++ 3.4.21运行时库有bug。它确实适用于 clang++ -std=c++14 -stdlib=libc++
。这是一个测试标准方法是否适用于您的编译器的版本:
#include <codecvt>
#include <cstdlib>
#include <cstring>
#include <cwctype>
#include <iostream>
#include <locale>
#include <clocale>
#include <vector>
using std::cout;
using std::endl;
using std::exit;
using std::memcmp;
using std::size_t;
using std::wcout;
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Does magic so that wcout can work.
{
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic.
constexpr char cp_utf16le[] = ".1200";
setlocale( LC_ALL, cp_utf16le );
_setmode( _fileno(stdout), _O_U16TEXT );
#else
// The correct locale name may vary by OS, e.g., "en_US.utf8".
constexpr char locale_name[] = "";
std::locale::global(std::locale(locale_name));
std::wcout.imbue(std::locale());
#endif
}
int main(void)
{
constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600"; // Shouldn't assume endianness.
constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";
init_locale();
const std::codecvt_utf16<wchar_t, 0x1FFFF, std::little_endian> converter_w;
const size_t max_len = sizeof(msg_utf16);
std::vector<char> out(max_len);
std::mbstate_t state;
const wchar_t* from_w = nullptr;
char* to_next = nullptr;
converter_w.out( state, msg_w, msg_w+sizeof(msg_w)/sizeof(wchar_t), from_w, out.data(), out.data() + out.size(), to_next );
if (memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << endl;
} else if ( memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << endl;
} else {
wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << endl;
}
out.clear();
out.resize(max_len);
const std::codecvt_utf16<char32_t, 0x1FFFF, std::little_endian> converter_u32;
const char32_t* from_u32 = nullptr;
converter_u32.out( state, msg_utf32, msg_utf32+sizeof(msg_utf32)/sizeof(char32_t), from_u32, out.data(), out.data() + out.size(), to_next );
if ( memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << endl;
} else {
wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << endl;
}
wcout << msg_w << endl;
return EXIT_SUCCESS;
}
上一个
有点晚了,但这里有一个版本会额外检查 wchar_t
是否为 32 位(因为它在 Linux 上),如果是,则执行代理对转换.我建议将此源保存为带有 BOM 的 UTF-8。 Here is a link to it on ideone.
#include <cassert>
#include <cwctype>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Does magic so that wcout can work.
{
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic.
constexpr char cp_utf16le[] = ".1200";
setlocale( LC_ALL, cp_utf16le );
_setmode( _fileno(stdout), _O_U16TEXT );
#else
// The correct locale name may vary by OS, e.g., "en_US.utf8".
constexpr char locale_name[] = "";
std::locale::global(std::locale(locale_name));
std::wcout.imbue(std::locale());
#endif
}
std::u16string make_u16string( const std::wstring& ws )
/* Creates a UTF-16 string from a wide-character string. Any wide characters
* outside the allowed range of UTF-16 are mapped to the sentinel value U+FFFD,
* per the Unicode documentation. (http://www.unicode.org/faq/private_use.html
* retrieved 12 March 2017.) Unpaired surrogates in ws are also converted to
* sentinel values. Noncharacters, however, are left intact. As a fallback,
* if wide characters are the same size as char16_t, this does a more trivial
* construction using that implicit conversion.
*/
{
/* We assume that, if this test passes, a wide-character string is already
* UTF-16, or at least converts to it implicitly without needing surrogate
* pairs.
*/
if ( sizeof(wchar_t) == sizeof(char16_t) ) {
return std::u16string( ws.begin(), ws.end() );
} else {
/* The conversion from UTF-32 to UTF-16 might possibly require surrogates.
* A surrogate pair suffices to represent all wide characters, because all
* characters outside the range will be mapped to the sentinel value
* U+FFFD. Add one character for the terminating NUL.
*/
const size_t max_len = 2 * ws.length() + 1;
// Our temporary UTF-16 string.
std::u16string result;
result.reserve(max_len);
for ( const wchar_t& wc : ws ) {
const std::wint_t chr = wc;
if ( chr < 0 || chr > 0x10FFFF || (chr >= 0xD800 && chr <= 0xDFFF) ) {
// Invalid code point. Replace with sentinel, per Unicode standard:
constexpr char16_t sentinel = u'\uFFFD';
result.push_back(sentinel);
} else if ( chr < 0x10000UL ) { // In the BMP.
result.push_back(static_cast<char16_t>(wc));
} else {
const char16_t leading = static_cast<char16_t>(
((chr-0x10000UL) / 0x400U) + 0xD800U );
const char16_t trailing = static_cast<char16_t>(
((chr-0x10000UL) % 0x400U) + 0xDC00U );
result.append({leading, trailing});
} // end if
} // end for
/* The returned string is shrunken to fit, which might not be the Right
* Thing if there is more to be added to the string.
*/
result.shrink_to_fit();
// We depend here on the compiler to optimize the move constructor.
return result;
} // end if
// Not reached.
}
int main(void)
{
static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
const std::u16string converted = make_u16string(wtest);
init_locale();
std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";
for( size_t i = 0; i <= u16test.length(); ++i ) {
if ( u16test[i] != converted[i] ) {
std::wcout << std::hex << std::showbase
<< std::right << std::setfill(L'0')
<< std::setw(4) << (unsigned)converted[i] << L" ≠ "
<< std::setw(4) << (unsigned)u16test[i] << L" at "
<< i << L'.' << std::endl;
return EXIT_FAILURE;
} // end if
} // end for
std::wcout << wtest << std::endl;
return EXIT_SUCCESS;
}
脚注
因为有人问:我建议使用带 BOM 的 UTF-8 的原因是某些编译器(包括 MSVC 2015)会假定源文件根据当前代码页进行编码,除非有 BOM 或您指定编码在命令行上。不幸的是,没有一种编码适用于所有工具链,但我使用的每一个足够现代以支持 C++14 的工具也能理解 BOM。
- To convert CString to std:wstring and string
string CString2string(CString str)
{
int bufLen = WideCharToMultiByte(CP_UTF8, 0, (LPCTSTR)str, -1, NULL, 0, NULL,NULL);
char *buf = new char[bufLen];
WideCharToMultiByte(CP_UTF8, 0, (LPCTSTR)str, -1, buf, bufLen, NULL, NULL);
string sRet(buf);
delete[] buf;
return sRet;
}
CString strFileName = "test.txt";
wstring wFileName(strFileName.GetBuffer());
strFileName.ReleaseBuffer();
string sFileName = CString2string(strFileName);
- To convert string to CString
CString string2CString(string s)
{
int bufLen = MultiByteToWideChar(CP_ACP, 0, s.c_str(), -1, NULL, 0);
WCHAR *buf = new WCHAR[bufLen];
MultiByteToWideChar(CP_ACP, 0, s.c_str(), -1, buf, bufLen);
CString strRet(buf);
delete[] buf;
return strRet;
}
string sFileName = "test.txt";
CString strFileName = string2CString(sFileName);