将 Linux 中的 UTF-32 宽字符转换为 UTF-16 宽字符(针对补充平面字符)
Convert UTF-32 wide char to UTF-16 wide char in Linux for Supplementary Plane characters
我们使用 ICU 在 RHEL 上部署了一个 C++ 应用程序。
我们有一种情况需要在 linux 上将 UChar* 转换为 wchar_t*。我们使用 u_strToWCS 来执行转换。
#include <iostream>
#include <wchar.h>
#include "unicode/ustring.h"
/**
 * Converts an ICU UTF-16 string (UChar*) into a newly allocated wchar_t string
 * by calling u_strToWCS twice: once to learn the required length, once to
 * perform the conversion.
 *
 * @param cuniszSource        UTF-16 input passed straight to u_strToWCS.
 * @param cunii32SourceLength Length of the input, passed straight to u_strToWCS.
 * @param rpwcharDestination  Receives a buffer allocated with new[] that the
 *                            caller must release with delete[]; set to nullptr
 *                            when the conversion fails.
 * @param destCapacity        Despite its name, receives the number of wchar_t
 *                            units written (terminator excluded); 0 on failure.
 */
void convertUnicodeStringtoWideChar(const UChar* cuniszSource,
                                    const int32_t cunii32SourceLength,
                                    wchar_t*& rpwcharDestination,
                                    int32_t& destCapacity)
{
    rpwcharDestination = nullptr;
    destCapacity = 0;

    // First call: null buffer and zero capacity, used only to obtain the
    // required length in pDestLength. The error it reports for the missing
    // buffer is expected, so the status is reset before the real conversion.
    UErrorCode uniUErrorCode = U_ZERO_ERROR;
    int32_t pDestLength = 0;
    u_strToWCS(nullptr,
               0,
               &pDestLength,
               cuniszSource,
               cunii32SourceLength,
               &uniUErrorCode);
    uniUErrorCode = U_ZERO_ERROR;

    // One extra unit for the terminator; value-initialised so the buffer is
    // never read while indeterminate. new[] throws on failure, so no null
    // check is needed.
    const int32_t capacity = pDestLength + 1;
    wchar_t* const buffer = new wchar_t[capacity]();

    u_strToWCS(buffer,
               capacity,
               &pDestLength,
               cuniszSource,
               cunii32SourceLength,
               &uniUErrorCode);

    // Values greater than U_ZERO_ERROR are failures (this is what ICU's
    // U_FAILURE macro tests). Do not hand a half-written buffer to the caller.
    if (uniUErrorCode > U_ZERO_ERROR)
    {
        delete[] buffer;
        return;
    }

    rpwcharDestination = buffer;
    // Use the length ICU reported instead of wcslen(), which would stop at an
    // embedded NUL.
    destCapacity = pDestLength;
} //function ends
// Demo: converts a short UTF-16 sample and prints each resulting wide
// character as a hexadecimal number, one per line.
int main()
{
    // a, a-umlaut, S-caron, euro sign, then U+2F929 written as the surrogate
    // pair D87E DD29.
    UChar input[20] = { 0x0061, 0x00e4, 0x0160, 0x20ac, 0xd87e, 0xdd29, 0x0000 };
    wchar_t* output = nullptr;
    int32_t outlen = 0;
    convertUnicodeStringtoWideChar( input, 6, output, outlen );
    for ( int32_t i = 0; i < outlen; ++i )
    {
        // Print the numeric value explicitly: streaming a wchar_t into a
        // narrow ostream relies on an integer promotion that C++20 removed.
        std::cout << std::hex << static_cast<unsigned int>(output[i]) << "\n";
    }
    // The converter allocates with new[]; release the buffer here.
    delete[] output;
    return 0;
}
这对码点不超过 65535(U+FFFF)的输入有效(因为 UChar 在 linux 上在内部实现为 uint16_t)。它无法转换基本多语言平面之外的字符(例如 CJK Unified Ideographs Extension B)
关于如何执行转换有什么想法吗?
更新 1:好的。我看错方向了。 u_strToWCS 工作正常。出现问题是因为我需要使用 CORBA 将该宽字符串传递给 windows 上的 java 应用程序。由于linux中的wchar_t是32位的,我需要找到一种方法将32位wchar_t转换为16位wchar_t
更新 2:我最终使用的代码见下文(“以下是将UTF-32编码的宽字符转为UTF-16的代码”一节)。
在 C++11 及更高版本中,此转换在标准库中,在 <codecvt>
header 中。下面是一些在 UTF-16、UCS-4 和 wchar_t
之间转换的示例代码。 (由于开发树中已修复的错误,它在 libstdc++ 6.4.9 上中断。)
#include <codecvt>
#include <cstdlib>
#include <cstring>
#include <cwctype>
#include <iostream>
#include <locale>
#include <vector>
using std::cout;
using std::endl;
using std::exit;
using std::memcmp;
using std::size_t;
using std::wcout;
// Demo: encodes the same message from wchar_t and from char32_t with
// std::codecvt_utf16 and compares the produced bytes with a UTF-16 literal.
int main(void)
{
  // The same text in four encodings. The UTF-16 array is compared byte-wise
  // with little-endian converter output below, so that comparison is only
  // meaningful on a little-endian host.
  constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600";
  constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
  constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
  constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";

  // May vary from OS to OS: "" is the most standard, but might require, e.g. "en_US.utf8".
  constexpr char locale_name[] = "";
  std::locale::global(std::locale(locale_name));
  std::wcout.imbue(std::locale());

  // The output buffer is sized to the UTF-16 literal, terminator included.
  const std::size_t max_len = sizeof(msg_utf16);
  std::vector<char> out(max_len);
  char* to_next = nullptr;

  // --- wchar_t -> UTF-16LE ---
  const std::codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> converter_w;
  // The conversion state must be zero-initialised before its first use;
  // reading an indeterminate mbstate_t is undefined behaviour.
  std::mbstate_t state{};
  const wchar_t* from_w = nullptr;
  const std::codecvt_base::result result_w =
      converter_w.out( state, msg_w, msg_w+sizeof(msg_w)/sizeof(wchar_t), from_w,
                       out.data(), out.data() + out.size(), to_next );
  if ( result_w != std::codecvt_base::ok ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion failed!" << std::endl;
  } else if ( std::memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << std::endl;
  } else if ( std::memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << std::endl;
  } else {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << std::endl;
  }

  // --- char32_t -> UTF-16LE ---
  // Start from a zeroed buffer and a fresh state: the two conversions are
  // independent of each other.
  out.assign(max_len, 0);
  state = std::mbstate_t{};
  const std::codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> converter_u32;
  const char32_t* from_u32 = nullptr;
  const std::codecvt_base::result result_u32 =
      converter_u32.out( state, msg_utf32, msg_utf32+sizeof(msg_utf32)/sizeof(char32_t), from_u32,
                         out.data(), out.data() + out.size(), to_next );
  if ( result_u32 != std::codecvt_base::ok ) {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion failed!" << std::endl;
  } else if ( std::memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << std::endl;
  } else {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << std::endl;
  }

  std::wcout << msg_w << std::endl;
  return EXIT_SUCCESS;
}
这两个 facet 在 C++17 中被弃用,但并非 <codecvt>
中的所有 facet 都是如此。特别是,标准库仍将支持 std::codecvt<char, char, std::mbstate_t>
、std::codecvt<char16_t, char, std::mbstate_t>
、std::codecvt<char32_t, char, std::mbstate_t>
和 std::codecvt<wchar_t, char, std::mbstate_t>
.
您没有详细说明 Linux 上这些 UTF-16 数据的来源,但了解来源也许能提示一种做法。如果要处理文件,您可以在流上使用 imbue()
来转换读取和写入的数据,如果要处理 Qt 框架,QString
和 QTextCodec
提供转换功能。不过,ICU 应该支持整个范围的 UTF-16。
更新 1
真正的问题是如何从宽字符串转换为 UTF-16。我的示例就是这样做的,但是如果您想使用 ICU,它有 u_strFromWCS()
、u_strFromUTF32()
和 UnicodeString::fromUTF32()
.
如果您更喜欢 ICU 而不是 STL 的原因是 STL 的转换 facet 声称是 locale-independent,请注意这些 ICU 转换函数也都声称是 locale-independent。这是因为不同 UTF 编码之间的转换是完全算法化的并且独立于语言环境! (排序顺序和大小写映射等其他功能则依赖语言环境,但 UTF 转换不依赖。)事实上,STL 确实允许您从特定语言环境请求转换 facet,如果您愿意,可以使用 locale::use_facet<codecvt<...>>()
,并且这在 C++17 中并未被弃用。然而,只有与 UTF-8 之间的转换需要以这种方式实现。 “In addition, every locale object constructed in a C++ program implements its own (locale-specific) versions of these four specializations.” 在我的测试中,库的现有实现不支持 locale().use_facet<std::codecvt<wchar_t,char16_t,mbstate_t>>()
。
更新 2
我在这里重新贴出我手写的 wchar_t
到 UTF-16
转换器(来自我的另一个回答)。它接受一个 std::wstring
并返回一个 std::u16string
,但该算法可以很容易地适应任何其他容器。不过,u16string
至少与任何其他需要动态内存的数据结构一样高效。
你可能想做的一个改变是,我为最坏的可能情况分配了足够的内存,给定输入字符串的长度,然后是 shrink_to_fit()
。这应该不会比最初像 UTF-32 那样对字符串进行编码浪费更多的内存。但是,您的数据完全不在 BMP 中的可能性极小,因此您也可以先做一次初始遍历来计算转换需要多少内存,或者假设实际使用中代理项对很少,并接受偶尔必须调整大小并复制目标数组的可能性。
#include <cassert>
#include <cwctype>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Sets up the locale (and, on Windows, the stdout mode) so that std::wcout
// can be used for wide-character output.
{
#if _WIN32 || _WIN64
  // Windows path: select the ".1200" locale (treated by the original author
  // as the UTF-16LE code page) and switch stdout to UTF-16 text mode.
  setlocale( LC_ALL, ".1200" );
  _setmode( _fileno(stdout), _O_U16TEXT );
#else
  // Elsewhere: install the locale named "" as the global locale and imbue
  // std::wcout with it. The right name may differ per OS, e.g. "en_US.utf8".
  const std::locale environment_locale("");
  std::locale::global(environment_locale);
  std::wcout.imbue(environment_locale);
#endif
}
std::u16string make_u16string( const std::wstring& ws )
/* Builds a UTF-16 string from a wide-character string.
 *
 * When wchar_t is wider than char16_t, each element of ws is handled as one
 * code point:
 *   - values above 0x10FFFF, negative values, and values in the surrogate
 *     range 0xD800-0xDFFF become the replacement character U+FFFD;
 *   - values below 0x10000 are copied as a single code unit;
 *   - everything else is written as a leading/trailing surrogate pair.
 * Noncharacters are not filtered.
 *
 * When wchar_t and char16_t have the same size, the elements are copied
 * one-to-one with no validation at all.
 */
{
  // Same width: the input is taken to be UTF-16 already.
  if ( sizeof(wchar_t) == sizeof(char16_t) )
    return std::u16string( ws.begin(), ws.end() );

  // Worst case is two code units per input element; the extra slot mirrors
  // the original allowance for a terminator.
  std::u16string utf16;
  utf16.reserve( 2 * ws.length() + 1 );

  for ( const wchar_t wide : ws ) {
    const std::wint_t code_point = wide;
    const bool out_of_range = code_point < 0 || code_point > 0x10FFFF;
    const bool is_surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;

    if ( out_of_range || is_surrogate ) {
      // Not a valid scalar value: emit the replacement character instead.
      utf16.push_back( u'\uFFFD' );
      continue;
    }

    if ( code_point < 0x10000UL ) {
      // Basic Multilingual Plane: one code unit, value unchanged.
      utf16.push_back( static_cast<char16_t>(wide) );
      continue;
    }

    // Supplementary plane: split the 20-bit offset into two surrogates.
    const auto offset = code_point - 0x10000UL;
    utf16.push_back( static_cast<char16_t>( (offset / 0x400U) + 0xD800U ) );
    utf16.push_back( static_cast<char16_t>( (offset % 0x400U) + 0xDC00U ) );
  }

  // Give back the unused part of the worst-case reservation. Callers that
  // intend to keep appending may prefer to skip this.
  utf16.shrink_to_fit();
  return utf16;
}
// Self-check: converts a wide test string with make_u16string() and compares
// it, code unit by code unit, with the equivalent UTF-16 literal.
int main(void)
{
  static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
  static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
  const std::u16string converted = make_u16string(wtest);

  init_locale();
  std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";

  // Walk up to and including index length(): operator[] yields the
  // terminator there, so a length difference shows up as a mismatch too.
  const std::size_t last = u16test.length();
  std::size_t pos = 0;
  while ( pos <= last && u16test[pos] == converted[pos] ) {
    ++pos;
  }

  if ( pos <= last ) {
    // Report the first differing code unit and fail.
    std::wcout << std::hex << std::showbase
               << std::right << std::setfill(L'0')
               << std::setw(4) << static_cast<unsigned>(converted[pos]) << L" ≠ "
               << std::setw(4) << static_cast<unsigned>(u16test[pos]) << L" at "
               << pos << L'.' << std::endl;
    return EXIT_FAILURE;
  }

  std::wcout << wtest << std::endl;
  return EXIT_SUCCESS;
}
以下是将UTF-32编码的宽字符转为UTF-16的代码
// Converts a string of platform "wide characters" (wchar_t holding one code
// point each) to UTF-16 code units stored in a wchar_t buffer.
//
// source            Input code points; not modified.
// sourceLength      Number of elements to read from source.
// destination       Caller-provided output buffer. There is no capacity
//                   parameter, so it must hold at least 2 * sourceLength + 1
//                   elements (every input may become a surrogate pair, plus
//                   the terminator). Left pointing at the start of the buffer.
//                   If null, nothing is written.
// destinationLength Receives the number of code units written, terminator
//                   excluded. Any previous value is discarded.
//
// Values in the surrogate range 0xD800-0xDFFF and values above 0x10FFFF
// (including negative wchar_t values) are replaced with U+FFFD.
void ConvertUTF32ToUTF16(wchar_t* source,
                         const uint32_t sourceLength,
                         wchar_t*& destination,
                         uint32_t& destinationLength)
{
    // Start from zero so a stale caller value cannot shift the terminator.
    destinationLength = 0;
    if (nullptr == destination)
    {
        return;
    }

    const uint32_t replacementCharacter = 0xFFFD;
    // A null source is treated as an empty string.
    const uint32_t unitsToRead = (nullptr != source) ? sourceLength : 0;
    wchar_t* out = destination;

    for (uint32_t index = 0; index < unitsToRead; ++index)
    {
        // Work on an unsigned value: where wchar_t is signed, a negative
        // element would otherwise pass the "<= 0xFFFF" test and be copied.
        uint32_t codePoint = static_cast<uint32_t>(source[index]);

        if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
        {
            // Surrogate values are not valid code points.
            *out++ = static_cast<wchar_t>(replacementCharacter);
        }
        else if (codePoint <= 0xFFFF)
        {
            // BMP character: copied as a single code unit.
            *out++ = static_cast<wchar_t>(codePoint);
        }
        else if (codePoint > 0x10FFFF)
        {
            // U+10FFFF is the largest code point of the Unicode character set.
            *out++ = static_cast<wchar_t>(replacementCharacter);
        }
        else
        {
            // 0x10000 - 0x10FFFF: split the 20-bit offset into a surrogate pair.
            codePoint -= 0x10000UL;
            *out++ = static_cast<wchar_t>((codePoint >> 10) + 0xD800);
            *out++ = static_cast<wchar_t>((codePoint & 0x3FFUL) + 0xDC00);
        }
    }

    // NUL-terminate the output and report how many code units precede it.
    *out = L'\0';
    destinationLength = static_cast<uint32_t>(out - destination);
} //function ends
我们使用 ICU 在 RHEL 上部署了一个 C++ 应用程序。
我们有一种情况需要在 linux 上将 UChar* 转换为 wchar_t*。我们使用 u_strToWCS 来执行转换。
#include <iostream>
#include <wchar.h>
#include "unicode/ustring.h"
/**
 * Converts an ICU UTF-16 string (UChar*) into a newly allocated wchar_t string
 * by calling u_strToWCS twice: once to learn the required length, once to
 * perform the conversion.
 *
 * @param cuniszSource        UTF-16 input passed straight to u_strToWCS.
 * @param cunii32SourceLength Length of the input, passed straight to u_strToWCS.
 * @param rpwcharDestination  Receives a buffer allocated with new[] that the
 *                            caller must release with delete[]; set to nullptr
 *                            when the conversion fails.
 * @param destCapacity        Despite its name, receives the number of wchar_t
 *                            units written (terminator excluded); 0 on failure.
 */
void convertUnicodeStringtoWideChar(const UChar* cuniszSource,
                                    const int32_t cunii32SourceLength,
                                    wchar_t*& rpwcharDestination,
                                    int32_t& destCapacity)
{
    rpwcharDestination = nullptr;
    destCapacity = 0;

    // First call: null buffer and zero capacity, used only to obtain the
    // required length in pDestLength. The error it reports for the missing
    // buffer is expected, so the status is reset before the real conversion.
    UErrorCode uniUErrorCode = U_ZERO_ERROR;
    int32_t pDestLength = 0;
    u_strToWCS(nullptr,
               0,
               &pDestLength,
               cuniszSource,
               cunii32SourceLength,
               &uniUErrorCode);
    uniUErrorCode = U_ZERO_ERROR;

    // One extra unit for the terminator; value-initialised so the buffer is
    // never read while indeterminate. new[] throws on failure, so no null
    // check is needed.
    const int32_t capacity = pDestLength + 1;
    wchar_t* const buffer = new wchar_t[capacity]();

    u_strToWCS(buffer,
               capacity,
               &pDestLength,
               cuniszSource,
               cunii32SourceLength,
               &uniUErrorCode);

    // Values greater than U_ZERO_ERROR are failures (this is what ICU's
    // U_FAILURE macro tests). Do not hand a half-written buffer to the caller.
    if (uniUErrorCode > U_ZERO_ERROR)
    {
        delete[] buffer;
        return;
    }

    rpwcharDestination = buffer;
    // Use the length ICU reported instead of wcslen(), which would stop at an
    // embedded NUL.
    destCapacity = pDestLength;
} //function ends
// Demo: converts a short UTF-16 sample and prints each resulting wide
// character as a hexadecimal number, one per line.
int main()
{
    // a, a-umlaut, S-caron, euro sign, then U+2F929 written as the surrogate
    // pair D87E DD29.
    UChar input[20] = { 0x0061, 0x00e4, 0x0160, 0x20ac, 0xd87e, 0xdd29, 0x0000 };
    wchar_t* output = nullptr;
    int32_t outlen = 0;
    convertUnicodeStringtoWideChar( input, 6, output, outlen );
    for ( int32_t i = 0; i < outlen; ++i )
    {
        // Print the numeric value explicitly: streaming a wchar_t into a
        // narrow ostream relies on an integer promotion that C++20 removed.
        std::cout << std::hex << static_cast<unsigned int>(output[i]) << "\n";
    }
    // The converter allocates with new[]; release the buffer here.
    delete[] output;
    return 0;
}
这对码点不超过 65535(U+FFFF)的输入有效(因为 UChar 在 linux 上在内部实现为 uint16_t)。它无法转换基本多语言平面之外的字符(例如 CJK Unified Ideographs Extension B)
关于如何执行转换有什么想法吗?
更新 1:好的。我看错方向了。 u_strToWCS 工作正常。出现问题是因为我需要使用 CORBA 将该宽字符串传递给 windows 上的 java 应用程序。由于linux中的wchar_t是32位的,我需要找到一种方法将32位wchar_t转换为16位wchar_t
更新 2:我最终使用的代码见下文(“以下是将UTF-32编码的宽字符转为UTF-16的代码”一节)。
在 C++11 及更高版本中,此转换在标准库中,在 <codecvt>
header 中。下面是一些在 UTF-16、UCS-4 和 wchar_t
之间转换的示例代码。 (由于开发树中已修复的错误,它在 libstdc++ 6.4.9 上中断。)
#include <codecvt>
#include <cstdlib>
#include <cstring>
#include <cwctype>
#include <iostream>
#include <locale>
#include <vector>
using std::cout;
using std::endl;
using std::exit;
using std::memcmp;
using std::size_t;
using std::wcout;
// Demo: encodes the same message from wchar_t and from char32_t with
// std::codecvt_utf16 and compares the produced bytes with a UTF-16 literal.
int main(void)
{
  // The same text in four encodings. The UTF-16 array is compared byte-wise
  // with little-endian converter output below, so that comparison is only
  // meaningful on a little-endian host.
  constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600";
  constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
  constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
  constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";

  // May vary from OS to OS: "" is the most standard, but might require, e.g. "en_US.utf8".
  constexpr char locale_name[] = "";
  std::locale::global(std::locale(locale_name));
  std::wcout.imbue(std::locale());

  // The output buffer is sized to the UTF-16 literal, terminator included.
  const std::size_t max_len = sizeof(msg_utf16);
  std::vector<char> out(max_len);
  char* to_next = nullptr;

  // --- wchar_t -> UTF-16LE ---
  const std::codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> converter_w;
  // The conversion state must be zero-initialised before its first use;
  // reading an indeterminate mbstate_t is undefined behaviour.
  std::mbstate_t state{};
  const wchar_t* from_w = nullptr;
  const std::codecvt_base::result result_w =
      converter_w.out( state, msg_w, msg_w+sizeof(msg_w)/sizeof(wchar_t), from_w,
                       out.data(), out.data() + out.size(), to_next );
  if ( result_w != std::codecvt_base::ok ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion failed!" << std::endl;
  } else if ( std::memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << std::endl;
  } else if ( std::memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << std::endl;
  } else {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << std::endl;
  }

  // --- char32_t -> UTF-16LE ---
  // Start from a zeroed buffer and a fresh state: the two conversions are
  // independent of each other.
  out.assign(max_len, 0);
  state = std::mbstate_t{};
  const std::codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> converter_u32;
  const char32_t* from_u32 = nullptr;
  const std::codecvt_base::result result_u32 =
      converter_u32.out( state, msg_utf32, msg_utf32+sizeof(msg_utf32)/sizeof(char32_t), from_u32,
                         out.data(), out.data() + out.size(), to_next );
  if ( result_u32 != std::codecvt_base::ok ) {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion failed!" << std::endl;
  } else if ( std::memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << std::endl;
  } else {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << std::endl;
  }

  std::wcout << msg_w << std::endl;
  return EXIT_SUCCESS;
}
这两个 facet 在 C++17 中被弃用,但并非 <codecvt>
中的所有 facet 都是如此。特别是,标准库仍将支持 std::codecvt<char, char, std::mbstate_t>
、std::codecvt<char16_t, char, std::mbstate_t>
、std::codecvt<char32_t, char, std::mbstate_t>
和 std::codecvt<wchar_t, char, std::mbstate_t>
.
您没有详细说明 Linux 上这些 UTF-16 数据的来源,但了解来源也许能提示一种做法。如果要处理文件,您可以在流上使用 imbue()
来转换读取和写入的数据,如果要处理 Qt 框架,QString
和 QTextCodec
提供转换功能。不过,ICU 应该支持整个范围的 UTF-16。
更新 1
真正的问题是如何从宽字符串转换为 UTF-16。我的示例就是这样做的,但是如果您想使用 ICU,它有 u_strFromWCS()
、u_strFromUTF32()
和 UnicodeString::fromUTF32()
.
如果您更喜欢 ICU 而不是 STL 的原因是 STL 的转换 facet 声称是 locale-independent,请注意这些 ICU 转换函数也都声称是 locale-independent。这是因为不同 UTF 编码之间的转换是完全算法化的并且独立于语言环境! (排序顺序和大小写映射等其他功能则依赖语言环境,但 UTF 转换不依赖。)事实上,STL 确实允许您从特定语言环境请求转换 facet,如果您愿意,可以使用 locale::use_facet<codecvt<...>>()
,并且这在 C++17 中并未被弃用。然而,只有与 UTF-8 之间的转换需要以这种方式实现。 “In addition, every locale object constructed in a C++ program implements its own (locale-specific) versions of these four specializations.” 在我的测试中,库的现有实现不支持 locale().use_facet<std::codecvt<wchar_t,char16_t,mbstate_t>>()
。
更新 2
我在这里重新贴出我手写的 wchar_t
到 UTF-16
转换器(来自我的另一个回答)。它接受一个 std::wstring
并返回一个 std::u16string
,但该算法可以很容易地适应任何其他容器。不过,u16string
至少与任何其他需要动态内存的数据结构一样高效。
你可能想做的一个改变是,我为最坏的可能情况分配了足够的内存,给定输入字符串的长度,然后是 shrink_to_fit()
。这应该不会比最初像 UTF-32 那样对字符串进行编码浪费更多的内存。但是,您的数据完全不在 BMP 中的可能性极小,因此您也可以先做一次初始遍历来计算转换需要多少内存,或者假设实际使用中代理项对很少,并接受偶尔必须调整大小并复制目标数组的可能性。
#include <cassert>
#include <cwctype>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif
using std::size_t;
void init_locale(void)
// Sets up the locale (and, on Windows, the stdout mode) so that std::wcout
// can be used for wide-character output.
{
#if _WIN32 || _WIN64
  // Windows path: select the ".1200" locale (treated by the original author
  // as the UTF-16LE code page) and switch stdout to UTF-16 text mode.
  setlocale( LC_ALL, ".1200" );
  _setmode( _fileno(stdout), _O_U16TEXT );
#else
  // Elsewhere: install the locale named "" as the global locale and imbue
  // std::wcout with it. The right name may differ per OS, e.g. "en_US.utf8".
  const std::locale environment_locale("");
  std::locale::global(environment_locale);
  std::wcout.imbue(environment_locale);
#endif
}
std::u16string make_u16string( const std::wstring& ws )
/* Builds a UTF-16 string from a wide-character string.
 *
 * When wchar_t is wider than char16_t, each element of ws is handled as one
 * code point:
 *   - values above 0x10FFFF, negative values, and values in the surrogate
 *     range 0xD800-0xDFFF become the replacement character U+FFFD;
 *   - values below 0x10000 are copied as a single code unit;
 *   - everything else is written as a leading/trailing surrogate pair.
 * Noncharacters are not filtered.
 *
 * When wchar_t and char16_t have the same size, the elements are copied
 * one-to-one with no validation at all.
 */
{
  // Same width: the input is taken to be UTF-16 already.
  if ( sizeof(wchar_t) == sizeof(char16_t) )
    return std::u16string( ws.begin(), ws.end() );

  // Worst case is two code units per input element; the extra slot mirrors
  // the original allowance for a terminator.
  std::u16string utf16;
  utf16.reserve( 2 * ws.length() + 1 );

  for ( const wchar_t wide : ws ) {
    const std::wint_t code_point = wide;
    const bool out_of_range = code_point < 0 || code_point > 0x10FFFF;
    const bool is_surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;

    if ( out_of_range || is_surrogate ) {
      // Not a valid scalar value: emit the replacement character instead.
      utf16.push_back( u'\uFFFD' );
      continue;
    }

    if ( code_point < 0x10000UL ) {
      // Basic Multilingual Plane: one code unit, value unchanged.
      utf16.push_back( static_cast<char16_t>(wide) );
      continue;
    }

    // Supplementary plane: split the 20-bit offset into two surrogates.
    const auto offset = code_point - 0x10000UL;
    utf16.push_back( static_cast<char16_t>( (offset / 0x400U) + 0xD800U ) );
    utf16.push_back( static_cast<char16_t>( (offset % 0x400U) + 0xDC00U ) );
  }

  // Give back the unused part of the worst-case reservation. Callers that
  // intend to keep appending may prefer to skip this.
  utf16.shrink_to_fit();
  return utf16;
}
// Self-check: converts a wide test string with make_u16string() and compares
// it, code unit by code unit, with the equivalent UTF-16 literal.
int main(void)
{
  static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
  static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
  const std::u16string converted = make_u16string(wtest);

  init_locale();
  std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";

  // Walk up to and including index length(): operator[] yields the
  // terminator there, so a length difference shows up as a mismatch too.
  const std::size_t last = u16test.length();
  std::size_t pos = 0;
  while ( pos <= last && u16test[pos] == converted[pos] ) {
    ++pos;
  }

  if ( pos <= last ) {
    // Report the first differing code unit and fail.
    std::wcout << std::hex << std::showbase
               << std::right << std::setfill(L'0')
               << std::setw(4) << static_cast<unsigned>(converted[pos]) << L" ≠ "
               << std::setw(4) << static_cast<unsigned>(u16test[pos]) << L" at "
               << pos << L'.' << std::endl;
    return EXIT_FAILURE;
  }

  std::wcout << wtest << std::endl;
  return EXIT_SUCCESS;
}
以下是将UTF-32编码的宽字符转为UTF-16的代码
// Converts a string of platform "wide characters" (wchar_t holding one code
// point each) to UTF-16 code units stored in a wchar_t buffer.
//
// source            Input code points; not modified.
// sourceLength      Number of elements to read from source.
// destination       Caller-provided output buffer. There is no capacity
//                   parameter, so it must hold at least 2 * sourceLength + 1
//                   elements (every input may become a surrogate pair, plus
//                   the terminator). Left pointing at the start of the buffer.
//                   If null, nothing is written.
// destinationLength Receives the number of code units written, terminator
//                   excluded. Any previous value is discarded.
//
// Values in the surrogate range 0xD800-0xDFFF and values above 0x10FFFF
// (including negative wchar_t values) are replaced with U+FFFD.
void ConvertUTF32ToUTF16(wchar_t* source,
                         const uint32_t sourceLength,
                         wchar_t*& destination,
                         uint32_t& destinationLength)
{
    // Start from zero so a stale caller value cannot shift the terminator.
    destinationLength = 0;
    if (nullptr == destination)
    {
        return;
    }

    const uint32_t replacementCharacter = 0xFFFD;
    // A null source is treated as an empty string.
    const uint32_t unitsToRead = (nullptr != source) ? sourceLength : 0;
    wchar_t* out = destination;

    for (uint32_t index = 0; index < unitsToRead; ++index)
    {
        // Work on an unsigned value: where wchar_t is signed, a negative
        // element would otherwise pass the "<= 0xFFFF" test and be copied.
        uint32_t codePoint = static_cast<uint32_t>(source[index]);

        if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
        {
            // Surrogate values are not valid code points.
            *out++ = static_cast<wchar_t>(replacementCharacter);
        }
        else if (codePoint <= 0xFFFF)
        {
            // BMP character: copied as a single code unit.
            *out++ = static_cast<wchar_t>(codePoint);
        }
        else if (codePoint > 0x10FFFF)
        {
            // U+10FFFF is the largest code point of the Unicode character set.
            *out++ = static_cast<wchar_t>(replacementCharacter);
        }
        else
        {
            // 0x10000 - 0x10FFFF: split the 20-bit offset into a surrogate pair.
            codePoint -= 0x10000UL;
            *out++ = static_cast<wchar_t>((codePoint >> 10) + 0xD800);
            *out++ = static_cast<wchar_t>((codePoint & 0x3FFUL) + 0xDC00);
        }
    }

    // NUL-terminate the output and report how many code units precede it.
    *out = L'\0';
    destinationLength = static_cast<uint32_t>(out - destination);
} //function ends