String.cString(using: String.Encoding.utf16) 是否正常工作?
Is String.cString(using: String.Encoding.utf16) working correctly?
我发现 String.cString(using: String.Encoding.utf16) 有奇怪的行为。
我想要做的是把 Swift 字符串转换为 UnsafePointer<UInt16> 形式的 UTF-16。但是,String.Encoding.utf16 似乎在输出第一个 '\0' 之后就终止了转换。
相同的逻辑对 UTF-8 和 Shift-JIS 有效,因为它们的编码结果中间不会出现 NUL(0x00)字节。
这是代码。
/// Converts one sample string to C-style buffers in several encodings, dumps
/// each buffer with `dumpmem`, and decodes it back into a `String`.
///
/// This is the code as posted in the question: case (C) is the one that
/// misbehaves, and cases (A)-(C) deliberately keep the original pointer handling.
func test() {
    let str = "helloはろー"
    let cnt = str.characters.count

    // (A) utf-8 ==> works fine
    let utf8 = str.cString(using: String.Encoding.utf8)!
    // NOTE(review): the base address is returned out of the closure; the
    // pointer is only guaranteed valid while the closure is executing.
    let int8p = utf8.withUnsafeBufferPointer {
        UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: Int8.self)
    }
    dumpmem(0, int8p)
    let str1 = String(cString: int8p)
    print(str1) // helloはろー

    // (B) shift-JIS ==> works fine
    let sjis = str.cString(using: String.Encoding.shiftJIS)!
    let uint8p = sjis.withUnsafeBufferPointer {
        UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
    }
    dumpmem(0, uint8p)
    let str2 = String(sjisptr: uint8p)!
    print(str2) // helloはろー

    // (C) utf-16 ==> doesn't work nicely
    let utf16 = str.cString(using: String.Encoding.utf16)!
    let uint16p = utf16.withUnsafeBufferPointer {
        UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt16.self)
    }
    dumpmem(Int32(str.characters.count*2), uint16p) // only top char (='h' converted)
    let str3 = String(utf16ptr: uint16p)!
    print(str3) // h+garbage ...

    // (D) utf-16 w/ iteration ==> works fine.
    let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: cnt*4)
    var len = 0
    for code in str.utf16 {
        u16s[len] = code
        len += 1
    }
    u16s[len] = 0 // terminating U+0000 expected by String(utf16ptr:)
    dumpmem(Int32(len*2), u16s)
    let str4 = String(utf16ptr: u16s)!
    u16s.deallocate(capacity: cnt*4)
    print(str4) // helloはろー
}
然后是支持函数。
/*
 * Dump memory to stdout, one byte per line, formatted as
 * "[index] : decimal : 0xhex", followed by " : <char>" when isascii()
 * accepts the byte.
 *
 * len  number of bytes to dump; 0 means "treat ptr as a NUL-terminated
 *      C string and dump strlen(ptr) bytes".
 * ptr  start of the memory to dump.
 */
void dumpmem(int len, const void *ptr) {
    const unsigned char *bytes = (const unsigned char *)ptr;
    int count = len ? len : (int)strlen((const char *)bytes);
    int i = 0;
    while (i < count) {
        unsigned char b = bytes[i];
        printf("[%d] : %d : 0x%x", i, b, b);
        if (isascii(b)) {
            printf(" : %c", b);
        }
        printf("\n");
        i++;
    }
}
// Create a string from a NUL-terminated Shift JIS byte sequence (UInt8 pointer).
extension String {
    /// Decodes the bytes starting at `sjisptr`, up to but not including the
    /// first 0x00 byte, as Shift JIS.
    ///
    /// - Parameter sjisptr: Pointer to Shift JIS bytes terminated by 0x00.
    /// - Returns: `nil` when the bytes cannot be decoded as Shift JIS.
    init?(sjisptr: UnsafePointer<UInt8>) {
        // Walk forward to the terminating NUL to find the payload length.
        var end = sjisptr
        while end.pointee != 0 {
            end += 1
        }
        let payload = Data(bytes: sjisptr, count: end - sjisptr)
        guard let decoded = NSString(data: payload, encoding: String.Encoding.shiftJIS.rawValue) else {
            return nil
        }
        self = decoded as String
    }
}
// Create a string from a U+0000-terminated UTF-16 sequence (UInt16 pointer).
extension String {
    /// Decodes the code units starting at `utf16ptr`, up to but not including
    /// the first zero code unit, as little-endian UTF-16.
    ///
    /// - Parameter utf16ptr: Pointer to UTF-16 code units terminated by 0.
    /// - Returns: `nil` when the bytes cannot be decoded.
    init?(utf16ptr: UnsafePointer<UInt16>) {
        // Count code units before the terminator.
        var unitCount = 0
        while utf16ptr[unitCount] != 0 {
            unitCount += 1
        }
        // Data wants a byte count, not a code-unit count.
        let byteCount = unitCount * MemoryLayout<UInt16>.size
        let payload = Data(bytes: utf16ptr, count: byteCount)
        guard let decoded = NSString(data: payload, encoding: String.Encoding.utf16LittleEndian.rawValue) else {
            return nil
        }
        self = decoded as String
    }
}
由于 cString(using: String.Encoding.utf16) 无法很好地完成 UTF-16 转换(情形 (C)),我最终改为逐个迭代(扫描)字符串的编码单元,如上面的 (D) 所示。
我可能遗漏了什么。谁能解释为什么会这样?
String.cString(using: String.Encoding.utf16) 是否正常工作?
答案是否定的
单词"CString"表示以NUL结尾的字节序列。而 NUL 在 CString 中总是表示单字节 0x00。
如您所知,UTF-16 表示可能包含许多 0x00 字节,此类编码不能用作 CString。
以及修复代码的其他要点:
- 您不应该把 bufferPointer 或其 baseAddress 带出传递给 withUnsafeBufferPointer(_:) 的闭包。
Parameters
body
A closure with an UnsafeBufferPointer parameter that points to the
contiguous storage for the array. If body has a return value, it is
used as the return value for the withUnsafeBufferPointer(_:) method.
The pointer argument is valid only for the duration of the closure’s execution.
- 而且在很多情况下,您并不需要使用 withUnsafeBufferPointer(_:)。
- 要获取 UTF-16 表示的大小,应该使用 str.utf16.count,而不是 str.characters.count。
- Swift 3 的 String 有一个初始化方法 init(cString:encoding:),您无需重新实现它。
- len += len 有点"难以阅读"(unreadable)。请使用 len * MemoryLayout<UInt16>.size,或者至少使用 len * 2。
试试这个代码:
// dump memory written in Swift
/// Prints the bytes at `ptr`, one per line, as "[index] : decimal : 0xhex",
/// followed by " : <char>" when `isascii` accepts the byte.
///
/// - Parameters:
///   - len: Number of bytes to dump. Pass 0 to treat `ptr` as a NUL-terminated
///     C string and dump `strlen(ptr)` bytes. Nothing is printed when the
///     resulting length is not positive.
///   - ptr: Start of the memory to dump.
func dumpmem(_ len: Int, _ ptr: UnsafeRawPointer) {
    let byteCount = len == 0 ? Int(strlen(ptr.assumingMemoryBound(to: CChar.self))) : len
    guard byteCount > 0 else { return }
    let bytes = UnsafeBufferPointer(start: ptr.assumingMemoryBound(to: UInt8.self), count: byteCount)
    for (offset, byte) in bytes.enumerated() {
        // Every conversion is numbered: mixing numbered (%2$d) and unnumbered
        // (%d) specifications in one format string is undefined in printf.
        var line = String(format: "[%1$ld] : %2$d : 0x%2$x", offset, byte)
        if isascii(Int32(byte)) != 0 {
            line += String(format: " : %c", byte)
        }
        print(line)
    }
}
// Taking UnsafePointer<CChar> instead would make the reinterpretation below unnecessary.
extension String {
    /// Creates a string from a NUL-terminated Shift JIS byte sequence.
    ///
    /// - Parameter sjisptr: Pointer to Shift JIS bytes terminated by 0x00.
    /// - Returns: `nil` when `init(cString:encoding:)` cannot decode the bytes.
    init?(sjisptr: UnsafePointer<UInt8>) {
        // Reinterpret the same bytes as CChar, which init(cString:encoding:) requires.
        let cString = UnsafeRawPointer(sjisptr).assumingMemoryBound(to: CChar.self)
        self.init(cString: cString, encoding: .shiftJIS)
    }
}
// Create a string from UTF-16 code units terminated by U+0000.
extension String {
    /// Decodes the code units starting at `utf16ptr`, up to but not including
    /// the first zero code unit, as little-endian UTF-16.
    ///
    /// - Parameter utf16ptr: Pointer to UTF-16 code units terminated by 0.
    /// - Returns: `nil` when `init(data:encoding:)` cannot decode the bytes.
    init?(utf16ptr: UnsafePointer<UInt16>) {
        // Advance to the U+0000 terminator; the distance is the code-unit count.
        var end = utf16ptr
        while end.pointee != 0 {
            end += 1
        }
        let unitCount = end - utf16ptr
        let bytes = Data(bytes: utf16ptr, count: unitCount * MemoryLayout<UInt16>.size)
        self.init(data: bytes, encoding: .utf16LittleEndian)
    }
}
/// Converts one sample string to UTF-8, Shift JIS and UTF-16 buffers, dumps
/// each buffer with `dumpmem`, and decodes it back into a `String`.
func test() {
    let str = "helloはろー"

    // (A) utf-8
    let utf8 = str.cString(using: .utf8)!
    dumpmem(0, utf8)
    let str1 = String(cString: utf8)
    print(str1) // helloはろー

    // (B) shift-JIS
    let sjis = str.cString(using: .shiftJIS)!
    // The pointer is used only inside the closure, where it is valid.
    sjis.withUnsafeBufferPointer {
        let uint8p = UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
        dumpmem(0, uint8p)
        let str2 = String(sjisptr: uint8p)!
        print(str2) // helloはろー
    }

    // (C) utf-16 ==> to create a byte representation of UTF-16 terminated with U+0000
    var utf16 = str.data(using: .utf16LittleEndian)!
    utf16.append(contentsOf: [0,0]) // Append U+0000 as terminator.
    utf16.withUnsafeBytes {(uint16p: UnsafePointer<UInt16>) in
        dumpmem(utf16.count, uint16p)
        let str3 = String(utf16ptr: uint16p)!
        print(str3)
    }

    // (D) utf-16
    // One slot per UTF-16 code unit plus one for the terminator.
    let capacity = str.utf16.count + 1
    let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: capacity)
    // Release the buffer on every exit path, with the same capacity it was allocated with.
    defer { u16s.deallocate(capacity: capacity) }
    var len = 0
    for code in str.utf16 {
        u16s[len] = code
        len += 1
    }
    u16s[len] = 0 // Append U+0000 as terminator.
    dumpmem((len+1) * MemoryLayout<UInt16>.size, u16s)
    let str4 = String(utf16ptr: u16s)!
    print(str4) // helloはろー
}
我发现 String.cString(using: String.Encoding.utf16) 有奇怪的行为。我想要做的是把 Swift 字符串转换为 UnsafePointer<UInt16> 形式的 UTF-16。但是,String.Encoding.utf16 似乎在输出第一个 '\0' 之后就终止了转换。相同的逻辑对 UTF-8 和 Shift-JIS 有效,因为它们的编码结果中间不会出现 NUL(0x00)字节。
这是代码。
/// Converts one sample string to C-style buffers in several encodings, dumps
/// each buffer with `dumpmem`, and decodes it back into a `String`.
///
/// This is the code as posted in the question: case (C) is the one that
/// misbehaves, and cases (A)-(C) deliberately keep the original pointer handling.
func test() {
    let str = "helloはろー"
    let cnt = str.characters.count

    // (A) utf-8 ==> works fine
    let utf8 = str.cString(using: String.Encoding.utf8)!
    // NOTE(review): the base address is returned out of the closure; the
    // pointer is only guaranteed valid while the closure is executing.
    let int8p = utf8.withUnsafeBufferPointer {
        UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: Int8.self)
    }
    dumpmem(0, int8p)
    let str1 = String(cString: int8p)
    print(str1) // helloはろー

    // (B) shift-JIS ==> works fine
    let sjis = str.cString(using: String.Encoding.shiftJIS)!
    let uint8p = sjis.withUnsafeBufferPointer {
        UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
    }
    dumpmem(0, uint8p)
    let str2 = String(sjisptr: uint8p)!
    print(str2) // helloはろー

    // (C) utf-16 ==> doesn't work nicely
    let utf16 = str.cString(using: String.Encoding.utf16)!
    let uint16p = utf16.withUnsafeBufferPointer {
        UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt16.self)
    }
    dumpmem(Int32(str.characters.count*2), uint16p) // only top char (='h' converted)
    let str3 = String(utf16ptr: uint16p)!
    print(str3) // h+garbage ...

    // (D) utf-16 w/ iteration ==> works fine.
    let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: cnt*4)
    var len = 0
    for code in str.utf16 {
        u16s[len] = code
        len += 1
    }
    u16s[len] = 0 // terminating U+0000 expected by String(utf16ptr:)
    dumpmem(Int32(len*2), u16s)
    let str4 = String(utf16ptr: u16s)!
    u16s.deallocate(capacity: cnt*4)
    print(str4) // helloはろー
}
然后是支持函数。
/*
 * Dump memory to stdout, one byte per line, formatted as
 * "[index] : decimal : 0xhex", followed by " : <char>" when isascii()
 * accepts the byte.
 *
 * len  number of bytes to dump; 0 means "treat ptr as a NUL-terminated
 *      C string and dump strlen(ptr) bytes".
 * ptr  start of the memory to dump.
 */
void dumpmem(int len, const void *ptr) {
    const unsigned char *bytes = (const unsigned char *)ptr;
    int count = len ? len : (int)strlen((const char *)bytes);
    int i = 0;
    while (i < count) {
        unsigned char b = bytes[i];
        printf("[%d] : %d : 0x%x", i, b, b);
        if (isascii(b)) {
            printf(" : %c", b);
        }
        printf("\n");
        i++;
    }
}
// Create a string from a NUL-terminated Shift JIS byte sequence (UInt8 pointer).
extension String {
    /// Decodes the bytes starting at `sjisptr`, up to but not including the
    /// first 0x00 byte, as Shift JIS.
    ///
    /// - Parameter sjisptr: Pointer to Shift JIS bytes terminated by 0x00.
    /// - Returns: `nil` when the bytes cannot be decoded as Shift JIS.
    init?(sjisptr: UnsafePointer<UInt8>) {
        // Walk forward to the terminating NUL to find the payload length.
        var end = sjisptr
        while end.pointee != 0 {
            end += 1
        }
        let payload = Data(bytes: sjisptr, count: end - sjisptr)
        guard let decoded = NSString(data: payload, encoding: String.Encoding.shiftJIS.rawValue) else {
            return nil
        }
        self = decoded as String
    }
}
// Create a string from a U+0000-terminated UTF-16 sequence (UInt16 pointer).
extension String {
    /// Decodes the code units starting at `utf16ptr`, up to but not including
    /// the first zero code unit, as little-endian UTF-16.
    ///
    /// - Parameter utf16ptr: Pointer to UTF-16 code units terminated by 0.
    /// - Returns: `nil` when the bytes cannot be decoded.
    init?(utf16ptr: UnsafePointer<UInt16>) {
        // Count code units before the terminator.
        var unitCount = 0
        while utf16ptr[unitCount] != 0 {
            unitCount += 1
        }
        // Data wants a byte count, not a code-unit count.
        let byteCount = unitCount * MemoryLayout<UInt16>.size
        let payload = Data(bytes: utf16ptr, count: byteCount)
        guard let decoded = NSString(data: payload, encoding: String.Encoding.utf16LittleEndian.rawValue) else {
            return nil
        }
        self = decoded as String
    }
}
由于 cString(using: String.Encoding.utf16) 无法很好地完成 UTF-16 转换(情形 (C)),我最终改为逐个迭代(扫描)字符串的编码单元,如上面的 (D) 所示。
我可能遗漏了什么。谁能解释为什么会这样?
String.cString(using: String.Encoding.utf16) 是否正常工作?
答案是否定的
单词"CString"表示以NUL结尾的字节序列。而 NUL 在 CString 中总是表示单字节 0x00。
如您所知,UTF-16 表示可能包含许多 0x00 字节,此类编码不能用作 CString。
以及修复代码的其他要点:
- 您不应该把 bufferPointer 或其 baseAddress 带出传递给 withUnsafeBufferPointer(_:) 的闭包。
Parameters
body
A closure with an UnsafeBufferPointer parameter that points to the contiguous storage for the array. If body has a return value, it is used as the return value for the withUnsafeBufferPointer(_:) method. The pointer argument is valid only for the duration of the closure’s execution.
- 而且在很多情况下,您并不需要使用 withUnsafeBufferPointer(_:)。
- 要获取 UTF-16 表示的大小,应该使用 str.utf16.count,而不是 str.characters.count。
- Swift 3 的 String 有一个初始化方法 init(cString:encoding:),您无需重新实现它。
- len += len 有点"难以阅读"(unreadable)。请使用 len * MemoryLayout<UInt16>.size,或者至少使用 len * 2。
试试这个代码:
// dump memory written in Swift
/// Prints the bytes at `ptr`, one per line, as "[index] : decimal : 0xhex",
/// followed by " : <char>" when `isascii` accepts the byte.
///
/// - Parameters:
///   - len: Number of bytes to dump. Pass 0 to treat `ptr` as a NUL-terminated
///     C string and dump `strlen(ptr)` bytes. Nothing is printed when the
///     resulting length is not positive.
///   - ptr: Start of the memory to dump.
func dumpmem(_ len: Int, _ ptr: UnsafeRawPointer) {
    let byteCount = len == 0 ? Int(strlen(ptr.assumingMemoryBound(to: CChar.self))) : len
    guard byteCount > 0 else { return }
    let bytes = UnsafeBufferPointer(start: ptr.assumingMemoryBound(to: UInt8.self), count: byteCount)
    for (offset, byte) in bytes.enumerated() {
        // Every conversion is numbered: mixing numbered (%2$d) and unnumbered
        // (%d) specifications in one format string is undefined in printf.
        var line = String(format: "[%1$ld] : %2$d : 0x%2$x", offset, byte)
        if isascii(Int32(byte)) != 0 {
            line += String(format: " : %c", byte)
        }
        print(line)
    }
}
// Taking UnsafePointer<CChar> instead would make the reinterpretation below unnecessary.
extension String {
    /// Creates a string from a NUL-terminated Shift JIS byte sequence.
    ///
    /// - Parameter sjisptr: Pointer to Shift JIS bytes terminated by 0x00.
    /// - Returns: `nil` when `init(cString:encoding:)` cannot decode the bytes.
    init?(sjisptr: UnsafePointer<UInt8>) {
        // Reinterpret the same bytes as CChar, which init(cString:encoding:) requires.
        let cString = UnsafeRawPointer(sjisptr).assumingMemoryBound(to: CChar.self)
        self.init(cString: cString, encoding: .shiftJIS)
    }
}
// Create a string from UTF-16 code units terminated by U+0000.
extension String {
    /// Decodes the code units starting at `utf16ptr`, up to but not including
    /// the first zero code unit, as little-endian UTF-16.
    ///
    /// - Parameter utf16ptr: Pointer to UTF-16 code units terminated by 0.
    /// - Returns: `nil` when `init(data:encoding:)` cannot decode the bytes.
    init?(utf16ptr: UnsafePointer<UInt16>) {
        // Advance to the U+0000 terminator; the distance is the code-unit count.
        var end = utf16ptr
        while end.pointee != 0 {
            end += 1
        }
        let unitCount = end - utf16ptr
        let bytes = Data(bytes: utf16ptr, count: unitCount * MemoryLayout<UInt16>.size)
        self.init(data: bytes, encoding: .utf16LittleEndian)
    }
}
/// Converts one sample string to UTF-8, Shift JIS and UTF-16 buffers, dumps
/// each buffer with `dumpmem`, and decodes it back into a `String`.
func test() {
    let str = "helloはろー"

    // (A) utf-8
    let utf8 = str.cString(using: .utf8)!
    dumpmem(0, utf8)
    let str1 = String(cString: utf8)
    print(str1) // helloはろー

    // (B) shift-JIS
    let sjis = str.cString(using: .shiftJIS)!
    // The pointer is used only inside the closure, where it is valid.
    sjis.withUnsafeBufferPointer {
        let uint8p = UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
        dumpmem(0, uint8p)
        let str2 = String(sjisptr: uint8p)!
        print(str2) // helloはろー
    }

    // (C) utf-16 ==> to create a byte representation of UTF-16 terminated with U+0000
    var utf16 = str.data(using: .utf16LittleEndian)!
    utf16.append(contentsOf: [0,0]) // Append U+0000 as terminator.
    utf16.withUnsafeBytes {(uint16p: UnsafePointer<UInt16>) in
        dumpmem(utf16.count, uint16p)
        let str3 = String(utf16ptr: uint16p)!
        print(str3)
    }

    // (D) utf-16
    // One slot per UTF-16 code unit plus one for the terminator.
    let capacity = str.utf16.count + 1
    let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: capacity)
    // Release the buffer on every exit path, with the same capacity it was allocated with.
    defer { u16s.deallocate(capacity: capacity) }
    var len = 0
    for code in str.utf16 {
        u16s[len] = code
        len += 1
    }
    u16s[len] = 0 // Append U+0000 as terminator.
    dumpmem((len+1) * MemoryLayout<UInt16>.size, u16s)
    let str4 = String(utf16ptr: u16s)!
    print(str4) // helloはろー
}