String.cString(using: String.Encoding.utf16) 是否正常工作?

Is String.cString(using: String.Encoding.utf16) working correctly?

我发现 String.cString(using: String.Encoding.utf16) 有奇怪的行为。我想要做的是把 Swift 字符串转换为 UnsafePointer<UInt16> 指向的 UTF-16 表示。然而,String.Encoding.utf16 似乎在输出第一个 '\0' 之后就终止了转换。相同的逻辑对 utf-8 或 shiftJIS 可以正常工作,因为它们在中途不会出现 NUL 字节。

这是代码。

// Converts a sample string to C-style buffers in four ways and prints the
// round-tripped result of each. Case (C) reproduces the problem being asked
// about: cString(using: .utf16) does not yield a usable UTF-16 buffer.
//
// NOTE(review): cases (A)-(C) return the buffer's base address out of the
// withUnsafeBufferPointer closure and use it afterwards. The pointer is only
// guaranteed valid inside the closure; this is kept as-is because it is part
// of the behaviour under discussion.
func test() {
  let str = "helloはろー"
  let cnt = str.characters.count
  // (A) utf-8 ==> works fine
  let utf8 = str.cString(using: String.Encoding.utf8)!
  let int8p = utf8.withUnsafeBufferPointer {
    UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: Int8.self)
  }
  dumpmem(0, int8p)
  let str1 = String(cString: int8p)
  print(str1)  // helloはろー
  // (B) shift-JIS ==> works fine
  let sjis = str.cString(using: String.Encoding.shiftJIS)!
  let uint8p = sjis.withUnsafeBufferPointer {
    UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
  }
  dumpmem(0, uint8p)
  let str2 = String(sjisptr: uint8p)!
  print(str2)  // helloはろー
  // (C) utf-16 ==> doesn't work nicely
  let utf16 = str.cString(using: String.Encoding.utf16)!
  let uint16p = utf16.withUnsafeBufferPointer {
    UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt16.self)
  }
  dumpmem(Int32(str.characters.count*2), uint16p)  // only top char (='h' converted)
  let str3 = String(utf16ptr: uint16p)!
  print(str3)  // h+garbage ...
  // (D) utf-16 w/ iteration ==> works fine.
  // Copy the UTF-16 code units by hand and append a 0 code unit as terminator.
  let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: cnt*4)
  var len = 0
  for code in str.utf16 {
    u16s[len] = code
    len += 1
  }
  u16s[len] = 0
  dumpmem(Int32(len*2), u16s)
  let str4 = String(utf16ptr: u16s)!
  u16s.deallocate(capacity: cnt*4)
  print(str4)  // helloはろー
}

然后是支持函数。

// Dump `len` bytes starting at `ptr`, one line per byte: index, decimal
// value, hex value and, for ASCII bytes, the character itself.
// A `len` of 0 means "treat `ptr` as a NUL-terminated C string": the number
// of bytes dumped is then strlen(ptr).
void dumpmem(int len, const void *ptr) {
  // This routine only reads the memory, so keep the pointee const.
  const unsigned char *bytes = (const unsigned char *)ptr;
  if (len == 0) {
    len = (int)strlen((const char *)bytes);
  }
  for (int i = 0; i < len; i++) {
    const unsigned char b = bytes[i];
    printf("[%d] : %d : 0x%x", i, b, b);
    if (isascii(b)) {
      printf(" : %c", b);
    }
    printf("\n");
  }
}
// Builds a String from a 0x00-terminated Shift JIS byte sequence (UInt8 pointer).
// The initializer fails (returns nil) when the bytes cannot be decoded as Shift JIS.
extension String {
  init?(sjisptr: UnsafePointer<UInt8>) {
    // Walk forward to the terminating 0x00 byte to learn the payload length.
    var end = sjisptr
    while end.pointee != 0 {
      end += 1
    }
    let byteCount = end - sjisptr
    let payload = Data(bytes: sjisptr, count: byteCount)
    guard let decoded = NSString(data: payload, encoding: String.Encoding.shiftJIS.rawValue) else {
      return nil
    }
    self = decoded as String
  }
}
// Builds a String from a sequence of UTF-16 code units (UInt16 pointer) that
// ends with a 0 code unit. The units are decoded as little-endian UTF-16;
// the initializer fails (returns nil) when decoding fails.
extension String {
  init?(utf16ptr: UnsafePointer<UInt16>) {
    // Count code units up to, but not including, the 0 terminator.
    var unitCount = 0
    while utf16ptr[unitCount] != 0 {
      unitCount += 1
    }
    // Each UTF-16 code unit occupies two bytes.
    let byteCount = unitCount * 2
    let payload = Data(bytes: utf16ptr, count: byteCount)
    guard let decoded = NSString(data: payload, encoding: String.Encoding.utf16LittleEndian.rawValue) else {
      return nil
    }
    self = decoded as String
  }
}

由于 cString(using: String.Encoding.utf16) 无法正确完成 UTF-16 转换(即上面的 (C)),我最终只能像上面的 (D) 那样逐个遍历(扫描)字符串的码元。

我可能遗漏了什么。谁能解释为什么会这样?

String.cString(using: String.Encoding.utf16) 是否正常工作?

答案是否定的

单词"CString"表示以NUL结尾的字节序列。而 NUL 在 CString 中总是表示单字节 0x00。

如您所知,UTF-16 表示可能包含许多 0x00 字节,此类编码不能用作 CString。

以及修复代码的其他要点:

  • 您不应该从传递给 withUnsafeBufferPointer(_:) 的闭包中取出 bufferPointer 或其 baseAddress

withUnsafeBufferPointer(_:)

Parameters

body

A closure with an UnsafeBufferPointer parameter that points to the contiguous storage for the array. If body has a return value, it is used as the return value for the withUnsafeBufferPointer(_:) method. The pointer argument is valid only for the duration of the closure’s execution.

而且在很多情况下,你不需要使用withUnsafeBufferPointer(_:)

  • 要获取 UTF-16 表示的大小,您应该使用 str.utf16.count,而不是 str.characters.count

  • Swift 3 的 String 已经提供了初始化方法 init(cString:encoding:)。您无需重新实现它。

  • len += len 的可读性较差。请使用 len * MemoryLayout<UInt16>.size,或者至少使用 len * 2。

试试这个代码:

// Dumps memory, written in Swift.
//
// Prints one line per byte: "[index] : decimal : 0xhex", followed by
// " : character" when the byte is in the ASCII range.
//
// - Parameters:
//   - len: Number of bytes to dump. 0 means "treat `ptr` as a NUL-terminated
//          C string" and dump strlen(ptr) bytes.
//   - ptr: Start of the memory to dump.
func dumpmem(_ len: Int, _ ptr: UnsafeRawPointer) {
    let bytes = ptr.assumingMemoryBound(to: UInt8.self)
    let count = len == 0 ? Int(strlen(ptr.assumingMemoryBound(to: CChar.self))) : len
    for i in 0..<count {
        let byte = bytes[i]
        // Use positional specifiers for every argument: a format string must
        // not mix numbered (%2$d) and unnumbered (%d) conversions. The index
        // is an Int, so it is formatted with the `l` length modifier.
        var line = String(format: "[%1$ld] : %2$d : 0x%2$x", i, byte)
        if isascii(Int32(byte)) != 0 {
            line += String(format: " : %c", byte)
        }
        print(line)
    }
}
// Creates a String from a NUL-terminated Shift JIS byte sequence.
// If the caller used UnsafePointer<CChar> instead of UnsafePointer<UInt8>,
// the pointer reinterpretation below would not be needed.
extension String {
    init?(sjisptr: UnsafePointer<UInt8>) {
        // init(cString:encoding:) takes CChar, so view the same bytes as CChar.
        let cStringPointer = UnsafeRawPointer(sjisptr).assumingMemoryBound(to: CChar.self)
        self.init(cString: cStringPointer, encoding: .shiftJIS)
    }
}
// Creates a String from UTF-16 code units that end with a U+0000 terminator.
// The code units are decoded as little-endian UTF-16; the initializer fails
// (returns nil) when decoding fails.
extension String {
    init?(utf16ptr: UnsafePointer<UInt16>) {
        // Advance to the first 0 code unit, which is treated as the terminator.
        var cursor = utf16ptr
        while cursor.pointee != 0 {
            cursor += 1
        }
        let unitCount = cursor - utf16ptr
        let byteCount = unitCount * MemoryLayout<UInt16>.size
        let payload = Data(bytes: utf16ptr, count: byteCount)
        self.init(data: payload, encoding: .utf16LittleEndian)
    }
}
// Round-trips a sample string through UTF-8, Shift JIS and UTF-16 buffers,
// dumping each buffer and printing the string decoded back from it.
// Pointers obtained from withUnsafeBufferPointer / withUnsafeBytes are used
// only inside their closures.
func test() {
    let str = "helloはろー"

    // (A) utf-8
    let utf8 = str.cString(using: .utf8)!
    dumpmem(0, utf8)
    let str1 = String(cString: utf8)
    print(str1)  // helloはろー

    // (B) shift-JIS
    let sjis = str.cString(using: .shiftJIS)!
    sjis.withUnsafeBufferPointer {
        let uint8p = UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
        dumpmem(0, uint8p)
        let str2 = String(sjisptr: uint8p)!
        print(str2)  // helloはろー
    }

    // (C) utf-16 ==> to create a byte representation of UTF-16 terminated with U+0000
    var utf16 = str.data(using: .utf16LittleEndian)!
    utf16.append(contentsOf: [0,0]) //Append U+0000 as terminator.
    utf16.withUnsafeBytes {(uint16p: UnsafePointer<UInt16>) in
        dumpmem(utf16.count, uint16p)
        let str3 = String(utf16ptr: uint16p)!
        print(str3)
    }

    // (D) utf-16
    // One slot per UTF-16 code unit plus one for the terminator; the same
    // constant is used for allocate and deallocate so they cannot disagree.
    let capacity = str.utf16.count + 1 //<- `cnt * 4` is not appropriate
    let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: capacity)
    var len = 0
    for code in str.utf16 {
        u16s[len] = code
        len += 1
    }
    u16s[len] = 0 //Append U+0000 as terminator.
    dumpmem((len+1) * MemoryLayout<UInt16>.size, u16s)
    let str4 = String(utf16ptr: u16s)!
    u16s.deallocate(capacity: capacity)
    print(str4)  // helloはろー
}