一个 3 字节宽的 UTF-8 字符如何只使用一个 UTF-16 代码单元?
How can a 3-byte wide UTF-8 character only use a single UTF-16 code unit?
来自这个参考:
其中指出,string.length 的值实际上是 UTF-16 代码单元的个数,而不是字符的个数。
我显然天真地假设任何 3 或 4 字节宽的 UTF-8 字符都必须占用 2 个 UTF-16 代码单元。这就是我的意思:
我对字符串进行了一些试验:œ´®†¥¨ˆøπ¬˚∆˙©ƒ∂ßåΩ≈ç√∫˜µ≤ユーザーコードa
,其中混合了 1、2、3 和 4 字节宽的字符。我得到了一些令人惊讶的结果。
每个 UTF-16 代码单元宽 2 个字节。该字符串共有 35 个字符,而它的 string.length 等于 36,这意味着只有一个字符占用了 2 个 UTF-16 代码单元,尽管其中有好几个字符的 UTF-8 编码宽达 3 或 4 个字节。
使用下面的代码,我检查了每个 UTF-8 字符、它使用的字节数以及它的 string.length。令我感兴趣的是,所有 3 字节 UTF-8 字符仅使用一个 UTF-16 代码单元。唯一需要 2 个代码单元的字符是 4 字节宽的表情符号。
谁能解释一下这是怎么回事?谢谢!
代码:
/**
 * Logs one line per code point of `str` showing its UTF-8 byte count and its
 * UTF-16 code-unit count (`String.prototype.length`), then logs four totals:
 * a deliberately wrong code-unit estimate, the real code-unit count, the
 * number of code points, and the number of UTF-8 bytes.
 *
 * @param {string} str - Text to inspect; it is split by spreading, i.e. per code point.
 * @returns {void} Nothing; all results are written with `console.log`.
 */
function iterateCharacters(str) {
  const encoder = new TextEncoder();
  const codePoints = [...str];

  let utf8ByteSum = 0;
  let guessedUnitSum = 0;
  let actualUnitSum = 0;

  for (const [index, ch] of codePoints.entries()) {
    const byteCount = encoder.encode(ch).length;
    const unitCount = ch.length;

    console.log(` i: ${index} char: ${ch} bytes: ${byteCount} length: ${unitCount}`);

    utf8ByteSum += byteCount;
    // Intentionally wrong heuristic kept for comparison: it treats every
    // character of 3 or more UTF-8 bytes as needing 2 UTF-16 code units.
    if (byteCount < 3) {
      guessedUnitSum += 1;
    } else {
      guessedUnitSum += 2;
    }
    actualUnitSum += unitCount;
  }

  console.log(` total UTF-16 code units (erroneous calculation): ${guessedUnitSum}`);
  console.log(` total UTF-16 code units (correct calculation): ${actualUnitSum}`);
  console.log(` total characters: ${codePoints.length}`);
  console.log(` total UTF-8 bytes: ${utf8ByteSum}`);
}
// Demo driver: run the per-character report on a sample string, then print the
// same three totals computed directly from the whole string for comparison.
// NOTE(review): the recorded results in this document list 35 characters with a
// 4-byte character at index 0, but this literal holds 34 code points — the
// leading emoji appears to have been lost from the text. TODO confirm.
var sample = "œ´®†¥¨ˆøπ¬˚∆˙©ƒ∂ßåΩ≈ç√∫˜µ≤ユーザーコードa";
iterateCharacters(sample);
console.log(`total number of UTF-16 code units: ${sample.length}`);
console.log(`total number of characters: ${Array.from(sample).length}`);
console.log(`total number of UTF-8 bytes: ${new TextEncoder().encode(sample).byteLength}`);
结果:
i: 0 char: bytes: 4 length: 2
i: 1 char: œ bytes: 2 length: 1
i: 2 char: ´ bytes: 2 length: 1
i: 3 char: ® bytes: 2 length: 1
i: 4 char: † bytes: 3 length: 1
i: 5 char: ¥ bytes: 2 length: 1
i: 6 char: ¨ bytes: 2 length: 1
i: 7 char: ˆ bytes: 2 length: 1
i: 8 char: ø bytes: 2 length: 1
i: 9 char: π bytes: 2 length: 1
i: 10 char: ¬ bytes: 2 length: 1
i: 11 char: ˚ bytes: 2 length: 1
i: 12 char: ∆ bytes: 3 length: 1
i: 13 char: ˙ bytes: 2 length: 1
i: 14 char: © bytes: 2 length: 1
i: 15 char: ƒ bytes: 2 length: 1
i: 16 char: ∂ bytes: 3 length: 1
i: 17 char: ß bytes: 2 length: 1
i: 18 char: å bytes: 2 length: 1
i: 19 char: Ω bytes: 2 length: 1
i: 20 char: ≈ bytes: 3 length: 1
i: 21 char: ç bytes: 2 length: 1
i: 22 char: √ bytes: 3 length: 1
i: 23 char: ∫ bytes: 3 length: 1
i: 24 char: ˜ bytes: 2 length: 1
i: 25 char: µ bytes: 2 length: 1
i: 26 char: ≤ bytes: 3 length: 1
i: 27 char: ユ bytes: 3 length: 1
i: 28 char: ー bytes: 3 length: 1
i: 29 char: ザ bytes: 3 length: 1
i: 30 char: ー bytes: 3 length: 1
i: 31 char: コ bytes: 3 length: 1
i: 32 char: ー bytes: 3 length: 1
i: 33 char: ド bytes: 3 length: 1
i: 34 char: a bytes: 1 length: 1
total UTF-16 code units (erroneous calculation): 50
total UTF-16 code units (correct calculation): 36
total characters: 35
total UTF-8 bytes: 85
total number of UTF-16 code units: 36
total number of characters: 35
total number of UTF-8 bytes: 85
(另见 Jsfiddle:https://jsfiddle.net/Allasso/o5zpmrc9/)
UTF-16 与 UTF-8 不同。所有小于 U+10000(十六进制 00010000)的 Unicode 字符都可以用单个 UTF-16 代码单元表示;达到或超过这个值的字符则需要 2 个 UTF-16 代码单元(代理对)。3 字节 UTF-8 编码所覆盖的范围是 U+0800 到 U+FFFF,全部小于 U+10000,因此所有 3 字节 UTF-8 编码的字符都能放进单个 UTF-16 代码单元。
请记住,UTF-8 字符的 3 个字节并没有全部用来存放实际的代码点(数值)。其中一部分位是“标记”位,用来向解析软件指明一个编码序列的开始(或延续)。例如,3 字节序列的形式为 1110xxxx 10xxxxxx 10xxxxxx,24 位中只有 16 位用于存放代码点。UTF-16 也是如此,但采用的方案(标记位模式)不同。
来自这个参考:
其中指出,string.length 的值实际上是 UTF-16 代码单元的个数,而不是字符的个数。
我显然天真地假设任何 3 或 4 字节宽的 UTF-8 字符都必须占用 2 个 UTF-16 代码单元。这就是我的意思:
我对字符串进行了一些试验:œ´®†¥¨ˆøπ¬˚∆˙©ƒ∂ßåΩ≈ç√∫˜µ≤ユーザーコードa
,其中混合了 1、2、3 和 4 字节宽的字符。我得到了一些令人惊讶的结果。
每个 UTF-16 代码单元宽 2 个字节。该字符串共有 35 个字符,而它的 string.length 等于 36,这意味着只有一个字符占用了 2 个 UTF-16 代码单元,尽管其中有好几个字符的 UTF-8 编码宽达 3 或 4 个字节。
使用下面的代码,我检查了每个 UTF-8 字符、它使用的字节数以及它的 string.length。令我感兴趣的是,所有 3 字节 UTF-8 字符仅使用一个 UTF-16 代码单元。唯一需要 2 个代码单元的字符是 4 字节宽的表情符号。
谁能解释一下这是怎么回事?谢谢!
代码:
/**
 * Walks `str` one code point at a time and logs, for each, how many UTF-8
 * bytes it encodes to and how many UTF-16 code units it occupies. Afterwards
 * logs four summary lines: an intentionally incorrect code-unit estimate, the
 * actual code-unit total, the code-point count, and the UTF-8 byte total.
 *
 * @param {string} str - Text to inspect; spreading it yields one entry per code point.
 * @returns {void} Nothing; output goes to `console.log` only.
 */
function iterateCharacters(str) {
  const utf8 = new TextEncoder();
  const chars = [...str];

  // [utf8 bytes, naive code-unit guess, real code units]
  const sums = [0, 0, 0];

  chars.forEach((symbol, position) => {
    const nBytes = utf8.encode(symbol).length;
    const nUnits = symbol.length;

    console.log(` i: ${position} char: ${symbol} bytes: ${nBytes} length: ${nUnits}`);

    sums[0] += nBytes;
    // Kept on purpose as the wrong model: assumes anything wider than 2 UTF-8
    // bytes must take 2 UTF-16 code units.
    sums[1] += nBytes >= 3 ? 2 : 1;
    sums[2] += nUnits;
  });

  const [byteTotal, naiveUnitTotal, realUnitTotal] = sums;
  console.log(` total UTF-16 code units (erroneous calculation): ${naiveUnitTotal}`);
  console.log(` total UTF-16 code units (correct calculation): ${realUnitTotal}`);
  console.log(` total characters: ${chars.length}`);
  console.log(` total UTF-8 bytes: ${byteTotal}`);
}
// Demo driver: print the per-character breakdown for a sample string, followed
// by the same three totals taken straight from the full string.
// NOTE(review): the recorded results in this document report 35 characters,
// the first being 4 bytes wide, while this literal contains 34 code points —
// a leading emoji seems to be missing from the text. TODO confirm.
var sample = "œ´®†¥¨ˆøπ¬˚∆˙©ƒ∂ßåΩ≈ç√∫˜µ≤ユーザーコードa";
iterateCharacters(sample);
console.log(`total number of UTF-16 code units: ${sample.length}`);
console.log(`total number of characters: ${Array.from(sample).length}`);
console.log(`total number of UTF-8 bytes: ${new TextEncoder().encode(sample).byteLength}`);
结果:
i: 0 char: bytes: 4 length: 2
i: 1 char: œ bytes: 2 length: 1
i: 2 char: ´ bytes: 2 length: 1
i: 3 char: ® bytes: 2 length: 1
i: 4 char: † bytes: 3 length: 1
i: 5 char: ¥ bytes: 2 length: 1
i: 6 char: ¨ bytes: 2 length: 1
i: 7 char: ˆ bytes: 2 length: 1
i: 8 char: ø bytes: 2 length: 1
i: 9 char: π bytes: 2 length: 1
i: 10 char: ¬ bytes: 2 length: 1
i: 11 char: ˚ bytes: 2 length: 1
i: 12 char: ∆ bytes: 3 length: 1
i: 13 char: ˙ bytes: 2 length: 1
i: 14 char: © bytes: 2 length: 1
i: 15 char: ƒ bytes: 2 length: 1
i: 16 char: ∂ bytes: 3 length: 1
i: 17 char: ß bytes: 2 length: 1
i: 18 char: å bytes: 2 length: 1
i: 19 char: Ω bytes: 2 length: 1
i: 20 char: ≈ bytes: 3 length: 1
i: 21 char: ç bytes: 2 length: 1
i: 22 char: √ bytes: 3 length: 1
i: 23 char: ∫ bytes: 3 length: 1
i: 24 char: ˜ bytes: 2 length: 1
i: 25 char: µ bytes: 2 length: 1
i: 26 char: ≤ bytes: 3 length: 1
i: 27 char: ユ bytes: 3 length: 1
i: 28 char: ー bytes: 3 length: 1
i: 29 char: ザ bytes: 3 length: 1
i: 30 char: ー bytes: 3 length: 1
i: 31 char: コ bytes: 3 length: 1
i: 32 char: ー bytes: 3 length: 1
i: 33 char: ド bytes: 3 length: 1
i: 34 char: a bytes: 1 length: 1
total UTF-16 code units (erroneous calculation): 50
total UTF-16 code units (correct calculation): 36
total characters: 35
total UTF-8 bytes: 85
total number of UTF-16 code units: 36
total number of characters: 35
total number of UTF-8 bytes: 85
(另见 Jsfiddle:https://jsfiddle.net/Allasso/o5zpmrc9/)
UTF-16 与 UTF-8 不同。所有小于 U+10000(十六进制 00010000)的 Unicode 字符都可以用单个 UTF-16 代码单元表示;达到或超过这个值的字符则需要 2 个 UTF-16 代码单元(代理对)。3 字节 UTF-8 编码所覆盖的范围是 U+0800 到 U+FFFF,全部小于 U+10000,因此所有 3 字节 UTF-8 编码的字符都能放进单个 UTF-16 代码单元。
请记住,UTF-8 字符的 3 个字节并没有全部用来存放实际的代码点(数值)。其中一部分位是“标记”位,用来向解析软件指明一个编码序列的开始(或延续)。例如,3 字节序列的形式为 1110xxxx 10xxxxxx 10xxxxxx,24 位中只有 16 位用于存放代码点。UTF-16 也是如此,但采用的方案(标记位模式)不同。