在 Java 中统计单词和字符数时,如何处理高/低代理项?
Counting words and characters in java with high / low surrogates?
我知道 Stack Overflow 上已有一些关于这个主题的问题,但所有提出的解决方案似乎都采用了与我在 JavaScript 中看到的示例不同的方法。
下面是一个 JavaScript 示例,它统计文本字符串中的段落、句子、单词和字符数,其中专门通过检查高/低代理项来统计字符数:
JavaScript 版本
// Example invocation; `text` is not defined in this snippet — presumably the string to analyse.
count(text);
/**
 * Computes text statistics for a string.
 *
 * @param {String} original The text to analyse.
 *
 * @return {Object} An object with the numeric fields:
 *   - paragraphs: runs of newlines in the trimmed text, plus one;
 *   - sentences: runs of `.?!…` or newline that are followed by at least one
 *     more character, plus one;
 *   - words: runs of non-whitespace left after punctuation is removed;
 *   - characters: entries returned by `_decode` for the trimmed text with
 *     all whitespace removed;
 *   - all: entries returned by `_decode` for the untouched input.
 *   Every field except `all` is 0 when the trimmed text is empty.
 */
function count(original) {
  // Remove every zero-width space (U+200B). The `g` flag is required:
  // without it only the first run is removed and later ones would still be
  // counted as characters (and, when isolated, as words).
  var trimmed = original.replace(/[\u200B]+/g, '').trim();

  return {
    paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
    sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
    words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
    characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
    all: _decode(original).length
  };
}
/**
 * Converts a string into an array of numeric code points.
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
 * (0xDC00-0xDFFF) is combined into one supplementary code point. Every other
 * code unit, including an unpaired surrogate, becomes its own entry.
 *
 * @param {String} string The input string.
 *
 * @return {Array} The code points, in input order.
 */
function _decode(string) {
  var output = [];
  var counter = 0;
  var length = string.length;
  var value;
  var extra;

  while (counter < length) {
    value = string.charCodeAt(counter++);
    if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
      // High surrogate, and there is a next code unit.
      extra = string.charCodeAt(counter++);
      if ((extra & 0xFC00) === 0xDC00) {
        // Low surrogate: merge the pair into a single code point.
        output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
      } else {
        // Unmatched high surrogate: append only this code unit and step
        // back, so the following code unit is examined on the next
        // iteration (it may itself start a surrogate pair). Appending
        // `extra` here as well would emit it twice.
        output.push(value);
        counter--;
      }
    } else {
      output.push(value);
    }
  }
  return output;
}
演示见下方代码和 jsfiddle:
// Demo: run the counter on a sample text and render the text followed by
// one line per statistic into the page body.
var text = 'This is a paragraph. This is the 2nd sentence in the 1st paragraph.\nThis is another paragraph.';
var count = doCount(text);

// Build the markup once and assign it in a single step instead of
// re-parsing the whole body on every `innerHTML +=`.
var html = '<pre>' + text + '</pre><hr>';
// The key is a callback parameter, so no implicit global loop variable is
// created (the original `for (i in count)` leaked `i` and throws in strict mode).
Object.keys(count).forEach(function (key) {
  html += '<p>' + key + ': ' + count[key] + '</p>';
});
document.body.innerHTML = html;
/* COUNTING LIBRARY */
/**
 * Computes text statistics for a string.
 *
 * Extracted from https://github.com/RadLikeWhoa/Countable/, which in
 * turn uses `ucs2decode` function from the punycode.js library.
 *
 * @param {String} original The text to analyse.
 *
 * @return {Object} An object with the numeric fields:
 *   - paragraphs: runs of newlines in the trimmed text, plus one;
 *   - sentences: runs of `.?!…` or newline that are followed by at least one
 *     more character, plus one;
 *   - words: runs of non-whitespace left after punctuation is removed;
 *   - characters: entries returned by `_decode` for the trimmed text with
 *     all whitespace removed;
 *   - all: entries returned by `_decode` for the untouched input.
 *   Every field except `all` is 0 when the trimmed text is empty.
 */
function doCount(original) {
  // Remove every zero-width space (U+200B). The `g` flag is required:
  // without it only the first run is removed and later ones would still be
  // counted as characters (and, when isolated, as words).
  var trimmed = original.replace(/[\u200B]+/g, '').trim();

  return {
    paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
    sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
    words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
    characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
    all: _decode(original).length
  };
}
/**
 * `ucs2decode` function from the punycode.js library.
 *
 * Creates an array containing the decimal code points of each Unicode
 * character in the string. While JavaScript exposes strings as sequences of
 * 16-bit code units, this function converts a pair of surrogate halves
 * (which are otherwise exposed as two separate characters) into a single
 * code point, matching UTF-16. An unpaired surrogate is kept as its own
 * entry.
 *
 * @see <http://goo.gl/8M09r>
 * @see <http://goo.gl/u4UUC>
 *
 * @param {String} string The Unicode input string (UCS-2).
 *
 * @return {Array} The new array of code points.
 */
function _decode(string) {
  var output = [];
  var counter = 0;
  var length = string.length;
  var value;
  var extra;

  while (counter < length) {
    value = string.charCodeAt(counter++);
    if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
      // High surrogate, and there is a next code unit.
      extra = string.charCodeAt(counter++);
      if ((extra & 0xFC00) === 0xDC00) {
        // Low surrogate: merge the pair into a single code point.
        output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
      } else {
        // Unmatched high surrogate: append only this code unit and step
        // back, so the following code unit is examined on the next
        // iteration (it may itself start a surrogate pair). Appending
        // `extra` here as well would emit it twice.
        output.push(value);
        counter--;
      }
    } else {
      output.push(value);
    }
  }
  return output;
}
我不精通字符编码方案以及高/低代理项之类的东西,但在 Java 中计数时难道不需要处理这些吗?
我对 JavaScript 实现的结果很满意,想在我的 Java 后端进行同样的计数,但我不确定是否还需要同样的方法,或者应该如何实现。
JavaScript 版本所做的是:如果被解码的文本中出现代理项对,就把它们作为一个字符读取。之所以需要这样做,是因为 JavaScript 引擎可以使用 UCS-2 或 UTF-16,而 UTF-16 支持代理项,这意味着单个可见字符可能由两个代码单元编码。为了正确计算长度,该库会考虑这个额外的代码单元,从而把一对代理项计为一个字符。
在 Java 中也有类似的问题,只是 Java 中可用的编码方案更多。幸运的是,Java 已经能够正确处理包含高代理项的字符串的长度(注意:String.length() 返回的是 UTF-16 代码单元数;若要得到代码点数,应使用 String.codePointCount(0, s.length()))。此外,如果您想分离组合代码点甚至删除它们,Java 在 java.text 包中提供了 Normalizer(例如可用于去除变音符号):
// Normalizer.Form.NFD requests canonical decomposition; the normalized result is assigned back to `string`.
string = Normalizer.normalize(string, Normalizer.Form.NFD);
我知道 Stack Overflow 上已有一些关于这个主题的问题,但所有提出的解决方案似乎都采用了与我在 JavaScript 中看到的示例不同的方法。
下面是一个 JavaScript 示例,它统计文本字符串中的段落、句子、单词和字符数,其中专门通过检查高/低代理项来统计字符数:
JavaScript 版本
// Example invocation; `text` is not defined in this snippet — presumably the string to analyse.
count(text);
/**
 * Computes text statistics for a string.
 *
 * @param {String} original The text to analyse.
 *
 * @return {Object} An object with the numeric fields:
 *   - paragraphs: runs of newlines in the trimmed text, plus one;
 *   - sentences: runs of `.?!…` or newline that are followed by at least one
 *     more character, plus one;
 *   - words: runs of non-whitespace left after punctuation is removed;
 *   - characters: entries returned by `_decode` for the trimmed text with
 *     all whitespace removed;
 *   - all: entries returned by `_decode` for the untouched input.
 *   Every field except `all` is 0 when the trimmed text is empty.
 */
function count(original) {
  // Remove every zero-width space (U+200B). The `g` flag is required:
  // without it only the first run is removed and later ones would still be
  // counted as characters (and, when isolated, as words).
  var trimmed = original.replace(/[\u200B]+/g, '').trim();

  return {
    paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
    sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
    words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
    characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
    all: _decode(original).length
  };
}
/**
 * Converts a string into an array of numeric code points.
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
 * (0xDC00-0xDFFF) is combined into one supplementary code point. Every other
 * code unit, including an unpaired surrogate, becomes its own entry.
 *
 * @param {String} string The input string.
 *
 * @return {Array} The code points, in input order.
 */
function _decode(string) {
  var output = [];
  var counter = 0;
  var length = string.length;
  var value;
  var extra;

  while (counter < length) {
    value = string.charCodeAt(counter++);
    if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
      // High surrogate, and there is a next code unit.
      extra = string.charCodeAt(counter++);
      if ((extra & 0xFC00) === 0xDC00) {
        // Low surrogate: merge the pair into a single code point.
        output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
      } else {
        // Unmatched high surrogate: append only this code unit and step
        // back, so the following code unit is examined on the next
        // iteration (it may itself start a surrogate pair). Appending
        // `extra` here as well would emit it twice.
        output.push(value);
        counter--;
      }
    } else {
      output.push(value);
    }
  }
  return output;
}
演示见下方代码和 jsfiddle:
var text = 'This is a paragraph. This is the 2nd sentence in the 1st paragraph.\nThis is another paragraph.';
// Demo: run the counter on the sample `text` and render the text followed
// by one line per statistic into the page body.
var count = doCount(text);

// Build the markup once and assign it in a single step instead of
// re-parsing the whole body on every `innerHTML +=`.
var html = '<pre>' + text + '</pre><hr>';
// The key is a callback parameter, so no implicit global loop variable is
// created (the original `for (i in count)` leaked `i` and throws in strict mode).
Object.keys(count).forEach(function (key) {
  html += '<p>' + key + ': ' + count[key] + '</p>';
});
document.body.innerHTML = html;
/* COUNTING LIBRARY */
/**
 * Computes text statistics for a string.
 *
 * Extracted from https://github.com/RadLikeWhoa/Countable/, which in
 * turn uses `ucs2decode` function from the punycode.js library.
 *
 * @param {String} original The text to analyse.
 *
 * @return {Object} An object with the numeric fields:
 *   - paragraphs: runs of newlines in the trimmed text, plus one;
 *   - sentences: runs of `.?!…` or newline that are followed by at least one
 *     more character, plus one;
 *   - words: runs of non-whitespace left after punctuation is removed;
 *   - characters: entries returned by `_decode` for the trimmed text with
 *     all whitespace removed;
 *   - all: entries returned by `_decode` for the untouched input.
 *   Every field except `all` is 0 when the trimmed text is empty.
 */
function doCount(original) {
  // Remove every zero-width space (U+200B). The `g` flag is required:
  // without it only the first run is removed and later ones would still be
  // counted as characters (and, when isolated, as words).
  var trimmed = original.replace(/[\u200B]+/g, '').trim();

  return {
    paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
    sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
    words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
    characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
    all: _decode(original).length
  };
}
/**
 * `ucs2decode` function from the punycode.js library.
 *
 * Creates an array containing the decimal code points of each Unicode
 * character in the string. While JavaScript exposes strings as sequences of
 * 16-bit code units, this function converts a pair of surrogate halves
 * (which are otherwise exposed as two separate characters) into a single
 * code point, matching UTF-16. An unpaired surrogate is kept as its own
 * entry.
 *
 * @see <http://goo.gl/8M09r>
 * @see <http://goo.gl/u4UUC>
 *
 * @param {String} string The Unicode input string (UCS-2).
 *
 * @return {Array} The new array of code points.
 */
function _decode(string) {
  var output = [];
  var counter = 0;
  var length = string.length;
  var value;
  var extra;

  while (counter < length) {
    value = string.charCodeAt(counter++);
    if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
      // High surrogate, and there is a next code unit.
      extra = string.charCodeAt(counter++);
      if ((extra & 0xFC00) === 0xDC00) {
        // Low surrogate: merge the pair into a single code point.
        output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
      } else {
        // Unmatched high surrogate: append only this code unit and step
        // back, so the following code unit is examined on the next
        // iteration (it may itself start a surrogate pair). Appending
        // `extra` here as well would emit it twice.
        output.push(value);
        counter--;
      }
    } else {
      output.push(value);
    }
  }
  return output;
}
我不精通字符编码方案以及高/低代理项之类的东西,但在 Java 中计数时难道不需要处理这些吗?
我对 JavaScript 实现的结果很满意,想在我的 Java 后端进行同样的计数,但我不确定是否还需要同样的方法,或者应该如何实现。
JavaScript 版本所做的是:如果被解码的文本中出现代理项对,就把它们作为一个字符读取。之所以需要这样做,是因为 JavaScript 引擎可以使用 UCS-2 或 UTF-16,而 UTF-16 支持代理项,这意味着单个可见字符可能由两个代码单元编码。为了正确计算长度,该库会考虑这个额外的代码单元,从而把一对代理项计为一个字符。
在 Java 中也有类似的问题,只是 Java 中可用的编码方案更多。幸运的是,Java 已经能够正确处理包含高代理项的字符串的长度(注意:String.length() 返回的是 UTF-16 代码单元数;若要得到代码点数,应使用 String.codePointCount(0, s.length()))。此外,如果您想分离组合代码点甚至删除它们,Java 在 java.text 包中提供了 Normalizer(例如可用于去除变音符号):
// Normalizer.Form.NFD requests canonical decomposition; the normalized result is assigned back to `string`.
string = Normalizer.normalize(string, Normalizer.Form.NFD);