Javascript 带 unicode 和标点符号的正则表达式
Javascript regexp with unicode and punctuation
我有以下用于拆分包含 Unicode 字符的单词的测试用例,但不知道如何用 JavaScript 实现。
describe("garden: utils", () => {
  // Feeds each [sentence, tokens] pair to segmentation(), in order, and
  // asserts the returned array deep-equals the listed tokens.
  const expectTokens = (cases) => {
    cases.forEach(([sentence, tokens]) => {
      assert.deepEqual(segmentation(sentence), tokens);
    });
  };

  it("should split correctly", () => {
    expectTokens([
      [
        'Hockey is a popular sport in Canada.',
        ['Hockey', 'is', 'a', 'popular', 'sport', 'in', 'Canada', '.'],
      ],
      [
        'How many provinces are there in Canada?',
        ['How', 'many', 'provinces', 'are', 'there', 'in', 'Canada', '?'],
      ],
      [
        'The forest is on fire!',
        ['The', 'forest', 'is', 'on', 'fire', '!'],
      ],
      [
        'Emily Carr, who was born in 1871, was a great painter.',
        ['Emily', 'Carr', ',', 'who', 'was', 'born', 'in', '1871', ',', 'was', 'a', 'great', 'painter', '.'],
      ],
      [
        'This is David\'s computer.',
        ['This', 'is', 'David', '\'', 's', 'computer', '.'],
      ],
      [
        'The prime minister said, "We will win the election."',
        ['The', 'prime', 'minister', 'said', ',', '"', 'We', 'will', 'win', 'the', 'election', '.', '"'],
      ],
      [
        'There are three positions in hockey: goalie, defence, and forward.',
        ['There', 'are', 'three', 'positions', 'in', 'hockey', ':', 'goalie', ',', 'defence', ',', 'and', 'forward', '.'],
      ],
      [
        'The festival is very popular; people from all over the world visit each year.',
        [
          'The', 'festival', 'is', 'very', 'popular', ';', 'people', 'from', 'all', 'over', 'the', 'world',
          'visit', 'each', 'year', '.',
        ],
      ],
      [
        'Mild, wet, and cloudy - these are the characteristics of weather in Vancouver.',
        [
          'Mild', ',', 'wet', ',', 'and', 'cloudy', '-', 'these', 'are', 'the', 'characteristics', 'of', 'weather',
          'in', 'Vancouver', '.',
        ],
      ],
      [
        'sweet-smelling',
        ['sweet', '-', 'smelling'],
      ],
    ]);
  });

  it("should not split unicoded words", () => {
    expectTokens([
      ['hacer a propósito', ['hacer', 'a', 'propósito']],
      ['nhà em có con mèo', ['nhà', 'em', 'có', 'con', 'mèo']],
    ]);
  });

  it("should group periods", () => {
    expectTokens([
      ['So are ... the fishes.', ['So', 'are', '...', 'the', 'fishes', '.']],
      ['So are ...... the fishes.', ['So', 'are', '......', 'the', 'fishes', '.']],
      ['arriba arriba ja....', ['arriba', 'arriba', 'ja', '....']],
    ]);
  });
});
这是 python 中的等效表达式:
class Segmentation(BaseNLPProcessor):
    """Splits text into word and punctuation pieces.

    A piece is a run of word characters, a run of two or more periods
    (for example an ellipsis), or a single ``string.punctuation`` character.
    """

    # The whole pattern is one capturing group, so re.split() returns the
    # matched pieces together with the text found between them.
    # - re.UNICODE is passed as a compile flag: an inline (?u) that is not at
    #   the start of the pattern is deprecated and rejected by newer Pythons.
    # - string.punctuation is escaped so that the closing bracket, backslash,
    #   caret and hyphen are taken literally inside the character class.
    pattern = re.compile(
        r'(\w+|\.{2,}|[%s])' % re.escape(string.punctuation),
        re.UNICODE
    )

    @classmethod
    def ignore_value(cls, value):
        # type: (str) -> bool
        """Predicate that split() passes to filter().

        NOTE(review): negate, compose and is_empty are defined outside this
        block; presumably the result is True for pieces that are non-empty
        once surrounding whitespace is stripped -- confirm against them.
        """
        # Calling the method on the value replaces string.strip, a module-level
        # function that only exists on Python 2.
        return negate(compose(is_empty, lambda text: text.strip()))(value)

    def split(self):
        # type: () -> List[str]
        """Return the pieces of self.value() accepted by ignore_value."""
        # list() honours the declared List[str] return type on Python 3,
        # where filter() yields a lazy iterator instead of a list.
        return list(filter(self.ignore_value, self.pattern.split(self.value())))
我想用 JavaScript 编写一个与上述 Python 代码等效的函数:按 Unicode 单词和标点符号拆分,并把连续的多个句点归为一组……
Segmentation("Hockey is a popular sport in Canada.").split()
相当复杂,因为 JavaScript 的 RegExp 不支持否定后行断言(negative lookbehind),而且 Unicode 支持尚未正式标准化(目前仅 Firefox 通过标志支持)。此答案使用库 XRegExp 来处理 Unicode 字符类。如果你需要不依赖库的完整正则表达式,它会非常庞大;只需留言告诉我,我会更新答案,改用展开了 Unicode 范围的普通 RegExp 语句。
// Token pattern; the alternatives are tried left to right at each position:
//   \.{2,}              a run of two or more periods (e.g. "..."), kept whole
//   \p{L}[\p{L}\p{M}]*  a run of letters, including trailing combining marks,
//                       so decomposed accents stay attached to their word
//   \p{N}+              a run of numeric characters
//   \S                  any other single non-whitespace character
// Native Unicode property escapes (the `u` flag) are used, so XRegExp is not
// required. The previous replace() pipeline built its patterns from plain
// string literals (which drop the backslashes of \p, \P and \s) and replaced
// each match with a bare space, deleting the characters it meant to separate.
const rxToken = /\.{2,}|\p{L}[\p{L}\p{M}]*|\p{N}+|\S/gu;

/**
 * Splits text into word, number and punctuation tokens.
 *
 * Whitespace separates tokens and is never returned. Runs of two or more
 * periods form a single token; every other punctuation or symbol character
 * is a token of its own.
 *
 * @param {string} s - Text to tokenize.
 * @returns {string[]} Tokens in source order; empty when `s` has no tokens.
 */
function segmentation(s) {
  // String.prototype.match returns null (not an empty array) on no match.
  return s.match(rxToken) || [];
}
Here it is passing all the test cases!
window.onbeforeunload = "";
* { margin: 0; padding: 0; border: 0; overflow: hidden; }
object { width: 100%; height: 100%; width: 100vw; height: 100vh; }
<object data="https://fiddle.jshell.net/a3tf68ae/14/show/" />
编辑:更新了测试用例,使其在测试结果下方打印出庞大的 RegExp 源码。运行代码片段即可查看嵌入的测试用例。
我找到了一个答案,但是它很复杂。有人有其他更简单的答案吗?
module.exports = (string) => {
const segs = string.split(/(\.{2,}|!|"|#|$|%|&|'|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|¿|@|[|]|\|^|_|`|{|\||}|~| )/);
return segs.filter((seg) => seg.trim() !== "");
};
我有以下用于拆分包含 Unicode 字符的单词的测试用例,但不知道如何用 JavaScript 实现。
describe("garden: utils", () => {
  // Feeds each [sentence, tokens] pair to segmentation(), in order, and
  // asserts the returned array deep-equals the listed tokens.
  const expectTokens = (cases) => {
    cases.forEach(([sentence, tokens]) => {
      assert.deepEqual(segmentation(sentence), tokens);
    });
  };

  it("should split correctly", () => {
    expectTokens([
      [
        'Hockey is a popular sport in Canada.',
        ['Hockey', 'is', 'a', 'popular', 'sport', 'in', 'Canada', '.'],
      ],
      [
        'How many provinces are there in Canada?',
        ['How', 'many', 'provinces', 'are', 'there', 'in', 'Canada', '?'],
      ],
      [
        'The forest is on fire!',
        ['The', 'forest', 'is', 'on', 'fire', '!'],
      ],
      [
        'Emily Carr, who was born in 1871, was a great painter.',
        ['Emily', 'Carr', ',', 'who', 'was', 'born', 'in', '1871', ',', 'was', 'a', 'great', 'painter', '.'],
      ],
      [
        'This is David\'s computer.',
        ['This', 'is', 'David', '\'', 's', 'computer', '.'],
      ],
      [
        'The prime minister said, "We will win the election."',
        ['The', 'prime', 'minister', 'said', ',', '"', 'We', 'will', 'win', 'the', 'election', '.', '"'],
      ],
      [
        'There are three positions in hockey: goalie, defence, and forward.',
        ['There', 'are', 'three', 'positions', 'in', 'hockey', ':', 'goalie', ',', 'defence', ',', 'and', 'forward', '.'],
      ],
      [
        'The festival is very popular; people from all over the world visit each year.',
        [
          'The', 'festival', 'is', 'very', 'popular', ';', 'people', 'from', 'all', 'over', 'the', 'world',
          'visit', 'each', 'year', '.',
        ],
      ],
      [
        'Mild, wet, and cloudy - these are the characteristics of weather in Vancouver.',
        [
          'Mild', ',', 'wet', ',', 'and', 'cloudy', '-', 'these', 'are', 'the', 'characteristics', 'of', 'weather',
          'in', 'Vancouver', '.',
        ],
      ],
      [
        'sweet-smelling',
        ['sweet', '-', 'smelling'],
      ],
    ]);
  });

  it("should not split unicoded words", () => {
    expectTokens([
      ['hacer a propósito', ['hacer', 'a', 'propósito']],
      ['nhà em có con mèo', ['nhà', 'em', 'có', 'con', 'mèo']],
    ]);
  });

  it("should group periods", () => {
    expectTokens([
      ['So are ... the fishes.', ['So', 'are', '...', 'the', 'fishes', '.']],
      ['So are ...... the fishes.', ['So', 'are', '......', 'the', 'fishes', '.']],
      ['arriba arriba ja....', ['arriba', 'arriba', 'ja', '....']],
    ]);
  });
});
这是 python 中的等效表达式:
class Segmentation(BaseNLPProcessor):
    """Splits text into word and punctuation pieces.

    A piece is a run of word characters, a run of two or more periods
    (for example an ellipsis), or a single ``string.punctuation`` character.
    """

    # The whole pattern is one capturing group, so re.split() returns the
    # matched pieces together with the text found between them.
    # - re.UNICODE is passed as a compile flag: an inline (?u) that is not at
    #   the start of the pattern is deprecated and rejected by newer Pythons.
    # - string.punctuation is escaped so that the closing bracket, backslash,
    #   caret and hyphen are taken literally inside the character class.
    pattern = re.compile(
        r'(\w+|\.{2,}|[%s])' % re.escape(string.punctuation),
        re.UNICODE
    )

    @classmethod
    def ignore_value(cls, value):
        # type: (str) -> bool
        """Predicate that split() passes to filter().

        NOTE(review): negate, compose and is_empty are defined outside this
        block; presumably the result is True for pieces that are non-empty
        once surrounding whitespace is stripped -- confirm against them.
        """
        # Calling the method on the value replaces string.strip, a module-level
        # function that only exists on Python 2.
        return negate(compose(is_empty, lambda text: text.strip()))(value)

    def split(self):
        # type: () -> List[str]
        """Return the pieces of self.value() accepted by ignore_value."""
        # list() honours the declared List[str] return type on Python 3,
        # where filter() yields a lazy iterator instead of a list.
        return list(filter(self.ignore_value, self.pattern.split(self.value())))
我想用 JavaScript 编写一个与上述 Python 代码等效的函数:按 Unicode 单词和标点符号拆分,并把连续的多个句点归为一组……
Segmentation("Hockey is a popular sport in Canada.").split()
相当复杂,因为 JavaScript 的 RegExp 不支持否定后行断言(negative lookbehind),而且 Unicode 支持尚未正式标准化(目前仅 Firefox 通过标志支持)。此答案使用库 XRegExp 来处理 Unicode 字符类。如果你需要不依赖库的完整正则表达式,它会非常庞大;只需留言告诉我,我会更新答案,改用展开了 Unicode 范围的普通 RegExp 语句。
// Token pattern; the alternatives are tried left to right at each position:
//   \.{2,}              a run of two or more periods (e.g. "..."), kept whole
//   \p{L}[\p{L}\p{M}]*  a run of letters, including trailing combining marks,
//                       so decomposed accents stay attached to their word
//   \p{N}+              a run of numeric characters
//   \S                  any other single non-whitespace character
// Native Unicode property escapes (the `u` flag) are used, so XRegExp is not
// required. The previous replace() pipeline built its patterns from plain
// string literals (which drop the backslashes of \p, \P and \s) and replaced
// each match with a bare space, deleting the characters it meant to separate.
const rxToken = /\.{2,}|\p{L}[\p{L}\p{M}]*|\p{N}+|\S/gu;

/**
 * Splits text into word, number and punctuation tokens.
 *
 * Whitespace separates tokens and is never returned. Runs of two or more
 * periods form a single token; every other punctuation or symbol character
 * is a token of its own.
 *
 * @param {string} s - Text to tokenize.
 * @returns {string[]} Tokens in source order; empty when `s` has no tokens.
 */
function segmentation(s) {
  // String.prototype.match returns null (not an empty array) on no match.
  return s.match(rxToken) || [];
}
Here it is passing all the test cases!
window.onbeforeunload = "";
* { margin: 0; padding: 0; border: 0; overflow: hidden; }
object { width: 100%; height: 100%; width: 100vw; height: 100vh; }
<object data="https://fiddle.jshell.net/a3tf68ae/14/show/" />
编辑:更新了测试用例,使其在测试结果下方打印出庞大的 RegExp 源码。运行代码片段即可查看嵌入的测试用例。
我找到了一个答案,但是它很复杂。有人有其他更简单的答案吗?
module.exports = (string) => {
const segs = string.split(/(\.{2,}|!|"|#|$|%|&|'|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|¿|@|[|]|\|^|_|`|{|\||}|~| )/);
return segs.filter((seg) => seg.trim() !== "");
};