JavaScript 正则表达式匹配句子中的单词
JavaScript regex matching word in sentences
匹配JavaScript中每个句子中特定单词的正则表达式应该是什么?
句子匹配规则明确:
它应该以点 (.) 结尾,下一个字母应该是大写的。
但我需要实现的是在每个句子中匹配一个单词。所以我想我应该使用组。或者我应该把字符串单词放在正则表达式中吗?
这是我的 java 循环句子的正则表达式
enter link
这是我的 java 正则表达式,用于匹配 -5 +5 单词上下文中的单词:
enter link
但我需要在 JavaScript 中将两者结合起来。
我的目标:
输入:
Cliffs have collapsed in New Zealand during an earthquake in the city
of Christchurch on the South Island. No serious damage or fatalities
were reported in the Valentine's Day quake that struck at 13:13 local
time. Based on the med. report everybody were ok.
所选单词“on”的输出:
- Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island
- Based on the med. report everybody were ok.
更新:我在下面提供两种解决方案。我原来的回答只提供了第一个。
一种解决方案使用单个正则表达式来尝试解析整个原始段落。可以做到,但如下所述,可能不是最佳解决方案。
另一种解决方案是更复杂的算法,但使用更轻的正则表达式。它将文本分成句子并分别处理每个句子。这个解决方案效率更高,而且,我可以说,更优雅。
解决方案 1:单一正则表达式
运行下面的第一个代码片段即可演示这个解决方案。它会找到包含您指定关键字的所有句子(句子按您给出的规则界定)。完整的正则表达式是...
\. +([A-Z]([^.]|\.(?! +[A-Z]))*?" + keyword + "([^.]|\.(?! +[A-Z]))*?\.(?= +[A-Z]))
...但是代码将其分解为更容易理解的部分。
单击 'Run code snippet' 按钮后,需要几秒钟才能运行完成。
这是一个相当依赖正则表达式的解决方案。它可能相当慢。使用您提供的示例段落,此例程变得非常慢。即使这么慢,它实际上还不够复杂,因为它无法判断关键字何时嵌入到另一个词中。 (例如,当查找 "cats" 时,它也会找到 "catsup")。尝试避免这种嵌入是可能的,但这只会让整个过程变得太慢,甚至无法演示。
// Demo: collect every sentence of `text` that contains `keyword`, using a
// single regular expression. A sentence ends at a period that is followed by
// one or more spaces and an uppercase letter; any other period (e.g. the one
// in "Approx. half") is treated as part of the sentence.
var text = "I like cats. I really like cats. I also like dogs. Dogs and cats are pets. Approx. half of pets are cats. Approx. half of pets are dogs. Some cats are v. expensive.";
var keyword = "cats";

// Escape regex metacharacters so the keyword is always matched literally.
var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

// The pattern is assembled from string literals, so every regex backslash has
// to be doubled. A single "\." collapses to "." (any character), which makes
// the two sentence-continuer alternatives overlap and triggers catastrophic
// backtracking on sentences that do not contain the keyword.
var reStr =
  "\\. +" +                    // a preceding sentence-ender, i.e. a period
                               //   followed by one or more spaces
  "(" +                        // begin remembering the match (arr[1] below)
    "[A-Z]" +                  // a sentence-starter, i.e. an uppercase letter
    "(" +                      // start of a sentence-continuer, which is either
      "[^.]" +                 //   anything but a period
      "|" +                    //   or
      "\\.(?! +[A-Z])" +       //   a period not followed by one or more spaces
                               //     and an uppercase letter
    ")" +                      // end of a sentence-continuer
    "*?" +                     // zero or more sentence-continuers,
                               //   but as few as possible
    escapedKeyword +           // the keyword being sought
    "([^.]|\\.(?! +[A-Z]))" +  // a sentence-continuer, as described above
    "*?" +                     // zero or more of them, as few as possible
    "\\." +                    // a sentence-ender, i.e. a period
    "(?= +[A-Z])" +            // followed by one or more spaces and an
                               //   uppercase letter, which is not consumed
  ")";                         // finish remembering the match

// That ends up being the following regular expression:
//   \. +([A-Z]([^.]|\.(?! +[A-Z]))*?KEYWORD([^.]|\.(?! +[A-Z]))*?\.(?= +[A-Z]))

var re = new RegExp(reStr, "g"); // construct the regular expression
var sentencesWithKeyword = [];   // the hits, in order of appearance
var arr;                         // holds each 'exec' return value

// Add a sentence-ender (a period) before the text and a sentence-starter
// (an uppercase letter) after it, so that the first and last sentences are
// delimited exactly like the ones in the middle.
var expandedText = ". " + text + " A";

while ((arr = re.exec(expandedText)) !== null) { // while hits are found
  sentencesWithKeyword.push(arr[1]);             // remember the sentence found
  // The match ends right after its closing period (the lookahead consumes
  // nothing). Step back onto that period so it can also serve as the
  // "preceding sentence-ender" of the next sentence.
  re.lastIndex -= 1;
}

// show the results
show("Text to search:");
show(text);
show("Query string: " + keyword);
show("Hits:");
for (var num = 0; num < sentencesWithKeyword.length; num += 1) {
  show((num + 1) + ". " + sentencesWithKeyword[num]);
}

/**
 * Output one message: as a paragraph in the page when a DOM document is
 * available, otherwise as a console line (e.g. when run under Node).
 * @param {string} msg - the text to display
 */
function show(msg) {
  if (typeof document !== "undefined") {
    document.write("<p>" + msg + "</p>");
  } else {
    console.log(msg);
  }
}
解决方案 2:分而治之
在这里,您执行以下操作:
- 将原文拆分为句子元素数组
- 在每个句子中搜索关键字
- 保留那些有关键字的,丢弃那些没有的
这样一来,您使用的任何正则表达式都不必同时处理拆分成句子、搜索关键字、保持命中和丢弃非命中,所有这些都在一个庞大的正则表达式中。
// Demo: split `textToSearch` into sentences, then sort the sentences into
// those that contain `keyword` as a whole word and those that do not.
// A sentence boundary is a period followed by one or more spaces and an
// uppercase letter.
var textToSearch = "I like cats. I really like cats. I also like dogs. Cats are great. Catsup is tasty. Dogs and cats are pets. Approx. half of pets are cats. Approx. half of pets are dogs. Some cats are v. expensive.";
var keyword = "cats";
var sentences = {
  all : [],
  withKeyword : [],
  withNoKeyword : []
};

// Group 1 is the closing period of one sentence, group 2 the opening
// uppercase letter of the next one.
var sentenceRegex = new RegExp("([.]) +([A-Z])", "g");
var sentenceSeparator = "__SENTENCE SEPARATOR__";

// Put both captured characters back ("$1" / "$2") around the separator;
// without them every sentence would lose its final period and the following
// sentence its first letter, and only the spaces in between should go.
var modifiedText = textToSearch.replace(sentenceRegex, "$1" + sentenceSeparator + "$2");
sentences.all = modifiedText.split(sentenceSeparator);

// Build the keyword matcher once. The keyword is escaped so it is matched
// literally, and it must be delimited by the sentence start or spaces on the
// left and by spaces or a period on the right (so "cats" does not match
// "Catsup"). The regex is not global, so reusing it with test() is stateless.
var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
var keywordRegex = new RegExp("(^| +)" + escapedKeyword + "( +|[.])", "i");

sentences.all.forEach(function (sentence) {
  if (keywordRegex.test(sentence)) {
    sentences.withKeyword.push(sentence);
  } else {
    sentences.withNoKeyword.push(sentence);
  }
});

// Render the result in the page when a DOM document is available, otherwise
// print it to the console (e.g. when run under Node).
var report = "<pre>" + JSON.stringify(sentences, null, 2) + "</pre>";
if (typeof document !== "undefined") {
  document.write(report);
} else {
  console.log(report);
}
匹配JavaScript中每个句子中特定单词的正则表达式应该是什么?
句子匹配规则明确: 它应该以点 (.) 结尾,下一个字母应该是大写的。
但我需要实现的是在每个句子中匹配一个单词。所以我想我应该使用组。或者我应该把字符串单词放在正则表达式中吗?
这是我的 java 循环句子的正则表达式 enter link
这是我的 java 正则表达式,用于匹配 -5 +5 单词上下文中的单词: enter link 但我需要在 JavaScript 中将两者结合起来。
我的目标:
输入:
Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island. No serious damage or fatalities were reported in the Valentine's Day quake that struck at 13:13 local time. Based on the med. report everybody were ok.
所选单词“on”的输出:
- Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island
- Based on the med. report everybody were ok.
更新:我在下面提供两种解决方案。我原来的回答只提供了第一个。
一种解决方案使用单个正则表达式来尝试解析整个原始段落。可以做到,但如下所述,可能不是最佳解决方案。
另一种解决方案是更复杂的算法,但使用更轻的正则表达式。它将文本分成句子并分别处理每个句子。这个解决方案效率更高,而且,我可以说,更优雅。
解决方案 1:单一正则表达式
运行下面的第一个代码片段即可演示这个解决方案。它会找到包含您指定关键字的所有句子(句子按您给出的规则界定)。完整的正则表达式是...
\. +([A-Z]([^.]|\.(?! +[A-Z]))*?" + keyword + "([^.]|\.(?! +[A-Z]))*?\.(?= +[A-Z]))
...但是代码将其分解为更容易理解的部分。
单击 'Run code snippet' 按钮后,需要几秒钟才能运行完成。
这是一个相当依赖正则表达式的解决方案。它可能相当慢。使用您提供的示例段落,此例程变得非常慢。即使这么慢,它实际上还不够复杂,因为它无法判断关键字何时嵌入到另一个词中。 (例如,当查找 "cats" 时,它也会找到 "catsup")。尝试避免这种嵌入是可能的,但这只会让整个过程变得太慢,甚至无法演示。
// Demo: collect every sentence of `text` that contains `keyword`, using a
// single regular expression. A sentence ends at a period that is followed by
// one or more spaces and an uppercase letter; any other period (e.g. the one
// in "Approx. half") is treated as part of the sentence.
var text = "I like cats. I really like cats. I also like dogs. Dogs and cats are pets. Approx. half of pets are cats. Approx. half of pets are dogs. Some cats are v. expensive.";
var keyword = "cats";

// Escape regex metacharacters so the keyword is always matched literally.
var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

// The pattern is assembled from string literals, so every regex backslash has
// to be doubled. A single "\." collapses to "." (any character), which makes
// the two sentence-continuer alternatives overlap and triggers catastrophic
// backtracking on sentences that do not contain the keyword.
var reStr =
  "\\. +" +                    // a preceding sentence-ender, i.e. a period
                               //   followed by one or more spaces
  "(" +                        // begin remembering the match (arr[1] below)
    "[A-Z]" +                  // a sentence-starter, i.e. an uppercase letter
    "(" +                      // start of a sentence-continuer, which is either
      "[^.]" +                 //   anything but a period
      "|" +                    //   or
      "\\.(?! +[A-Z])" +       //   a period not followed by one or more spaces
                               //     and an uppercase letter
    ")" +                      // end of a sentence-continuer
    "*?" +                     // zero or more sentence-continuers,
                               //   but as few as possible
    escapedKeyword +           // the keyword being sought
    "([^.]|\\.(?! +[A-Z]))" +  // a sentence-continuer, as described above
    "*?" +                     // zero or more of them, as few as possible
    "\\." +                    // a sentence-ender, i.e. a period
    "(?= +[A-Z])" +            // followed by one or more spaces and an
                               //   uppercase letter, which is not consumed
  ")";                         // finish remembering the match

// That ends up being the following regular expression:
//   \. +([A-Z]([^.]|\.(?! +[A-Z]))*?KEYWORD([^.]|\.(?! +[A-Z]))*?\.(?= +[A-Z]))

var re = new RegExp(reStr, "g"); // construct the regular expression
var sentencesWithKeyword = [];   // the hits, in order of appearance
var arr;                         // holds each 'exec' return value

// Add a sentence-ender (a period) before the text and a sentence-starter
// (an uppercase letter) after it, so that the first and last sentences are
// delimited exactly like the ones in the middle.
var expandedText = ". " + text + " A";

while ((arr = re.exec(expandedText)) !== null) { // while hits are found
  sentencesWithKeyword.push(arr[1]);             // remember the sentence found
  // The match ends right after its closing period (the lookahead consumes
  // nothing). Step back onto that period so it can also serve as the
  // "preceding sentence-ender" of the next sentence.
  re.lastIndex -= 1;
}

// show the results
show("Text to search:");
show(text);
show("Query string: " + keyword);
show("Hits:");
for (var num = 0; num < sentencesWithKeyword.length; num += 1) {
  show((num + 1) + ". " + sentencesWithKeyword[num]);
}

/**
 * Output one message: as a paragraph in the page when a DOM document is
 * available, otherwise as a console line (e.g. when run under Node).
 * @param {string} msg - the text to display
 */
function show(msg) {
  if (typeof document !== "undefined") {
    document.write("<p>" + msg + "</p>");
  } else {
    console.log(msg);
  }
}
解决方案 2:分而治之
在这里,您执行以下操作:
- 将原文拆分为句子元素数组
- 在每个句子中搜索关键字
- 保留那些有关键字的,丢弃那些没有的
这样一来,您使用的任何正则表达式都不必同时处理拆分成句子、搜索关键字、保持命中和丢弃非命中,所有这些都在一个庞大的正则表达式中。
// Demo: split `textToSearch` into sentences, then sort the sentences into
// those that contain `keyword` as a whole word and those that do not.
// A sentence boundary is a period followed by one or more spaces and an
// uppercase letter.
var textToSearch = "I like cats. I really like cats. I also like dogs. Cats are great. Catsup is tasty. Dogs and cats are pets. Approx. half of pets are cats. Approx. half of pets are dogs. Some cats are v. expensive.";
var keyword = "cats";
var sentences = {
  all : [],
  withKeyword : [],
  withNoKeyword : []
};

// Group 1 is the closing period of one sentence, group 2 the opening
// uppercase letter of the next one.
var sentenceRegex = new RegExp("([.]) +([A-Z])", "g");
var sentenceSeparator = "__SENTENCE SEPARATOR__";

// Put both captured characters back ("$1" / "$2") around the separator;
// without them every sentence would lose its final period and the following
// sentence its first letter, and only the spaces in between should go.
var modifiedText = textToSearch.replace(sentenceRegex, "$1" + sentenceSeparator + "$2");
sentences.all = modifiedText.split(sentenceSeparator);

// Build the keyword matcher once. The keyword is escaped so it is matched
// literally, and it must be delimited by the sentence start or spaces on the
// left and by spaces or a period on the right (so "cats" does not match
// "Catsup"). The regex is not global, so reusing it with test() is stateless.
var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
var keywordRegex = new RegExp("(^| +)" + escapedKeyword + "( +|[.])", "i");

sentences.all.forEach(function (sentence) {
  if (keywordRegex.test(sentence)) {
    sentences.withKeyword.push(sentence);
  } else {
    sentences.withNoKeyword.push(sentence);
  }
});

// Render the result in the page when a DOM document is available, otherwise
// print it to the console (e.g. when run under Node).
var report = "<pre>" + JSON.stringify(sentences, null, 2) + "</pre>";
if (typeof document !== "undefined") {
  document.write(report);
} else {
  console.log(report);
}